In [None]:
# Find the total cost, total revenue, total profit on the basis of each region
# Find the Item List on the basis of each country
# Find the total number of items sold in each country
# Find the top five famous item lists on the basis of each region.(Consider units sold while doing this.)
# Find all the regions and their famous sales channels.
# Find  the list of countries and items and their respective units.
# In 2013, identify the regions which sold maximum and minimum units of item type Meat.
# List all the items whose unit cost is less than 500
# Find the total cost, revenue and profit of each year.

## Load Data

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('sales').getOrCreate()

22/10/19 18:19:53 WARN Utils: Your hostname, tars resolves to a loopback address: 127.0.1.1; using 192.168.1.66 instead (on interface wlan0)
22/10/19 18:19:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/19 18:19:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
sales_df = spark.read.format('json').load('data/sales_records.json')
sales_df.printSchema()
# sales_df.show()

[Stage 0:>                                                          (0 + 9) / 9]

root
 |-- Country: string (nullable = true)
 |-- Item Type: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Priority: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales Channel: string (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Total Cost: string (nullable = true)
 |-- Total Profit: string (nullable = true)
 |-- Total Revenue: string (nullable = true)
 |-- Unit Cost: string (nullable = true)
 |-- Unit Price: string (nullable = true)
 |-- Units Sold: string (nullable = true)
 |-- _corrupt_record: string (nullable = true)



                                                                                

In [5]:
# all records are in the form of strings, so we need to do type casting
from pyspark.sql.types import StringType, FloatType, IntegerType
from pyspark.sql.functions import col


sales_df = sales_df\
            .withColumn('Total Cost', col('Total Cost').cast(FloatType()))\
            .withColumn('Total Profit', col('Total Profit').cast(FloatType()))\
            .withColumn('Total Revenue', col('Total Revenue').cast(FloatType()))\
            .withColumn('Unit Cost', col('Unit Cost').cast(FloatType()))\
            .withColumn('Unit Price', col('Unit Price').cast(FloatType()))\
            .withColumn('Units Sold', col('Units Sold').cast(IntegerType()))\
            .withColumnRenamed('_corrupt_record', 'Corrupt Record')

sales_df = sales_df.na.drop(subset=['Region'])
sales_df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Item Type: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Priority: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Sales Channel: string (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Total Cost: float (nullable = true)
 |-- Total Profit: float (nullable = true)
 |-- Total Revenue: float (nullable = true)
 |-- Unit Cost: float (nullable = true)
 |-- Unit Price: float (nullable = true)
 |-- Units Sold: integer (nullable = true)
 |-- Corrupt Record: string (nullable = true)



## 1. Find the total cost, total revenue, total profit on the basis of each region

In [6]:
from pyspark.sql.functions import sum as sum_, col

sales_df\
    .groupBy('Region')\
    .agg(sum_(col('Total Cost')).alias('Total Cost'), sum_(col('Total Revenue')).alias('Total Revenue'), sum_('Total Profit').alias('Total Profit'))\
    .show()

[Stage 1:>                                                          (0 + 9) / 9]

+--------------------+--------------------+--------------------+--------------------+
|              Region|          Total Cost|       Total Revenue|        Total Profit|
+--------------------+--------------------+--------------------+--------------------+
|Middle East and N...|1.194003115362056...|1.691834583280536...| 4.978314680986322E9|
|Australia and Oce...| 7.526098663258995E9|1.070152222371284...|3.1754235603909473E9|
|              Europe|2.415937816242763E10|3.423977049206286...|1.008039233312204...|
|  Sub-Saharan Africa|2.465031758112777...|3.495487197307492E10|1.030455438786905...|
|Central America a...|1.026651963972375...|1.455373016365411...|4.2872105216859627E9|
|       North America| 2.064450719133194E9|2.9370023334700775E9| 8.725516170647297E8|
|                Asia|1.358588970253493...|1.929340122315577...|5.7075115161706505E9|
+--------------------+--------------------+--------------------+--------------------+



                                                                                

## 2. Find the Item List on the basis of each country

In [70]:
# assuming item type list
from pyspark.sql import Window as W
from pyspark.sql.functions import collect_list, collect_set, explode

window_spec = W.partitionBy('Country')
country_item_list_df = sales_df\
                        .withColumn('Item List', collect_set('Item Type').over(window_spec))\
                        .select('Country', 'Item List')\
                        .distinct()

country_item_list_df.show()

+--------------------+--------------------+
|             Country|           Item List|
+--------------------+--------------------+
|         Afghanistan|[Beverages, Perso...|
|             Albania|[Beverages, Perso...|
|             Algeria|[Beverages, Perso...|
|             Andorra|[Beverages, Perso...|
|              Angola|[Beverages, Perso...|
|Antigua and Barbuda |[Beverages, Perso...|
|             Armenia|[Beverages, Perso...|
|           Australia|[Beverages, Perso...|
|             Austria|[Beverages, Perso...|
|          Azerbaijan|[Beverages, Perso...|
|             Bahrain|[Beverages, Perso...|
|          Bangladesh|[Beverages, Perso...|
|            Barbados|[Beverages, Perso...|
|             Belarus|[Beverages, Perso...|
|             Belgium|[Beverages, Perso...|
|              Belize|[Beverages, Perso...|
|               Benin|[Beverages, Perso...|
|              Bhutan|[Beverages, Perso...|
|Bosnia and Herzeg...|[Beverages, Perso...|
|            Botswana|[Beverages

## 3. Find the total number of items sold in each country

In [71]:
# we need to use units sold for this
from pyspark.sql.functions import sum as sum_
sales_df\
    .groupBy('Country')\
    .agg(sum_('Units Sold').alias('Num items sold'))\
    .show()

+-----------+--------------+
|    Country|Num items sold|
+-----------+--------------+
|       Chad|       2660461|
|     Russia|       2579558|
|      Yemen|       2966519|
|    Senegal|       2716010|
|     Sweden|       2698756|
|   Kiribati|       2555774|
|    Eritrea|       2552497|
|Philippines|       2610149|
|   Djibouti|       2699545|
|      Tonga|       2565238|
|  Singapore|       2693579|
|   Malaysia|       2587267|
|       Fiji|       2613373|
|     Turkey|       2732629|
|     Malawi|       2645975|
|       Iraq|       2765491|
|    Germany|       2502470|
|    Comoros|       2556790|
|   Cambodia|       2946963|
|Afghanistan|       2805640|
+-----------+--------------+
only showing top 20 rows



## 4. Find the top five famous items list on the basis of each region.(Consider units sold while doing this.)

In [78]:
from pyspark.sql.functions import row_number, rank

region_sales_df = sales_df\
                    .groupBy('Region', 'Item Type')\
                    .agg(sum_('Units Sold').alias('Total Units Sold'))\
                    .orderBy('Region')

window_spec = W.partitionBy('Region')\
                .orderBy(col('Total Units Sold').desc())

region_sales_ranked_df = region_sales_df\
                            .withColumn('rn', rank().over(window_spec))\
                            .where('rn <= 5')\
                            .drop('rn')

# regions and their five best item types
region_sales_ranked_df.select('Region', 'Item Type').show()

# final result
region_sales_ranked_df\
    .withColumn('Item List', collect_list('Item Type').over(W.partitionBy('Region')))\
    .select('Region', 'Item List')\
    .distinct()\
    .show()

+--------------------+---------------+
|              Region|      Item Type|
+--------------------+---------------+
|                Asia|         Cereal|
|                Asia|         Snacks|
|                Asia|Office Supplies|
|                Asia|     Vegetables|
|                Asia|        Clothes|
|Australia and Oce...|  Personal Care|
|Australia and Oce...|     Vegetables|
|Australia and Oce...|         Cereal|
|Australia and Oce...|      Beverages|
|Australia and Oce...|        Clothes|
|Central America a...|      Cosmetics|
|Central America a...|        Clothes|
|Central America a...|           Meat|
|Central America a...|Office Supplies|
|Central America a...|      Baby Food|
|              Europe|         Cereal|
|              Europe|Office Supplies|
|              Europe|     Vegetables|
|              Europe|      Beverages|
|              Europe|         Fruits|
+--------------------+---------------+
only showing top 20 rows

+--------------------+----------------

## 5. Find all the regions and their famous sales channels.

In [75]:
sales_df\
    .withColumn('Sales Channels', collect_set('Sales Channel').over(W.partitionBy('Region')))\
    .select('Region', 'Sales Channels')\
    .distinct()\
    .show()

+--------------------+-----------------+
|              Region|   Sales Channels|
+--------------------+-----------------+
|                Asia|[Online, Offline]|
|Australia and Oce...|[Online, Offline]|
|Central America a...|[Online, Offline]|
|              Europe|[Online, Offline]|
|Middle East and N...|[Online, Offline]|
|       North America|[Online, Offline]|
|  Sub-Saharan Africa|[Online, Offline]|
+--------------------+-----------------+



## 6. Find  the list of countries and items and their respective units.

In [77]:
sales_df.groupBy('Country', 'Item Type').agg(sum_('Units Sold').alias('Units Sold')).orderBy('Country').show()

+-----------+---------------+----------+
|    Country|      Item Type|Units Sold|
+-----------+---------------+----------+
|Afghanistan|     Vegetables|    219937|
|Afghanistan|      Cosmetics|    217192|
|Afghanistan|Office Supplies|    166911|
|Afghanistan|           Meat|    273402|
|Afghanistan|        Clothes|    220429|
|Afghanistan|  Personal Care|    255956|
|Afghanistan|         Cereal|    256936|
|Afghanistan|      Beverages|    206154|
|Afghanistan|      Household|    261953|
|Afghanistan|         Fruits|    257336|
|Afghanistan|         Snacks|    237350|
|Afghanistan|      Baby Food|    232084|
|    Albania|         Cereal|    215238|
|    Albania|           Meat|    266123|
|    Albania|        Clothes|    250884|
|    Albania|         Snacks|    210384|
|    Albania|Office Supplies|    236822|
|    Albania|      Baby Food|    191480|
|    Albania|         Fruits|    252283|
|    Albania|  Personal Care|    222874|
+-----------+---------------+----------+
only showing top

## 7. In 2013, identify the regions which sold maximum and minimum units of item type Meat.

In [123]:
from pyspark.sql.functions import substring, max as max_, min as min_

sales_13_df = sales_df\
                .withColumn('year', substring('Order Date', -4, 4))\
                .where(col('year') == '2013')

# sales_13_df.show()

meat_sales_grouped_region_df = sales_13_df\
                                .where(col('Item Type') == 'Meat')\
                                .groupBy('Region', 'Item Type')\
                                .agg(sum_('Units Sold').alias('Units Sold'))

meat_sales_grouped_region_df\
    .orderBy('Units Sold')\
    .show()

+--------------------+---------+----------+
|              Region|Item Type|Units Sold|
+--------------------+---------+----------+
|       North America|     Meat|    106193|
|Australia and Oce...|     Meat|    449346|
|Central America a...|     Meat|    615706|
|Middle East and N...|     Meat|    745940|
|                Asia|     Meat|    956367|
|              Europe|     Meat|   1468932|
|  Sub-Saharan Africa|     Meat|   1491277|
+--------------------+---------+----------+



## 8. List all the items whose unit cost is less than 500

In [141]:
sales_df.filter(col('Unit Cost') < 500).select('Item Type', 'Unit Cost').distinct().show()

+-------------+---------+
|    Item Type|Unit Cost|
+-------------+---------+
|       Cereal|   117.11|
|         Meat|   364.69|
|    Baby Food|   159.42|
|       Fruits|     6.92|
|   Vegetables|    90.93|
|    Beverages|    31.79|
|Personal Care|    56.67|
|      Clothes|    35.84|
|    Cosmetics|   263.33|
|       Snacks|    97.44|
+-------------+---------+



## 9. Find the total cost, revenue and profit of each year.

In [143]:
year_sales_df = sales_df\
                    .withColumn('Year', substring('Order Date', -4, 4))\

year_sales_df\
    .groupBy('Year')\
    .agg(sum_('Total Cost').alias('Total Cost'), sum_('Total Revenue').alias('Total Revenue'), sum_('Total Profit').alias('Total Profit'))\
    .orderBy('Year')\
    .show()

+----+--------------------+--------------------+--------------------+
|Year|          Total Cost|       Total Revenue|        Total Profit|
+----+--------------------+--------------------+--------------------+
|2010|1.233298120617372...|1.752972613683793...|  5.19674493173579E9|
|2011|1.233635124975034...|1.751684120852689...| 5.180489954286942E9|
|2012|1.245034203973387...|1.762118501057573...| 5.170842975163134E9|
|2013|1.254475780787666...|1.780262842300091...| 5.257870612787804E9|
|2014| 1.26472629863539E10|1.786939230240254...| 5.222129311173204E9|
|2015|1.256502197052444...|1.779198425534374...| 5.226962291379013E9|
|2016|1.229711736956142...|1.746406771565606...| 5.166950335527011E9|
|2017| 7.018850991852474E9|1.000281918959213...|2.9839682052368097E9|
+----+--------------------+--------------------+--------------------+

