In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings —
# consider narrowing it to a specific category or module.
warnings.filterwarnings('ignore')

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('SalesOrdersbyRegionCountry')
    .getOrCreate()
)

23/07/01 18:54:26 WARN Utils: Your hostname, systemd resolves to a loopback address: 127.0.1.1; using 192.168.0.141 instead (on interface wlp3s0)
23/07/01 18:54:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/01 18:54:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Sanity check: show the type of the object bound to `spark`.
type(spark)

pyspark.sql.session.SparkSession

In [5]:
# Relative path of the sales CSV file that is loaded into Spark below.
data_file = 'data/sales_records.csv'

In [6]:
# Load the sales CSV into a Spark DataFrame in a single reader chain:
#   - format('csv')            -> parse the file as CSV
#   - option('header', ...)    -> treat the first row as column names
#   - option('inferSchema', ...) -> let Spark infer each column's type
#   - load(data_file)          -> read the file and return the DataFrame
sales_dataframe = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(data_file)
)

In [7]:
# Sanity check: show the type of the object bound to `sales_dataframe`.
type(sales_dataframe)

pyspark.sql.dataframe.DataFrame

In [8]:
# Display the DataFrame's repr, which lists every column with its type.
sales_dataframe

DataFrame[Region: string, Country: string, Item Type: string, Sales Channel: string, Order Priority: string, Order Date: string, Order ID: int, Ship Date: string, Units Sold: int, Unit Price: double, Unit Cost: double, Total Revenue: double, Total Cost: double, Total Profit: double]

In [9]:
# Preview the first five rows (long values are truncated by default).
sales_dataframe.show(5)

+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|              Region|             Country|    Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|
+--------------------+--------------------+-------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Middle East and N...|          Azerbaijan|       Snacks|       Online|             C| 10/8/2014|535113847|10/23/2014|       934|    152.58|    97.44|    142509.72|  91008.96|    51500.76|
|Central America a...|              Panama|    Cosmetics|      Offline|             L| 2/22/2015|874708545| 2/27/2015|      4551|     437.2|   263.33|    1989697.2|1198414.83|   791282.37|
|  Sub-Saharan Africa|Sao Tome and Prin...|       Fruit

In [10]:
# Preview only the columns used by the region/country order count.
(
    sales_dataframe
    .select('Region', 'Country', 'Order ID')
    .show(5)
)

+--------------------+--------------------+---------+
|              Region|             Country| Order ID|
+--------------------+--------------------+---------+
|Middle East and N...|          Azerbaijan|535113847|
|Central America a...|              Panama|874708545|
|  Sub-Saharan Africa|Sao Tome and Prin...|854349935|
|  Sub-Saharan Africa|Sao Tome and Prin...|892836844|
|Central America a...|              Belize|129280602|
+--------------------+--------------------+---------+
only showing top 5 rows



In [11]:
# Same preview, but with truncate=False so long region and country names
# are printed in full.
(
    sales_dataframe
    .select('Region', 'Country', 'Order ID')
    .show(5, truncate=False)
)

+---------------------------------+---------------------+---------+
|Region                           |Country              |Order ID |
+---------------------------------+---------------------+---------+
|Middle East and North Africa     |Azerbaijan           |535113847|
|Central America and the Caribbean|Panama               |874708545|
|Sub-Saharan Africa               |Sao Tome and Principe|854349935|
|Sub-Saharan Africa               |Sao Tome and Principe|892836844|
|Central America and the Caribbean|Belize               |129280602|
+---------------------------------+---------------------+---------+
only showing top 5 rows



## Group the DataFrame by region and country and count the orders

In [32]:
# Group the sales records by (Region, Country).
group_by_region_country = sales_dataframe.groupBy('Region', 'Country')

# Count the orders in each group. count() on a column counts its non-null
# values. The column is referenced by its exact name, 'Order ID': the previous
# spelling 'Order Id' only resolved because Spark's column resolution is
# case-insensitive by default (spark.sql.caseSensitive=false).
count_by_region_country = group_by_region_country.agg(
    count('Order ID').alias('CountByRegionCountry')
)

# Print the first 20 groups (the default row limit of show()).
count_by_region_country.show()

+--------------------+--------------------+--------------------+
|              Region|             Country|CountByRegionCountry|
+--------------------+--------------------+--------------------+
|                Asia|               Japan|                 560|
|Middle East and N...|             Morocco|                 558|
|  Sub-Saharan Africa|            Ethiopia|                 554|
|              Europe|              Russia|                 521|
|Middle East and N...|                Iran|                 531|
|Central America a...|                Cuba|                 540|
|  Sub-Saharan Africa|         Seychelles |                 561|
|  Sub-Saharan Africa|            Tanzania|                 549|
|  Sub-Saharan Africa|               Gabon|                 534|
|  Sub-Saharan Africa|              Zambia|                 541|
|                Asia|           Singapore|                 548|
|  Sub-Saharan Africa|Central African R...|                 535|
|  Sub-Saharan Africa|   

### Sort the grouped DataFrame by count, descending

In [34]:
# Rank the (Region, Country) groups from the most orders to the fewest.
count_by_region_country_ordered = count_by_region_country.orderBy(
    count_by_region_country['CountByRegionCountry'].desc()
)

# Print the 20 groups with the highest order counts.
count_by_region_country_ordered.show()

+--------------------+--------------------+--------------------+
|              Region|             Country|CountByRegionCountry|
+--------------------+--------------------+--------------------+
|  Sub-Saharan Africa|               Sudan|                 623|
|Australia and Oce...|         New Zealand|                 593|
|              Europe|        Vatican City|                 590|
|              Europe|               Malta|                 589|
|  Sub-Saharan Africa|          Mozambique|                 589|
|Middle East and N...|            Tunisia |                 584|
|                Asia|            Cambodia|                 584|
|Central America a...|              Panama|                 578|
|  Sub-Saharan Africa|              Rwanda|                 576|
|  Sub-Saharan Africa|        South Africa|                 575|
|  Sub-Saharan Africa|       Cote d'Ivoire|                 575|
|Australia and Oce...|Federated States ...|                 574|
|Central America a...|   