In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

In [2]:
# Create the notebook's Spark session (or reuse one that is already running),
# labelled with the name of this analysis.
spark = (
    SparkSession.builder
    .appName("TotalOrdersPerRegionCountry")
    .getOrCreate()
)

In [3]:
# Sanity check: confirm the builder handed back a SparkSession object.
type(spark)

pyspark.sql.session.SparkSession

In [7]:
# Load the sales records CSV: the first line supplies the column names and
# Spark infers each column's type from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("sales_records.csv")
)

In [8]:
# Show the column names and the types that schema inference assigned to them.
data.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Item Type: string (nullable = true)
 |-- Sales Channel: string (nullable = true)
 |-- Order Priority: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Units Sold: integer (nullable = true)
 |-- Unit Price: double (nullable = true)
 |-- Unit Cost: double (nullable = true)
 |-- Total Revenue: double (nullable = true)
 |-- Total Cost: double (nullable = true)
 |-- Total Profit: double (nullable = true)



In [9]:
# Preview the first rows of the raw data; long string values are shortened
# with "..." in this default view.
data.show()

+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|              Region|             Country|      Item Type|Sales Channel|Order Priority|Order Date| Order ID| Ship Date|Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|
+--------------------+--------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Middle East and N...|          Azerbaijan|         Snacks|       Online|             C| 10/8/2014|535113847|10/23/2014|       934|    152.58|    97.44|    142509.72|  91008.96|    51500.76|
|Central America a...|              Panama|      Cosmetics|      Offline|             L| 2/22/2015|874708545| 2/27/2015|      4551|     437.2|   263.33|    1989697.2|1198414.83|   791282.37|
|  Sub-Saharan Africa|Sao Tome and Prin...|  

In [11]:
# Look at just the three columns the order count relies on, printing the
# first ten rows with full (untruncated) values.
order_key_columns = data.select("Region", "Country", "Order ID")
order_key_columns.show(n=10, truncate=False)

+---------------------------------+---------------------+---------+
|Region                           |Country              |Order ID |
+---------------------------------+---------------------+---------+
|Middle East and North Africa     |Azerbaijan           |535113847|
|Central America and the Caribbean|Panama               |874708545|
|Sub-Saharan Africa               |Sao Tome and Principe|854349935|
|Sub-Saharan Africa               |Sao Tome and Principe|892836844|
|Central America and the Caribbean|Belize               |129280602|
|Europe                           |Denmark              |473105037|
|Europe                           |Germany              |754046475|
|Middle East and North Africa     |Turkey               |772153747|
|Europe                           |United Kingdom       |847788178|
|Asia                             |Kazakhstan           |471623599|
+---------------------------------+---------------------+---------+
only showing top 10 rows



In [12]:
# Sanity check: the CSV reader returns a Spark DataFrame.
type(data)

pyspark.sql.dataframe.DataFrame

In [14]:
# Total number of orders for every (Region, Country) pair, busiest first.
# Each step is a lazy transformation; nothing runs until an action is called.
order_keys = data.select("Region", "Country", "Order ID")
orders_by_place = order_keys.groupBy("Region", "Country")
order_totals = orders_by_place.agg(count("Order ID").alias("Total Orders"))
count_sales_df = order_totals.orderBy("Total Orders", ascending=False)

In [15]:
# Display the pairs with the most orders, keeping long region names intact.
rows_to_show = 10
count_sales_df.show(n=rows_to_show, truncate=False)

+---------------------------------+------------+------------+
|Region                           |Country     |Total Orders|
+---------------------------------+------------+------------+
|Sub-Saharan Africa               |Sudan       |623         |
|Australia and Oceania            |New Zealand |593         |
|Europe                           |Vatican City|590         |
|Europe                           |Malta       |589         |
|Sub-Saharan Africa               |Mozambique  |589         |
|Middle East and North Africa     |Tunisia     |584         |
|Asia                             |Cambodia    |584         |
|Central America and the Caribbean|Panama      |578         |
|Sub-Saharan Africa               |Rwanda      |576         |
|Sub-Saharan Africa               |South Africa|575         |
+---------------------------------+------------+------------+
only showing top 10 rows



In [17]:
# Size of the aggregated result: one row per (Region, Country) pair.
region_country_pairs = count_sales_df.count()
print("Total Rows: ", region_country_pairs)

Total Rows:  185


# Spark has two types of transformations (plus actions):
    1. Narrow transformations
    2. Wide transformations
    
A narrow transformation is one where each output partition can be computed from a single input partition.
Data is distributed across partitions: for example, one partition may hold 200 records, a second 300 records,
and a third 250 records. Each input partition is processed on its own, and the required data operations are
carried out without the partitions having to exchange (shuffle) data.

Example: 
    import pyspark
    sc = pyspark.SparkContext('local[*]')
    
    big_list = range(1000)
    rdd = sc.parallelize(big_list,2)
    odds = rdd.filter(lambda x:x % 2 != 0)
    odds.take(5)
    
    age_list = [40,]

The per-partition results are then combined into a single output, and the partitions are not
required to exchange or shuffle data.
    
1. Narrow Transformation:
    1. Map
    2. FlatMap
    3. MapPartitions
    4. Filter
    5. Sample
    6. Union
    
2. Wide Transformation:
    Wide transformations require input from other partitions, so data must be shuffled before processing.
    
    1. Intersection
    2. Distinct
    3. ReduceByKey
    4. GroupByKey
    5. Join
    6. Cartesian
    7. Repartition
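    8. Coalesce (only when called with shuffle=True; by default coalesce is a narrow transformation that avoids a shuffle).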
    
1. Wide transformations require input from other partitions, so data must be shuffled before processing.
2. A transformation always produces a new DataFrame / RDD, whereas an action does not: it returns a result to the driver or writes output.

# DAG Visualization

PySpark drives Spark through the Py4J library, which lets the Python program call Spark's
Java API running in the JVM; the Python code itself is not compiled to Java bytecode.

Stages can tell a story about our Spark program.