In [77]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.window import Window
spark = SparkSession.builder.appName("PySparkPractice").getOrCreate()

In [78]:
#path = "C:/Users/srima/Documents/learning/Technology/Data Engineering/PySpark/Spark-The-Definitive-Guide-master/data/retail-data/all"
path = "C:/Users/srima/Documents/learning/Technology/Data Engineering/PySpark/Spark-The-Definitive-Guide-master/data/retail-data/by-day"

In [79]:
#Create Shema
schema = StructType([
    StructField("InvoiceNo", IntegerType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", TimestampType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("CustomerID", StringType(), True),
    StructField("Country", StringType(), True)
])

df = spark.read.format("csv").option("header", "true").option("inferschema", "true").schema(schema).load(f"{path}/2010-12-01.csv").coalesce(5)
df.cache()
df.show()

df.createOrReplaceTempView("dfTable")


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [80]:
df.count()
df.printSchema()

root
 |-- InvoiceNo: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



#### count

In [81]:
df.select(count("StockCode")).show()

+----------------+
|count(StockCode)|
+----------------+
|            3108|
+----------------+



#### CountDistinct  


In [82]:
df.select(count_distinct("StockCode")).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     1351|
+-------------------------+



#### approx_count_distinct

In [83]:
df.select(approx_count_distinct("StockCode")).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            1282|
+--------------------------------+



#### first and last

In [84]:
df.select(first("StockCode"), last("StockCode")).show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          20755|
+----------------+---------------+



#### min and max

In [85]:
df.select(min("Quantity"), max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|          -24|          600|
+-------------+-------------+



In [86]:
df.select(sum("Quantity")).show()

+-------------+
|sum(Quantity)|
+-------------+
|        26814|
+-------------+



In [87]:
df.select(sum_distinct("Quantity")).show()

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                  4690|
+----------------------+



#### avg


In [88]:
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purches"),
    avg("Quantity").alias("avg_purches"),
    expr("mean(Quantity)").alias("mean_purches")        
    ).selectExpr('total_purches', 'avg_purches','mean_purches').show()

+-------------+-----------------+-----------------+
|total_purches|      avg_purches|     mean_purches|
+-------------+-----------------+-----------------+
|        26814|8.627413127413128|8.627413127413128|
+-------------+-----------------+-----------------+



#### Variance and Standerd Deviation

In [89]:
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show()

+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
|695.2492099104054| 695.4729785650273|  26.367578764657278|   26.371821677029203|
+-----------------+------------------+--------------------+---------------------+



#### skewness and kurtosis

In [90]:
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

+------------------+------------------+
|skewness(Quantity)|kurtosis(Quantity)|
+------------------+------------------+
|11.384721296581182|182.91886804842397|
+------------------+------------------+



#### Covariance and Correlation

In [91]:
df.select(corr('InvoiceNo','Quantity'), covar_pop('InvoiceNo','Quantity'), covar_samp('InvoiceNo','Quantity')).show()

+-------------------------+------------------------------+-------------------------------+
|corr(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|
+-------------------------+------------------------------+-------------------------------+
|     -0.12225395743668731|            -235.4868448608685|            -235.56327681311157|
+-------------------------+------------------------------+-------------------------------+



#### Aggregation to complex Types

In [92]:
df.agg(collect_set("Country"), collect_list("Country")).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Netherlands, EIR...| [United Kingdom, ...|
+--------------------+---------------------+



#### Grouping

In [93]:
df.groupBy('InvoiceNo', 'CustomerID').count().show()

+---------+----------+-----+
|InvoiceNo|CustomerID|count|
+---------+----------+-----+
|   536460|   14849.0|   14|
|   536525|   14078.0|   13|
|   536536|   18144.0|    3|
|   536477|   16210.0|   14|
|   536545|      NULL|    1|
|   536405|   14045.0|    1|
|   536514|   17951.0|    5|
|   536416|   13255.0|    6|
|   536587|   14142.0|   22|
|   536398|   13448.0|   17|
|   536597|   18011.0|   28|
|   536369|   13047.0|    1|
|   536420|   16583.0|   14|
|   536437|   13694.0|    6|
|   536572|   16539.0|   21|
|   536520|   14729.0|   71|
|   536544|      NULL|  527|
|   536586|   18229.0|    7|
|   536464|   17968.0|   85|
|   536393|   13747.0|    1|
+---------+----------+-----+
only showing top 20 rows


#### Grouping with Expressions  

In [94]:
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias('quan'),
    expr("count(Quantity)")
).show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536532|  73|             73|
|   536575|   8|              8|
|   536521|   1|              1|
|   536539|  27|             27|
|   536595|  11|             11|
|   536384|  13|             13|
|   536545|   1|              1|
|   536529|   9|              9|
|   536385|   7|              7|
|   536556|   8|              8|
|   536370|  20|             20|
|   536590|  14|             14|
|   536502|   5|              5|
|   536544| 527|            527|
|   536375|  16|             16|
|   536392|  10|             10|
|   536393|   1|              1|
|   536412|  81|             81|
|   536591|  40|             40|
|   536387|   5|              5|
+---------+----+---------------+
only showing top 20 rows


#### Grouping With Map

In [95]:
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"), expr('stddev_pop(Quantity)')).show()

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536532| 25.36986301369863|  16.850272831671976|
|   536575|            107.25|  62.609404245688204|
|   536521|               1.0|                 0.0|
|   536539| 5.925925925925926|   4.319858759058765|
|   536595| 9.818181818181818|   17.07107165948599|
|   536384|14.615384615384615|  15.750645708563392|
|   536545|               1.0|                 0.0|
|   536529| 4.222222222222222|   3.224137361899313|
|   536385| 7.571428571428571|   4.271404682207444|
|   536556|             8.875|   7.270445309607934|
|   536370|             22.45|   8.935742834258381|
|   536590| 5.285714285714286|   4.043134770881401|
|   536502|               7.8|  5.3814496188294845|
|   536544| 2.292220113851992|  3.2337751780564514|
|   536375|               5.5|  1.5000000000000002|
|   536392|              10.3|   6.450581369148055|
|   536393| 

### Window Functions

In [96]:
dfWithDate = df.withColumn("Date", to_date(col("InvoiceDate"), "MM/dd/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")
dfWithDate.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      Date|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|
|   536365|    22752|SET 7 BABUSHKA NE...|      

In [97]:
dfWithDate.printSchema()

root
 |-- InvoiceNo: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Date: date (nullable = true)



In [98]:
windowSpace = Window.partitionBy("CustomerID", "date").orderBy(desc("Quantity")).rowsBetween(Window.unboundedPreceding, Window.currentRow)


In [99]:

maxPurchesQuentity = max(col("Quantity")).over(windowSpace)
purcheRank = rank().over(windowSpace)
purcheDenseRank = dense_rank().over(windowSpace)


In [100]:
dfWithDate.where("CustomerID IS NOT NULL").orderBy("CustomerID")\
.select(
    col("CustomerID"),
    col("date"),
    col("Quantity"),
    purcheDenseRank.alias('QuantityDenseRank'),
    purcheRank.alias("QuantityRank"),
    maxPurchesQuentity.alias("MaxPurchaseQuentity")
).show()

+----------+----------+--------+-----------------+------------+-------------------+
|CustomerID|      date|Quantity|QuantityDenseRank|QuantityRank|MaxPurchaseQuentity|
+----------+----------+--------+-----------------+------------+-------------------+
|   12431.0|2010-12-01|      24|                1|           1|                 24|
|   12431.0|2010-12-01|      24|                1|           1|                 24|
|   12431.0|2010-12-01|      12|                2|           3|                 24|
|   12431.0|2010-12-01|       8|                3|           4|                 24|
|   12431.0|2010-12-01|       6|                4|           5|                 24|
|   12431.0|2010-12-01|       6|                4|           5|                 24|
|   12431.0|2010-12-01|       6|                4|           5|                 24|
|   12431.0|2010-12-01|       4|                5|           8|                 24|
|   12431.0|2010-12-01|       4|                5|           8|             

In [101]:
from functools import reduce
#List of all column
columns = df.columns

# Filter rows where all columns are null
null_rows_df = df.filter(
    reduce(lambda x, y: x | y, [col(c).isNull() for c in columns])
)

null_rows_df.show()
null_rows_df.count()


+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|     NULL|        D|            Discount|      -1|2010-12-01 09:41:00|     27.5|   14527.0|United Kingdom|
|     NULL|   35004C|SET OF 3 COLOURED...|      -1|2010-12-01 09:49:00|     4.65|   15311.0|United Kingdom|
|     NULL|    22556|PLASTERS IN TIN C...|     -12|2010-12-01 10:24:00|     1.65|   17548.0|United Kingdom|
|     NULL|    21984|PACK OF 12 PINK P...|     -24|2010-12-01 10:24:00|     0.29|   17548.0|United Kingdom|
|     NULL|    21983|PACK OF 12 BLUE P...|     -24|2010-12-01 10:24:00|     0.29|   17548.0|United Kingdom|
|     NULL|    21980|PACK OF 12 RED RE...|     -24|2010-12-01 10:24:00|     0.29|   17548.0|United Kingdom|
|     NULL|    21484|CHICK G

1166

#### Grouping Sets

In [102]:
dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

In [103]:
dfNoNull.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      Date|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|
|   536365|    22752|SET 7 BABUSHKA NE...|      

#### Rollups

In [104]:
rollUpDf = dfNoNull.rollup("Date", "Country").agg(sum("Quantity").alias("total_quantity")).orderBy("Date")
rollUpDf.show()

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      NULL|          NULL|         26814|
|2010-12-01|     Australia|           107|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|        France|           449|
|2010-12-01|          NULL|         26814|
|2010-12-01|        Norway|          1852|
|2010-12-01|       Germany|           117|
|2010-12-01|          EIRE|           243|
|2010-12-01|   Netherlands|            97|
+----------+--------------+--------------+



#### Cube

In [107]:
dfNoNull.cube("date", "Country") \
    .agg(sum(col("Quantity")).alias("total_quantity")) \
    .select("date", "Country", "total_quantity") \
    .orderBy("date") \
    .show()

+----------+--------------+--------------+
|      date|       Country|total_quantity|
+----------+--------------+--------------+
|      NULL|       Germany|           117|
|      NULL|        France|           449|
|      NULL|United Kingdom|         23949|
|      NULL|          NULL|         26814|
|      NULL|     Australia|           107|
|      NULL|        Norway|          1852|
|      NULL|          EIRE|           243|
|      NULL|   Netherlands|            97|
|2010-12-01|     Australia|           107|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|        France|           449|
|2010-12-01|          NULL|         26814|
|2010-12-01|        Norway|          1852|
|2010-12-01|       Germany|           117|
|2010-12-01|          EIRE|           243|
|2010-12-01|   Netherlands|            97|
+----------+--------------+--------------+



#### Pivot

In [113]:
pivotedDf = dfWithDate.groupBy("Date").pivot("Country").sum()
pivotedDf.show()


+----------+------------------------+-----------------------+------------------------+-------------------+------------------+-------------------+---------------------+--------------------+---------------------+----------------------+---------------------+----------------------+--------------------------+-------------------------+--------------------------+---------------------+--------------------+---------------------+-----------------------------+----------------------------+-----------------------------+
|      Date|Australia_sum(InvoiceNo)|Australia_sum(Quantity)|Australia_sum(UnitPrice)|EIRE_sum(InvoiceNo)|EIRE_sum(Quantity)|EIRE_sum(UnitPrice)|France_sum(InvoiceNo)|France_sum(Quantity)|France_sum(UnitPrice)|Germany_sum(InvoiceNo)|Germany_sum(Quantity)|Germany_sum(UnitPrice)|Netherlands_sum(InvoiceNo)|Netherlands_sum(Quantity)|Netherlands_sum(UnitPrice)|Norway_sum(InvoiceNo)|Norway_sum(Quantity)|Norway_sum(UnitPrice)|United Kingdom_sum(InvoiceNo)|United Kingdom_sum(Quantity)|Unite