In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession used by every later cell; getOrCreate() returns the
# already-running session if one exists instead of starting a second one.
sess = (
    SparkSession.builder
    .appName('Sales prediction')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the session object as this cell's output.
sess

In [4]:
# Troubleshooting: if a Hive DB-related exception is raised here, delete the
# db.lck file from C:\tools\spark2\bin\code\metastore_db and re-run this cell.
#
# Load the sales CSV, taking column names from the header row. No schema is
# supplied or inferred, so every column is read as a string.
salesDF = (
    sess.read
    .option('header', 'true')
    .csv('dataset/sales.csv')
)

In [5]:
import pprint

# Pretty-print the DataFrame's Python type first, then its (column, dtype) pairs.
for summary in (type(salesDF), salesDF.dtypes):
    pprint.pprint(summary)

<class 'pyspark.sql.dataframe.DataFrame'>
[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'string'),
 ('InvoiceDate', 'string'),
 ('UnitPrice', 'string'),
 ('CustomerID', 'string'),
 ('Country', 'string')]


In [16]:
# Preview the first five rows of the loaded data.
salesDF.show(n=5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [14]:
# Remove the InvoiceNo, StockCode and CustomerID columns. drop() returns a new
# DataFrame, so the name is rebound to the narrower frame.
# (To preview the result without rebinding, call .show(5) on the drop() result
# instead of assigning it.)
salesDF = salesDF.drop(
    'InvoiceNo',
    'StockCode',
    'CustomerID',
)

In [43]:
# Confirm which columns remain by previewing two rows.
salesDF.show(n=2)

+--------------------+--------+-------------------+---------+--------------+
|         Description|Quantity|        InvoiceDate|UnitPrice|       Country|
+--------------------+--------+-------------------+---------+--------------+
|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|United Kingdom|
| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|United Kingdom|
+--------------------+--------+-------------------+---------+--------------+
only showing top 2 rows



### Check nulls

In [63]:
# Count the rows whose Description is NULL. Swap the column name to run the
# same check on another column, e.g. InvoiceDate.
# NOTE(review): execution counts are out of order in the saved notebook, so a
# stored result may come from a run after the fillna cell - re-run on a fresh
# kernel to confirm the true null count.
salesDF.where(salesDF['Description'].isNull()).count()

0

### Fill nulls with fillna

In [59]:
# Replace NULL descriptions with the placeholder text 'Empty'; only the
# Description column is touched.
salesDF = salesDF.fillna('Empty', subset=['Description'])

### Examine totals with groupBy and sorting

In [62]:
# Total quantity per product description, showing the first three descriptions
# in ascending order.
#
# Quantity is still a string column at this point; sum() casts it implicitly,
# which is why the totals are displayed as doubles (e.g. 2.0).
(
    salesDF
    .groupBy('Description')
    .agg({"Quantity": "sum"})
    # Dict-style agg names its result column "sum(Quantity)". Rename using that
    # exact spelling: withColumnRenamed silently does nothing when the name does
    # not match, and an upper-case "SUM(...)" only matches while Spark's
    # case-insensitive column resolution is enabled.
    .withColumnRenamed("sum(Quantity)", "Total Quantity")
    .sort("Description", ascending=True)
    .show(3)
)

+--------------------+--------------+
|         Description|Total Quantity|
+--------------------+--------------+
| 4 PURPLE FLOCK D...|           2.0|
| SET 2 TEA TOWELS...|          46.0|
|"CHARLIE+LOLA""EX...|           6.0|
+--------------------+--------------+
only showing top 3 rows



### Column cast

In [72]:
# Reference view of the frame before any cast columns are added.
salesDF.show(n=2)

+--------------------+--------+-------------------+---------+--------------+
|         Description|Quantity|        InvoiceDate|UnitPrice|       Country|
+--------------------+--------+-------------------+---------+--------------+
|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|United Kingdom|
| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|United Kingdom|
+--------------------+--------+-------------------+---------+--------------+
only showing top 2 rows



In [76]:
# Cast using a type-name string: preview the frame with an extra "Qty" column
# holding Quantity converted to long. The result is only displayed; salesDF
# itself is not reassigned.
(
    salesDF
    .withColumn("Qty", salesDF.Quantity.cast("long"))
    .show(2)
)

+--------------------+--------+-------------------+---------+--------------+---+
|         Description|Quantity|        InvoiceDate|UnitPrice|       Country|Qty|
+--------------------+--------+-------------------+---------+--------------+---+
|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|United Kingdom|  6|
| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|United Kingdom|  6|
+--------------------+--------+-------------------+---------+--------------+---+
only showing top 2 rows



In [83]:
# Cast using a DataType instance: "Price" is UnitPrice converted via
# DoubleType(), shown alongside the string-cast "Qty" column for comparison.
# The result is only displayed; salesDF itself is not reassigned.
from pyspark.sql.types import DoubleType

qty_as_long = salesDF["Quantity"].cast("long")
price_as_double = salesDF["UnitPrice"].cast(DoubleType())
salesDF.withColumn("Qty", qty_as_long).withColumn("Price", price_as_double).show(2)

+--------------------+--------+-------------------+---------+--------------+---+-----+
|         Description|Quantity|        InvoiceDate|UnitPrice|       Country|Qty|Price|
+--------------------+--------+-------------------+---------+--------------+---+-----+
|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|United Kingdom|  6| 2.55|
| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|United Kingdom|  6| 3.39|
+--------------------+--------+-------------------+---------+--------------+---+-----+
only showing top 2 rows

