In [1]:
import findspark

# Locate the local Spark installation and make it importable, so that
# `import pyspark` in the next cell succeeds. Must run before that import.
findspark.init()

In [2]:
import pyspark

from pyspark.sql import SparkSession

# Reuse the already running SparkSession if there is one; otherwise create a
# new session with the default configuration.
spark = SparkSession.builder.getOrCreate()

# Import, analyse and prepare data

## Store Data

In [3]:
# Load the store master data. The first line of the CSV is treated as the
# header row; no schema is supplied, so the reader's defaults apply.
stores = spark.read.option("header", "true").csv("data/stores.csv")

In [4]:
# Preview the store table (show() prints only the first 20 rows).
stores.show()

+-----+----+------+
|Store|Type|  Size|
+-----+----+------+
|    1|   A|151315|
|    2|   A|202307|
|    3|   B| 37392|
|    4|   A|205863|
|    5|   B| 34875|
|    6|   A|202505|
|    7|   B| 70713|
|    8|   A|155078|
|    9|   B|125833|
|   10|   B|126512|
|   11|   A|207499|
|   12|   B|112238|
|   13|   A|219622|
|   14|   A|200898|
|   15|   B|123737|
|   16|   B| 57197|
|   17|   B| 93188|
|   18|   B|120653|
|   19|   A|203819|
|   20|   A|203742|
+-----+----+------+
only showing top 20 rows



Because I want to make predictions per store, the store attributes themselves (`Type`, `Size`) are not relevant for my model and are not used any further.

## Feature Data

In [5]:
# Load the per-store, per-date feature data. The first line of the CSV is
# treated as the header row; all columns arrive as strings (see the schema
# printed below).
features = spark.read.option("header", "true").csv("data/features.csv")

In [6]:
# Preview the raw feature rows as read from the CSV.
features.show()

+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|        CPI|Unemployment|IsHoliday|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|    1|05/02/2010|      42.31|     2.572|       NA|       NA|       NA|       NA|       NA|211.0963582|       8.106|    FALSE|
|    1|12/02/2010|      38.51|     2.548|       NA|       NA|       NA|       NA|       NA|211.2421698|       8.106|     TRUE|
|    1|19/02/2010|      39.93|     2.514|       NA|       NA|       NA|       NA|       NA|211.2891429|       8.106|    FALSE|
|    1|26/02/2010|      46.63|     2.561|       NA|       NA|       NA|       NA|       NA|211.3196429|       8.106|    FALSE|
|    1|05/03/2010|       46.5|     2.625|       NA|       NA|       NA|       NA|       NA|211.3501429|       8

In [7]:
# Inspect the column types of the raw feature data.
features.printSchema()

root
 |-- Store: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Fuel_Price: string (nullable = true)
 |-- MarkDown1: string (nullable = true)
 |-- MarkDown2: string (nullable = true)
 |-- MarkDown3: string (nullable = true)
 |-- MarkDown4: string (nullable = true)
 |-- MarkDown5: string (nullable = true)
 |-- CPI: string (nullable = true)
 |-- Unemployment: string (nullable = true)
 |-- IsHoliday: string (nullable = true)



All columns were read as strings, so they have to be cast to their correct data types.

Because MarkDown data is missing for more than half of the records, I decided to exclude the `MarkDown1`–`MarkDown5` columns from the model.

In [8]:
from pyspark.sql.types import DoubleType, IntegerType, DateType, BooleanType
from pyspark.sql.functions import to_date


# Cast the string columns to their proper types and keep only the columns
# used later on; MarkDown1-5 are dropped by the final select.
# Note: this rebinds `features`, so the raw string-typed frame is no longer
# available after this cell has run.
features = (
    features
    .withColumn("Store", features["Store"].cast(IntegerType()))
    # Dates in the file are written day-first, e.g. 05/02/2010 -> 2010-02-05.
    .withColumn("Date", to_date(features["Date"], "dd/MM/yyyy"))
    .withColumn("Temperature", features["Temperature"].cast(DoubleType()))
    .withColumn("Fuel_Price", features["Fuel_Price"].cast(DoubleType()))
    .withColumn("CPI", features["CPI"].cast(DoubleType()))
    .withColumn("Unemployment", features["Unemployment"].cast(DoubleType()))
    .withColumn("IsHoliday", features["IsHoliday"].cast(BooleanType()))
    .select(
        "Store",
        "Date",
        "Temperature",
        "Fuel_Price",
        "CPI",
        "Unemployment",
        "IsHoliday",
    )
)

In [9]:
# Verify that the casts produced the intended column types.
features.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- CPI: double (nullable = true)
 |-- Unemployment: double (nullable = true)
 |-- IsHoliday: boolean (nullable = true)



In [10]:
# Preview the typed feature data; the MarkDown columns are gone.
features.show()

+-----+----------+-----------+----------+-----------+------------+---------+
|Store|      Date|Temperature|Fuel_Price|        CPI|Unemployment|IsHoliday|
+-----+----------+-----------+----------+-----------+------------+---------+
|    1|2010-02-05|      42.31|     2.572|211.0963582|       8.106|    false|
|    1|2010-02-12|      38.51|     2.548|211.2421698|       8.106|     true|
|    1|2010-02-19|      39.93|     2.514|211.2891429|       8.106|    false|
|    1|2010-02-26|      46.63|     2.561|211.3196429|       8.106|    false|
|    1|2010-03-05|       46.5|     2.625|211.3501429|       8.106|    false|
|    1|2010-03-12|      57.79|     2.667|211.3806429|       8.106|    false|
|    1|2010-03-19|      54.58|      2.72| 211.215635|       8.106|    false|
|    1|2010-03-26|      51.45|     2.732|211.0180424|       8.106|    false|
|    1|2010-04-02|      62.27|     2.719|210.8204499|       7.808|    false|
|    1|2010-04-09|      65.86|      2.77|210.6228574|       7.808|    false|

## Sales Data

In [11]:
# Load the weekly sales per store and department. The first line of the CSV
# is treated as the header row; all columns arrive as strings (see the schema
# printed below).
sales = spark.read.option("header", "true").csv("data/sales.csv")

In [12]:
# Preview the raw sales rows as read from the CSV.
sales.show()

+-----+----+----------+------------+---------+
|Store|Dept|      Date|Weekly_Sales|IsHoliday|
+-----+----+----------+------------+---------+
|    1|   1|05/02/2010|     24924.5|    FALSE|
|    1|   1|12/02/2010|    46039.49|     TRUE|
|    1|   1|19/02/2010|    41595.55|    FALSE|
|    1|   1|26/02/2010|    19403.54|    FALSE|
|    1|   1|05/03/2010|     21827.9|    FALSE|
|    1|   1|12/03/2010|    21043.39|    FALSE|
|    1|   1|19/03/2010|    22136.64|    FALSE|
|    1|   1|26/03/2010|    26229.21|    FALSE|
|    1|   1|02/04/2010|    57258.43|    FALSE|
|    1|   1|09/04/2010|    42960.91|    FALSE|
|    1|   1|16/04/2010|    17596.96|    FALSE|
|    1|   1|23/04/2010|    16145.35|    FALSE|
|    1|   1|30/04/2010|    16555.11|    FALSE|
|    1|   1|07/05/2010|    17413.94|    FALSE|
|    1|   1|14/05/2010|    18926.74|    FALSE|
|    1|   1|21/05/2010|    14773.04|    FALSE|
|    1|   1|28/05/2010|    15580.43|    FALSE|
|    1|   1|04/06/2010|    17558.09|    FALSE|
|    1|   1|1

In [13]:
# Inspect the column types of the raw sales data.
sales.printSchema()

root
 |-- Store: string (nullable = true)
 |-- Dept: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Weekly_Sales: string (nullable = true)
 |-- IsHoliday: string (nullable = true)



As with the feature data, all columns were read as strings and have to be cast to their correct data types.

In [15]:
# Cast every sales column from string to its proper type.
# Note: this rebinds `sales`, so the raw string-typed frame is no longer
# available after this cell has run.
sales = (
    sales
    .withColumn("Store", sales["Store"].cast(IntegerType()))
    .withColumn("Dept", sales["Dept"].cast(IntegerType()))
    # Dates in the file are written day-first, e.g. 05/02/2010 -> 2010-02-05.
    .withColumn("Date", to_date(sales["Date"], "dd/MM/yyyy"))
    .withColumn("Weekly_Sales", sales["Weekly_Sales"].cast(DoubleType()))
    .withColumn("IsHoliday", sales["IsHoliday"].cast(BooleanType()))
)

**TODO:** Decide how to address the individual weeks, e.g. by converting each date to a week number. This could be done as a final step, after the other transformations.

In [16]:
# Preview the typed sales data.
sales.show()

+-----+----+----------+------------+---------+
|Store|Dept|      Date|Weekly_Sales|IsHoliday|
+-----+----+----------+------------+---------+
|    1|   1|2010-02-05|     24924.5|    false|
|    1|   1|2010-02-12|    46039.49|     true|
|    1|   1|2010-02-19|    41595.55|    false|
|    1|   1|2010-02-26|    19403.54|    false|
|    1|   1|2010-03-05|     21827.9|    false|
|    1|   1|2010-03-12|    21043.39|    false|
|    1|   1|2010-03-19|    22136.64|    false|
|    1|   1|2010-03-26|    26229.21|    false|
|    1|   1|2010-04-02|    57258.43|    false|
|    1|   1|2010-04-09|    42960.91|    false|
|    1|   1|2010-04-16|    17596.96|    false|
|    1|   1|2010-04-23|    16145.35|    false|
|    1|   1|2010-04-30|    16555.11|    false|
|    1|   1|2010-05-07|    17413.94|    false|
|    1|   1|2010-05-14|    18926.74|    false|
|    1|   1|2010-05-21|    14773.04|    false|
|    1|   1|2010-05-28|    15580.43|    false|
|    1|   1|2010-06-04|    17558.09|    false|
|    1|   1|2

In [17]:
# Verify that the casts produced the intended column types.
sales.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Dept: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Weekly_Sales: double (nullable = true)
 |-- IsHoliday: boolean (nullable = true)



# Data Statistics

In [19]:
# Summary statistics (count, mean, stddev, min, max) for the sales data.
# The output covers Store, Dept and Weekly_Sales only.
sales.describe().show()

+-------+------------------+------------------+------------------+
|summary|             Store|              Dept|      Weekly_Sales|
+-------+------------------+------------------+------------------+
|  count|            421570|            421570|            421570|
|   mean|22.200545579619043| 44.26031738501317|15981.258123467534|
| stddev| 12.78529738990308|30.492054015786014|22711.183519163187|
|    min|                 1|                 1|          -4988.94|
|    max|                45|                99|         693099.36|
+-------+------------------+------------------+------------------+



In [20]:
# Summary statistics (count, mean, stddev, min, max) for the feature data.
# Note in the output that CPI and Unemployment have fewer non-null values
# (7605) than the other columns (8190), i.e. some records lack them.
features.describe().show()

+-------+---------------+-----------------+-------------------+------------------+-----------------+
|summary|          Store|      Temperature|         Fuel_Price|               CPI|     Unemployment|
+-------+---------------+-----------------+-------------------+------------------+-----------------+
|  count|           8190|             8190|               8190|              7605|             7605|
|   mean|           23.0|59.35619780219781| 3.4059918192918217|172.46080918276078|7.826821038790305|
| stddev|12.987966099514| 18.6786068489072|0.43133657110071383| 39.73834609860842|1.877258593917429|
|    min|              1|            -7.29|              2.472|           126.064|            3.684|
|    max|             45|           101.95|              4.468|       228.9764563|           14.313|
+-------+---------------+-----------------+-------------------+------------------+-----------------+



# Join Data

# Define Features/Categories

# Index and Build Feature Vector

# Create and train Model

# Evaluate Predictions

# Final Conclusions