## MSCA 31013 - Big Data Platforms - Course Project

### Predictive analytics on Iowa Liquor Sales
### Submitted by:

#saurabhs
#dmcdonough
#dtallarico90

## 1. Exploratory Data Analysis

### 1.1 Reading and Cleaning Data

Step one is to import the necessary packages and load the data.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from pyspark.sql.functions import upper, col
from pyspark.sql import SQLContext
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# ML Algorithms for regression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor




import warnings
warnings.filterwarnings('ignore')


%matplotlib inline

### Start

In [2]:
from notebook import notebookapp

# Environment detection: if the first running Jupyter server reports the
# hostname "localhost" this is a local run; any other hostname is treated as RCC.
# Raises IndexError when no running server is listed.
servers = list(notebookapp.list_running_servers())
IsRCC = servers[0]['hostname'] != "localhost"

In [3]:
# Create (or attach to) the Spark session; only the application name differs
# between the RCC cluster and a local run.
app_name = 'Team10_project' if IsRCC else 'Team10_project_local'
spark = SparkSession.builder.appName(app_name).getOrCreate()

# Always derive the SparkContext from the session instead of relying on an `sc`
# variable pre-defined by the kernel, so the cell also works on a fresh kernel.
sc = spark.sparkContext
sql_sc = SQLContext(sc)

In [4]:
# Input CSV locations; edit these when the data moves.

# Path used when running locally (relative to the notebook's working directory).
BD_Local_File = 'Iowa_Liquor_Sales-clean.csv'

# Path used when running on RCC.
BD_RCC_File = '/user/saurabhs/BigData/Project/Iowa_Liquor_Sales-clean.csv'

In [5]:
%%time

# Both environments read the CSV with identical options; only the path differs.
csv_path = BD_RCC_File if IsRCC else BD_Local_File

# header: first row holds column names; inferSchema: let Spark infer column types;
# multiLine: allow quoted fields that contain line breaks.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("delimiter", ",")
    .load(csv_path)
)

CPU times: user 102 ms, sys: 30.4 ms, total: 132 ms
Wall time: 12min 5s


In [6]:
# Show how many partitions the loaded DataFrame currently has.
# If more parallelism is needed, uncomment the line below to repartition
# into 16 partitions.
df.rdd.getNumPartitions()
#df = df.repartition(16)

1

### Clean and transform

In [7]:
# Discard every row that has a null in any column.
df = df.na.drop(how='any')

In [8]:
# Keep only transactions with a strictly positive sale amount
# (zero-dollar and negative rows are removed).
df = df.where(F.col("Sale (Dollars)") > 0)

In [9]:
# Split the first column into invoice number and item number.
# The regex is a raw string so `\d` reaches Spark as a regex escape instead of
# being an invalid Python string escape.
df = df.withColumn('InvoiceItemNumClean', F.regexp_extract(F.col('Invoice/Item Number'), r'(\d+)', 1))
# First 6 digits -> invoice number, next 5 digits -> item (line) number.
df = df.withColumn('InvoiceNum', df["InvoiceItemNumClean"].substr(1, 6))
df = df.withColumn('ItemNum', df["InvoiceItemNumClean"].substr(7, 5))

# Add date columns. `Date` holds MM/dd/yyyy strings; parse it directly to a date
# using a four-letter year pattern.
df = df.withColumn('saledate', F.to_date(F.col('Date'), 'MM/dd/yyyy'))
df = df.withColumn("salemonth", F.month("saledate"))
df = df.withColumn("saleyear", F.year("saledate"))
df = df.withColumn('dow', F.date_format("saledate", 'EEEE'))  # full weekday name
# NOTE(review): the 'u' (day-number-of-week) pattern letter appears to be
# rejected by Spark 3.x's datetime formatter -- confirm the target Spark version.
df = df.withColumn('dow_number', F.date_format("saledate", 'u'))
df = df.withColumn("saleq", F.quarter("saledate"))

# Upper-case the free-text columns so values that differ only in case compare equal.
for text_col in ('City', 'Store Name', 'Category Name', 'Vendor Name', 'Item Description'):
    df = df.withColumn(text_col, F.upper(F.col(text_col)))

In [10]:
# Days between a fixed reference date and each row's sale date. The reference is
# a constant (not "today"), so the column is identical on every run.
# Presumably used downstream to derive store age -- confirm against later cells.
REFERENCE_DATE = '2019-03-09'  # ISO yyyy-MM-dd, parsed by to_date without a pattern
df = df.withColumn('date_diff', F.datediff(F.to_date(F.lit(REFERENCE_DATE)), F.col('saledate')))


In [11]:
# Register the cleaned DataFrame as the temp view "tempMain" so that later cells
# can build sub-tables from it with Spark SQL.
df.createOrReplaceTempView("tempMain")

### Begin Modeling

The first step is to select the market subset used for modeling.

In [12]:
# Build the modeling sample: rows from tempMain for Iowa City in sale year 2018.
# The cells that follow work on `dftest` rather than on the full `df`.
iowa_city_2018_sql = '''
SELECT * FROM tempMain WHERE city = 'IOWA CITY' and saleyear=2018
'''
dftest = spark.sql(iowa_city_2018_sql)
dftest.show(5)

+-------------------+----------+------------+--------------------+---------------+---------+--------+--------------------+-------------+-------+--------+--------------------+-------------+--------------------+-----------+--------------------+----+------------------+-----------------+-------------------+------------+--------------+--------------------+---------------------+-------------------+----------+-------+----------+---------+--------+-------+----------+-----+---------+
|Invoice/Item Number|      Date|Store Number|          Store Name|        Address|     City|Zip Code|      Store Location|County Number| County|Category|       Category Name|Vendor Number|         Vendor Name|Item Number|    Item Description|Pack|Bottle Volume (ml)|State Bottle Cost|State Bottle Retail|Bottles Sold|Sale (Dollars)|Volume Sold (Liters)|Volume Sold (Gallons)|InvoiceItemNumClean|InvoiceNum|ItemNum|  saledate|salemonth|saleyear|    dow|dow_number|saleq|date_diff|
+-------------------+----------+--------

In [13]:
# Number of rows in the sample.
total_rows = dftest.select(['saleyear']).count()
print('Total Rows: %d' % total_rows)

Total Rows: 67701


In [14]:
from pyspark.sql.functions import collect_set, col, count

# One basket per (store, invoice): the distinct item descriptions on that invoice.
basket_keys = ['Store Number', 'InvoiceNum']
item_set = collect_set('Item Description').alias('items')
baskets = dftest.groupBy(basket_keys).agg(item_set)

# Expose the baskets to Spark SQL and preview them without truncating the item lists.
baskets.createOrReplaceTempView('baskets')
baskets.show(truncate=False)

+------------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Next, fit the FP-Growth model and select an example merchant to highlight.

In [15]:
from pyspark.ml.fpm import FPGrowth

# FP-Growth over the `items` column with minimum support 0.1 and
# minimum confidence 0.25.
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=0.1,
    minConfidence=.25,
)
model = fpGrowth.fit(baskets)

# Frequent itemsets found by the model, registered as a temp view so the
# following cells can query them with Spark SQL.
mostPopularItemInABasket = model.freqItemsets
mostPopularItemInABasket.createOrReplaceTempView("mostPopularItemInABasket")

In [16]:
# Count how many frequent itemsets the model produced.
itemset_count_sql = '''
select count(*) as rows from mostPopularItemInABasket
'''
rows = spark.sql(itemset_count_sql)
rows.show()

+----+
|rows|
+----+
| 412|
+----+



In [17]:
# Top 20 frequent itemsets that contain more than two items, most frequent first.
top_itemsets_sql = '''select items, freq from mostPopularItemInABasket 
          where size(items) > 2 order by freq desc limit 20'''
spark.sql(top_itemsets_sql).show(truncate =False)

+-------------------------------------------------------------------------------+----+
|items                                                                          |freq|
+-------------------------------------------------------------------------------+----+
|[BLACK VELVET, TITOS HANDMADE VODKA, HAWKEYE VODKA]                            |343 |
|[SMIRNOFF 80PRF, BLACK VELVET, HAWKEYE VODKA]                                  |299 |
|[SMIRNOFF 80PRF, TITOS HANDMADE VODKA, HAWKEYE VODKA]                          |284 |
|[NEW AMSTERDAM VODKA, BLACK VELVET, HAWKEYE VODKA]                             |274 |
|[NEW AMSTERDAM VODKA, TITOS HANDMADE VODKA, HAWKEYE VODKA]                     |266 |
|[SMIRNOFF 80PRF, BLACK VELVET, TITOS HANDMADE VODKA]                           |261 |
|[JIM BEAM, BLACK VELVET, HAWKEYE VODKA]                                        |258 |
|[ADMIRAL NELSON SPICED RUM, BLACK VELVET, HAWKEYE VODKA]                       |256 |
|[ADMIRAL NELSON SPICED RUM, TITOS HANDMADE

In [18]:
# Show the 25 association rules with the highest lift (sorted by lift, descending)
model.associationRules.orderBy("lift", ascending=False).show(25, truncate=False)

+---------------------------------------------------+---------------------------------+-------------------+------------------+
|antecedent                                         |consequent                       |confidence         |lift              |
+---------------------------------------------------+---------------------------------+-------------------+------------------+
|[NEW AMSTERDAM VODKA, TITOS HANDMADE VODKA]        |[NEW AMSTERDAM PEACH]            |0.5182926829268293 |2.420228073962156 |
|[NEW AMSTERDAM VODKA, BLACK VELVET]                |[NEW AMSTERDAM PEACH]            |0.5123456790123457 |2.3924578462434907|
|[NEW AMSTERDAM VODKA, HAWKEYE VODKA]               |[NEW AMSTERDAM PEACH]            |0.5090439276485789 |2.3770399335330397|
|[NEW AMSTERDAM PEACH, BLACK VELVET]                |[NEW AMSTERDAM VODKA]            |0.7186147186147186 |2.3215655093206116|
|[NEW AMSTERDAM PEACH, HAWKEYE VODKA]               |[NEW AMSTERDAM VODKA]            |0.7137681159420289 |2.30

In [19]:
# Per-store sales dollars for three selected products, queried from the sample
# registered as the temp view "dftestsql".
dftest.createOrReplaceTempView("dftestsql")
product_sales_sql = '''
SELECT `Store Number` as storenum, `Item Description` as item, sum(`Sale (Dollars)`) as prodsales
FROM dftestsql
WHERE `Item Description` In ("SMIRNOFF 80PRF", "HAWKEYE VODKA", "FIREBALL CINNAMON WHISKEY")
GROUP BY `Store Number`,`Item Description`
'''
prodsales = spark.sql(product_sales_sql)

In [20]:
# Total sales dollars per store across all products in the sample.
total_sales_sql = '''
SELECT `Store Number` as storenum, sum(`Sale (Dollars)`) as totalsales
FROM dftestsql
GROUP BY `Store Number`
'''
totalsales = spark.sql(total_sales_sql)

In [21]:
# Collect both aggregates to pandas, then attach each store's total to its
# product rows. merge() without `on` joins on the columns the two frames share.
totalsales_pd = totalsales.toPandas()
prodsales_pd = prodsales.toPandas()
combined_pd = pd.merge(prodsales_pd, totalsales_pd)

In [22]:
# Share of each store's total sales contributed by the product.
combined_pd['ratio'] = combined_pd['prodsales'].div(combined_pd['totalsales'])

In [23]:
# Preview the merged frame: product sales, store total, and their ratio.
combined_pd.head()

Unnamed: 0,storenum,item,prodsales,totalsales,ratio
0,5219,SMIRNOFF 80PRF,1986.72,94120.91,0.021108
1,5219,HAWKEYE VODKA,2548.68,94120.91,0.027079
2,5219,FIREBALL CINNAMON WHISKEY,1354.24,94120.91,0.014388
3,3565,HAWKEYE VODKA,2921.19,175434.16,0.016651
4,3565,SMIRNOFF 80PRF,2521.04,175434.16,0.01437


In [24]:
# Wide table: one row per store, one column per product, values = sales ratio
# (0 where a store has no row for a product).
# aggfunc is passed as the string 'sum' rather than np.sum; recent pandas
# versions deprecate passing the NumPy callable here.
combined_pd_pv = pd.pivot_table(
    combined_pd,
    values='ratio',
    index=['storenum'],
    columns=['item'],
    aggfunc='sum',
    fill_value=0,
)
# Round-trip through records so the `storenum` index becomes a regular column.
combined_pd_pv_rows = pd.DataFrame(combined_pd_pv.to_records())

In [25]:
# Preview the pivoted per-store ratios (one column per product).
combined_pd_pv_rows.head()

Unnamed: 0,storenum,FIREBALL CINNAMON WHISKEY,HAWKEYE VODKA,SMIRNOFF 80PRF
0,2285,0.009562,0.004954,0.008429
1,2512,0.017269,0.008291,0.002337
2,2513,0.012607,0.029093,0.001408
3,2545,0.004867,0.015453,0.006629
4,2622,0.009963,0.017506,0.007759


In [26]:
# Median of each column across stores (note: median, not mean; the storenum
# column is included in the output but its median is not meaningful)
combined_pd_pv_rows.median()

storenum                     4712.500000
FIREBALL CINNAMON WHISKEY       0.011590
HAWKEYE VODKA                   0.043135
SMIRNOFF 80PRF                  0.019173
dtype: float64

In [27]:
# Attach each store's total sales, then list stores where HAWKEYE VODKA is more
# than 10% and SMIRNOFF 80PRF more than 3% of the store's sales.
# The filter applies no condition on FIREBALL CINNAMON WHISKEY; a store with a low
# Fireball share has to be picked by inspecting the displayed rows.
combined_pd_pv_rows = combined_pd_pv_rows.merge(totalsales_pd)
combined_pd_pv_rows[(combined_pd_pv_rows["HAWKEYE VODKA"]>0.1)&(combined_pd_pv_rows["SMIRNOFF 80PRF"]>0.03)]

Unnamed: 0,storenum,FIREBALL CINNAMON WHISKEY,HAWKEYE VODKA,SMIRNOFF 80PRF,totalsales
12,4457,0.002537,0.114482,0.03121,79811.11
15,4836,0.027317,0.101264,0.04562,42501.74
16,4837,0.015374,0.100559,0.039863,52685.32


In [28]:
# Product-level sales and ratios for store 4457.
is_store_4457 = combined_pd['storenum'] == 4457
combined_pd[is_store_4457]

Unnamed: 0,storenum,item,prodsales,totalsales,ratio
79,4457,SMIRNOFF 80PRF,2490.89,79811.11,0.03121
80,4457,FIREBALL CINNAMON WHISKEY,202.5,79811.11,0.002537
81,4457,HAWKEYE VODKA,9136.96,79811.11,0.114482


In [29]:
# Collect the whole sample to the driver as a pandas DataFrame for row-level inspection.
dftest_pd = dftest.toPandas()

In [30]:
# All transaction rows in the sample that belong to store 4457.
store_4457_mask = dftest_pd['Store Number'] == 4457
dftest_pd[store_4457_mask]

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,...,InvoiceItemNumClean,InvoiceNum,ItemNum,saledate,salemonth,saleyear,dow,dow_number,saleq,date_diff
224,INV-14219700001,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700001,142197,00001,2018-09-04,9,2018,Tuesday,2,3,186
225,INV-14219700002,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700002,142197,00002,2018-09-04,9,2018,Tuesday,2,3,186
226,INV-14219700003,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700003,142197,00003,2018-09-04,9,2018,Tuesday,2,3,186
227,INV-14219700004,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700004,142197,00004,2018-09-04,9,2018,Tuesday,2,3,186
228,INV-14219700005,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700005,142197,00005,2018-09-04,9,2018,Tuesday,2,3,186
229,INV-14219700006,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700006,142197,00006,2018-09-04,9,2018,Tuesday,2,3,186
230,INV-14219700007,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700007,142197,00007,2018-09-04,9,2018,Tuesday,2,3,186
231,INV-14219700008,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700008,142197,00008,2018-09-04,9,2018,Tuesday,2,3,186
232,INV-14219700009,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700009,142197,00009,2018-09-04,9,2018,Tuesday,2,3,186
233,INV-14219700010,09/04/2018,4457,KUM & GO #422 / IOWA CITY,731 S Riverside Dr,IOWA CITY,52246,731 S Riverside Dr\nIowa City 52246\n(41.65069...,52,JOHNSON,...,14219700010,142197,00010,2018-09-04,9,2018,Tuesday,2,3,186


In [31]:
# Median, across stores, of each store's total sales dollars in the sample.
sales_per_store = dftest_pd.groupby('Store Number')['Sale (Dollars)'].sum()
sales_per_store.median()

96465.18999999994