<h1> Task 1 - Market Basket Analysis and Product Recommendation  </h1>

<h6> GOAL: The goal of this task is to use Spark MLlib to build a model to generate association rules to quickly run the market basket analysis to uncover associations between different items, then further to provide recommendations for purchase on a distributed platform.   </h6>

In [0]:
# Importing the library
from pyspark.ml.fpm import FPGrowth

In [0]:
# 1.3 - Creating the training set
# Reading the dataframe
train_df = spark.read.format("csv").option("header", "false").load("dbfs:/FileStore/shared_uploads/clvrashmika@gmail.com/Lab5_Part1_TrainData.csv")

In [0]:
# Displaying the dataframe
display(train_df)

_c0,_c1,_c2,_c3,_c4
bread,,,,
peanut butter,apple,,,
peanut butter,bread,,,
peanut butter,bread,apple,,
peanut butter,bread,milk,,
peanut butter,bread,milk,chocolate,
bread,milk,orange,,
apple,chocolate,milk,,
peanut butter,milk,chocolate,apple,
cheese,bread,milk,potatoes,


In [0]:
# Modifying the dataframe to perform analysis
count = 0
mylist = []
for row in train_df.collect():
    temp = (count,list(filter(None, row)))
    mylist.append(temp)
    count = count + 1

In [0]:
# Printing the list to show the new format
mylist

Out[122]: [(0, ['bread']),
 (1, ['peanut butter', ' apple']),
 (2, [' peanut butter', ' bread']),
 (3, ['peanut butter', ' bread', ' apple']),
 (4, ['peanut butter', ' bread', ' milk']),
 (5, ['peanut butter', ' bread', ' milk', ' chocolate']),
 (6, ['bread', ' milk', ' orange']),
 (7, ['apple', ' chocolate', ' milk']),
 (8, ['peanut butter', ' milk', ' chocolate', ' apple']),
 (9, ['cheese', ' bread', ' milk', ' potatoes ']),
 (10, ['cheese', ' pasta', ' ketchup']),
 (11, ['milk', ' cheese', ' pasta', ' ketchup']),
 (12, ['pasta', ' ketchup', ' cheese', ' potatoes', ' milk']),
 (13, ['bread', ' milk', ' chocolate', ' pasta']),
 (14, [' milk', ' pasta', ' potatoes', ' ketchup', ' bread']),
 (15, ['apple', ' chocolate', ' pasta']),
 (16, ['milk', 'bread']),
 (17, ['apple', 'milk']),
 (18, ['milk', 'chocolate']),
 (19, ['milk'])]

In [0]:
# Creating a new dataframe from the above list
train_df1 = spark.createDataFrame(mylist, ["id", "items"])

In [0]:
# 1.4 - FP Growth Model
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.07)
model = fpGrowth.fit(train_df1)

In [0]:
#1.5 -  Display frequent itemsets.
model.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|            [ apple]|   3|
|[ apple, peanut b...|   3|
|[ apple, peanut b...|   1|
|[ apple, peanut b...|   1|
|[ apple, peanut b...|   1|
|[ apple, peanut b...|   1|
|    [ apple,  bread]|   1|
|[ apple,  chocolate]|   1|
|[ apple,  chocola...|   1|
|     [ apple,  milk]|   1|
|             [pasta]|   1|
|   [pasta,  ketchup]|   1|
|[pasta,  ketchup,...|   1|
|  [pasta,  potatoes]|   1|
|[pasta,  potatoes...|   1|
|[pasta,  potatoes...|   1|
|[pasta,  potatoes...|   1|
|    [pasta,  cheese]|   1|
|[pasta,  cheese, ...|   1|
|[pasta,  cheese, ...|   1|
+--------------------+----+
only showing top 20 rows



In [0]:
#1.6  Display generated association rules.
display(model.associationRules)

antecedent,consequent,confidence,lift,support
"List( potatoes , milk)",List(cheese),1.0,10.0,0.05
"List( potatoes , milk)",List( bread),1.0,3.333333333333333,0.05
List( potatoes ),List(cheese),1.0,10.0,0.05
List( potatoes ),List( bread),1.0,3.333333333333333,0.05
List( potatoes ),List( milk),1.0,2.2222222222222223,0.05
List( potatoes),List(pasta),0.5,10.0,0.05
List( potatoes),List( pasta),0.5,2.0,0.05
List( potatoes),List( ketchup),1.0,5.0,0.1
List( potatoes),List( bread),0.5,1.6666666666666667,0.05
List( potatoes),List( milk),1.0,2.2222222222222223,0.1


In [0]:
# 1.7 Creating a test set
test_df = spark.createDataFrame([
    (0, ['bread']),
    (1, ['potatoes','milk']),
    (2, ['chocolate']),
    (3, ['pasta', 'cheese']),
    (4, ['apple','milk']),
    (5, ['milk']),
    (6, ['chocolate', 'bread', 'milk']),
    (7, ['bread', 'milk'])
], ["id", "items"])

In [0]:
# 1.8 Making predictions
# transform examines the input items against all the association rules and summarize the consequents as prediction
display(model.transform(test_df))

id,items,prediction
0,List(bread),"List(milk, pasta, chocolate, milk, orange)"
1,"List(potatoes, milk)","List(apple, pasta, cheese, chocolate, bread, ketchup)"
2,List(chocolate),List(milk)
3,"List(pasta, cheese)","List( pasta, ketchup, bread, milk, potatoes , potatoes, cheese)"
4,"List(apple, milk)","List( pasta, cheese, chocolate, bread, ketchup, chocolate, milk)"
5,List(milk),"List(apple, pasta, cheese, chocolate, bread, ketchup)"
6,"List(chocolate, bread, milk)","List(apple, pasta, cheese, ketchup, chocolate, milk, orange)"
7,"List(bread, milk)","List(apple, pasta, cheese, chocolate, ketchup, chocolate, milk, orange)"


#####  1.10 Task 1 - Additional

In [0]:
# Reading and displaying the dataframe
extra_df = spark.read.format("csv").option("header", "false").load("dbfs:/FileStore/shared_uploads/clvrashmika@gmail.com/groceries.csv")
display(extra_df)

_c0,_c1,_c2,_c3
citrus fruit,semi-finished bread,margarine,ready soups
tropical fruit,yogurt,coffee,
whole milk,,,
pip fruit,yogurt,cream cheese,meat spreads
other vegetables,whole milk,condensed milk,long life bakery product
whole milk,butter,yogurt,rice
rolls/buns,,,
other vegetables,UHT-milk,rolls/buns,bottled beer
pot plants,,,
whole milk,cereals,,


In [0]:
# Modifying the dataframe to perform analysis
count = 0
mylist = []
for row in extra_df.collect():
    temp = (count,list(filter(None, row)))
    mylist.append(temp)
    count = count + 1
    
# Creating a new dataframe from the above list
extra_df1 = spark.createDataFrame(mylist, ["id", "items"])

In [0]:
# Splitting the dataset into train and test dataframes
trainDF, testDF = extra_df1.randomSplit([0.8, 0.2], seed=25)
print(trainDF.cache().count()) # Cache because accessing training data multiple times
print(testDF.count())

7884
1951


In [0]:
# FP Growth Model
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.01)
model = fpGrowth.fit(trainDF)

In [0]:
# Display frequent itemsets.
display(model.freqItemsets)

items,freq
List(margarine),209
List(citrus fruit),644
"List(citrus fruit, root vegetables)",109
"List(citrus fruit, sausage)",86
"List(citrus fruit, other vegetables)",151
"List(citrus fruit, tropical fruit)",154
"List(citrus fruit, yogurt)",87
"List(citrus fruit, whole milk)",148
List(white wine),92
List(frozen meals),153


In [0]:
# Display generated association rules.
display(model.associationRules)

antecedent,consequent,confidence,lift,support
List(other vegetables),List(citrus fruit),0.1107850330154072,1.3562565221948295,0.0191527143581938
List(other vegetables),List(chicken),0.0748349229640499,1.7664626726005073,0.0129375951293759
List(other vegetables),List(whole milk),0.3000733675715333,1.3358432693020716,0.0518772196854388
List(other vegetables),List(pip fruit),0.0946441672780631,1.2625628000342632,0.0163622526636225
List(other vegetables),List(rolls/buns),0.0814380044020542,0.6551604354140775,0.0140791476407914
List(other vegetables),List(frankfurter),0.061628760088041,1.018618751643849,0.0106544901065449
List(other vegetables),List(tropical fruit),0.1401320616287601,1.3294839637558902,0.0242262810755961
List(other vegetables),List(root vegetables),0.1966250917094644,1.949927324575368,0.0339928970065956
List(other vegetables),List(pork),0.0843727072633895,1.4492253247593976,0.0145865043125317
List(other vegetables),List(hamburger meat),0.0586940572267057,1.7202377218414442,0.0101471334348046


In [0]:
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
display(model.transform(testDF))

id,items,prediction
13,"List(frankfurter, rolls/buns, soda)","List(other vegetables, whole milk, yogurt, sausage)"
19,List(specialty bar),List()
25,"List(bottled water, canned beer)",List(whole milk)
37,List(canned beer),List()
44,"List(butter milk, yogurt, cream cheese, spread cheese)","List(citrus fruit, rolls/buns, other vegetables, tropical fruit, whole milk)"
46,"List(pastry, bottled water)",List(whole milk)
56,"List(packaged fruit/vegetables, brown bread, canned beer)",List()
59,"List(rolls/buns, pastry, sugar)","List(whole milk, other vegetables, soda, yogurt, sausage)"
60,"List(other vegetables, whole milk, frozen vegetables, canned fish)","List(citrus fruit, chicken, pip fruit, rolls/buns, frankfurter, tropical fruit, root vegetables, pork, hamburger meat, yogurt, sausage, beef, bottled water, whipped/sour cream, butter, curd, pastry)"
65,List(whole milk),"List(citrus fruit, chicken, other vegetables, pip fruit, rolls/buns, bottled water, whipped/sour cream, frankfurter, butter, tropical fruit, curd, root vegetables, pork, pastry, yogurt, sausage, beef)"


<h3> 1.9 References: </h3>

<p>
  <b>1.</b>  Dr. Liao’s Code Examples & Tutorials: Blackboard/Liao_PySpark_basic_databricks.html
  <br>
  <b>2.</b> PySpark: https://spark.apache.org/docs/2.4.0/api/python/pyspark.html  
  <br>
  <b>3.</b> Frequent Pattern Mining : https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html
</p>