### Description of dataset
* Dataset's name: [Instacart Market Basket Analysis](https://www.kaggle.com/c/instacart-market-basket-analysis)
* Description: Which products will an Instacart consumer purchase again?

### 1. Prepare dataset

In [1]:
# Initialise findspark before any `pyspark` import in the cells below.
# NOTE(review): presumably this locates the local Spark install (e.g. via SPARK_HOME)
# and makes `pyspark` importable — confirm for your environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
# Build (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("Frequent Itemsets")
    .getOrCreate()
)
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000016E68243A20>


In [3]:
# Load the product catalogue, keeping only the id and name columns.
# NOTE(review): the path below is an absolute, machine-specific location.
products = spark.read.csv(
    "D://Documents//Semester_8//Apache-Spark//instacart-market-basket-analysis//products.csv",
    header=True,
).select("product_id", "product_name")

# truncate=False prints the full product names instead of cutting them off.
products.show(n=20, truncate=False)
products.printSchema()

# Register the DataFrame under a name so it can be queried through spark.sql.
products.createOrReplaceTempView("products")

+----------+-----------------------------------------------------------------+
|product_id|product_name                                                     |
+----------+-----------------------------------------------------------------+
|1         |Chocolate Sandwich Cookies                                       |
|2         |All-Seasons Salt                                                 |
|3         |Robust Golden Unsweetened Oolong Tea                             |
|4         |Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce|
|5         |Green Chile Anytime Sauce                                        |
|6         |Dry Nose Oil                                                     |
|7         |Pure Coconut Water With Orange                                   |
|8         |Cut Russet Potatoes Steam N' Mash                                |
|9         |Light Strawberry Blueberry Yogurt                                |
|10        |Sparkling Orange Juice & Prickly Pear Be

In [4]:
# Load the prior-order line items, keeping only the order id and product id columns.
# NOTE(review): the path below is an absolute, machine-specific location.
transactions = spark.read.csv(
    "D://Documents//Semester_8//Apache-Spark//instacart-market-basket-analysis//order_products__prior.csv",
    header=True,
).select("order_id", "product_id")

transactions.show(n=20, truncate=True)
transactions.printSchema()

# Register the DataFrame under a name so it can be queried through spark.sql.
transactions.createOrReplaceTempView("transactions")

+--------+----------+
|order_id|product_id|
+--------+----------+
|       2|     33120|
|       2|     28985|
|       2|      9327|
|       2|     45918|
|       2|     30035|
|       2|     17794|
|       2|     40141|
|       2|      1819|
|       2|     43668|
|       3|     33754|
|       3|     24838|
|       3|     17704|
|       3|     21903|
|       3|     17668|
|       3|     46667|
|       3|     17461|
|       3|     32665|
|       4|     46842|
|       4|     26434|
|       4|     39758|
+--------+----------+
only showing top 20 rows

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)



In [5]:
# Spot-check a single order by querying the registered `transactions` view.
spark.sql(
    "select * from transactions where order_id=='100010'"
).show(n=20, truncate=True)

+--------+----------+
|order_id|product_id|
+--------+----------+
|  100010|     16714|
|  100010|     11266|
|  100010|       196|
|  100010|     19660|
+--------+----------+



In [6]:
# Group the line items into one basket (array of product ids) per order.
from pyspark.sql import functions

# collect_set (rather than collect_list) guarantees each product id appears at
# most once per basket; FPGrowth.fit fails on a transaction containing
# duplicate items, so de-duplicating here keeps the fit robust.
dataset = (
    transactions
    .groupby("order_id")
    .agg(functions.collect_set("product_id").alias("items"))
)
dataset.show()

+--------+--------------------+
|order_id|               items|
+--------+--------------------+
|  100010|[16714, 11266, 19...|
| 1000240|[21616, 24799, 28...|
| 1000280|[46676, 35842, 56...|
| 1000665|[24838, 37067, 11...|
| 1000795|[37335, 24852, 12...|
| 1000839|[41950, 35496, 45...|
| 1000888|[26209, 44910, 33...|
|  100140|[6069, 42585, 15892]|
| 1001866|[16349, 25931, 13...|
| 1002011|[18127, 19348, 21...|
|  100227|[47626, 16797, 47...|
| 1002442|[27966, 5240, 352...|
|  100263|[27744, 24154, 24...|
| 1002783|             [46041]|
| 1002883|[35633, 13176, 82...|
| 1002887|[26831, 22395, 48...|
|  100320|[7781, 12614, 494...|
| 1003202|[25230, 32839, 49...|
| 1003366|[28132, 23579, 49...|
| 1003397|[21938, 24184, 44...|
+--------+--------------------+
only showing top 20 rows



### 2. Frequent Pattern Mining
#### FP-Growth 1

In [7]:
# Frequent pattern mining, run 1: minSupport=0.05, minConfidence=0.05.
# FPGrowth is already imported in the session-setup cell at the top of the
# notebook, so it is not re-imported here.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.05)
model = fpGrowth.fit(dataset)

# Display frequent itemsets
model.freqItemsets.show()

+-------+------+
|  items|  freq|
+-------+------+
|[24852]|472565|
|[13176]|379450|
|[21137]|264683|
|[21903]|241921|
|[47209]|213584|
|[47766]|176815|
+-------+------+



In [8]:
# Show the association rules produced by the first model.
model.associationRules.show(n=20, truncate=True)

+----------+----------+----------+----+
|antecedent|consequent|confidence|lift|
+----------+----------+----------+----+
+----------+----------+----------+----+



In [9]:
# transform checks each basket against all association rules and collects the
# matching consequents into the `prediction` column.
model.transform(dataset).show(n=20, truncate=True)

+--------+--------------------+----------+
|order_id|               items|prediction|
+--------+--------------------+----------+
|  100010|[16714, 11266, 19...|        []|
| 1000240|[21616, 24799, 28...|        []|
| 1000280|[46676, 35842, 56...|        []|
| 1000665|[24838, 37067, 11...|        []|
| 1000795|[37335, 24852, 12...|        []|
| 1000839|[41950, 35496, 45...|        []|
| 1000888|[26209, 44910, 33...|        []|
|  100140|[6069, 42585, 15892]|        []|
| 1001866|[16349, 25931, 13...|        []|
| 1002011|[18127, 19348, 21...|        []|
|  100227|[47626, 16797, 47...|        []|
| 1002442|[27966, 5240, 352...|        []|
|  100263|[27744, 24154, 24...|        []|
| 1002783|             [46041]|        []|
| 1002883|[35633, 13176, 82...|        []|
| 1002887|[26831, 22395, 48...|        []|
|  100320|[7781, 12614, 494...|        []|
| 1003202|[25230, 32839, 49...|        []|
| 1003366|[28132, 23579, 49...|        []|
| 1003397|[21938, 24184, 44...|        []|
+--------+-

#### FP-Growth 2

In [10]:
# Run 2: fit with minSupport=0.005 and minConfidence=0.01.
fpGrowth1 = FPGrowth(
    itemsCol="items",
    minSupport=0.005,
    minConfidence=0.01,
)
model1 = fpGrowth1.fit(dataset)

# Display frequent itemsets
model1.freqItemsets.show(n=20, truncate=True)

+--------------+------+
|         items|  freq|
+--------------+------+
|       [31683]| 19345|
|       [24852]|472565|
|       [13176]|379450|
|       [26940]| 19286|
|       [21137]|264683|
|[21137, 13176]| 61628|
|[21137, 24852]| 56156|
|       [28465]| 19257|
|       [21903]|241921|
|[21903, 21137]| 38134|
|[21903, 13176]| 50372|
|[21903, 24852]| 51395|
|       [35108]| 19163|
|       [47209]|213584|
|[47209, 21137]| 40794|
|[47209, 13176]| 62341|
|[47209, 21903]| 34901|
|[47209, 24852]| 31222|
|       [26283]| 18906|
|       [47766]|176815|
+--------------+------+
only showing top 20 rows



In [11]:
# Show the association rules produced by the second model.
model1.associationRules.show(n=20, truncate=True)

+----------+----------+-------------------+------------------+
|antecedent|consequent|         confidence|              lift|
+----------+----------+-------------------+------------------+
|   [21137]|   [13176]|0.23283701635541385|1.9727017264951765|
|   [21137]|   [24852]|0.21216322922137046|1.4433528707793088|
|   [21137]|   [21903]| 0.1440742321947386|1.9145940333944889|
|   [21137]|   [47209]|0.15412398982934303|2.3198798022259126|
|   [21137]|   [47766]|0.09061405530389183|1.6475568839241237|
|   [21137]|   [47626]|0.06160199181662593|1.2973046885467647|
|   [21137]|   [26209]|0.07178020499994332|1.6409673445994568|
|   [21137]|   [27845]|0.08996799945595298|2.0973552973638183|
|   [21137]|   [27966]|0.12793794841376288|3.0009732007029744|
|   [21137]|   [22935]|0.06290921592999928|1.7830585813988016|
|   [21137]|   [39275]|0.08975264750664001|2.8837043064187666|
|   [21137]|   [30391]|0.06254651791010378| 2.501233632945156|
|   [47766]|   [21137]|0.13564460028843706|1.6475568839

In [12]:
# transform checks each basket against all association rules and collects the
# matching consequents into the `prediction` column.
model1.transform(dataset).show(n=20, truncate=True)

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|  100010|[16714, 11266, 19...|                  []|
| 1000240|[21616, 24799, 28...|[21137, 47209, 47...|
| 1000280|[46676, 35842, 56...|[21137, 13176, 21...|
| 1000665|[24838, 37067, 11...|      [24852, 26209]|
| 1000795|[37335, 24852, 12...|[13176, 21137, 21...|
| 1000839|[41950, 35496, 45...|                  []|
| 1000888|[26209, 44910, 33...|[21137, 47766, 47...|
|  100140|[6069, 42585, 15892]|                  []|
| 1001866|[16349, 25931, 13...|                  []|
| 1002011|[18127, 19348, 21...|[21137, 13176, 24...|
|  100227|[47626, 16797, 47...|[22935, 47209, 13...|
| 1002442|[27966, 5240, 352...|[21137, 47766, 47...|
|  100263|[27744, 24154, 24...|[21137, 21903, 47...|
| 1002783|             [46041]|                  []|
| 1002883|[35633, 13176, 82...|[24852, 21903, 47...|
| 1002887|[26831, 22395, 48...|               

#### FP-Growth 3

In [13]:
# Run 3: fit with minSupport=0.05 and minConfidence=0.1.
fpGrowth2 = FPGrowth(
    itemsCol="items",
    minSupport=0.05,
    minConfidence=0.1,
)
model2 = fpGrowth2.fit(dataset)

# Display frequent itemsets
model2.freqItemsets.show(n=20, truncate=True)

+-------+------+
|  items|  freq|
+-------+------+
|[24852]|472565|
|[13176]|379450|
|[21137]|264683|
|[21903]|241921|
|[47209]|213584|
|[47766]|176815|
+-------+------+



In [14]:
# Show the association rules produced by the third model.
model2.associationRules.show(n=20, truncate=True)

+----------+----------+----------+----+
|antecedent|consequent|confidence|lift|
+----------+----------+----------+----+
+----------+----------+----------+----+



In [15]:
# transform checks each basket against all association rules and collects the
# matching consequents into the `prediction` column.
model2.transform(dataset).show(n=20, truncate=True)

+--------+--------------------+----------+
|order_id|               items|prediction|
+--------+--------------------+----------+
|  100010|[16714, 11266, 19...|        []|
| 1000240|[21616, 24799, 28...|        []|
| 1000280|[46676, 35842, 56...|        []|
| 1000665|[24838, 37067, 11...|        []|
| 1000795|[37335, 24852, 12...|        []|
| 1000839|[41950, 35496, 45...|        []|
| 1000888|[26209, 44910, 33...|        []|
|  100140|[6069, 42585, 15892]|        []|
| 1001866|[16349, 25931, 13...|        []|
| 1002011|[18127, 19348, 21...|        []|
|  100227|[47626, 16797, 47...|        []|
| 1002442|[27966, 5240, 352...|        []|
|  100263|[27744, 24154, 24...|        []|
| 1002783|             [46041]|        []|
| 1002883|[35633, 13176, 82...|        []|
| 1002887|[26831, 22395, 48...|        []|
|  100320|[7781, 12614, 494...|        []|
| 1003202|[25230, 32839, 49...|        []|
| 1003366|[28132, 23579, 49...|        []|
| 1003397|[21938, 24184, 44...|        []|
+--------+-

### Example recommendation output

In [16]:
# Load the sample orders and assemble one basket (array of products) per order.
# NOTE(review): the path below is an absolute, machine-specific location.
df = spark.read.csv(
    "D://Documents//Semester_8//Apache-Spark//instacart-market-basket-analysis//sample_submission.csv",
    header=True,
)
df_sample = (
    df.groupby("order_id")
    .agg(functions.collect_list("products").alias("items"))
)
df_sample.show(n=20, truncate=True)
df.printSchema()

+--------+--------------------+
|order_id|               items|
+--------+--------------------+
|       3|             [28465]|
|       5|[35633, 35842, 35...|
|       1|[6069, 13176, 32839]|
|       4|             [47766]|
|       2|      [21137, 13176]|
+--------+--------------------+

root
 |-- order_id: string (nullable = true)
 |-- products: string (nullable = true)



In [17]:
# Use model1 to predict items for the sample baskets.
model1.transform(df_sample).show(n=20, truncate=True)

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|       3|             [28465]|                  []|
|       5|[35633, 35842, 35...|                  []|
|       1|[6069, 13176, 32839]|[21137, 21903, 47...|
|       4|             [47766]|[21137, 13176, 21...|
|       2|      [21137, 13176]|[24852, 21903, 47...|
+--------+--------------------+--------------------+



In [18]:
# Use model2 to predict items for the sample baskets.
model2.transform(df_sample).show(n=20, truncate=True)

+--------+--------------------+----------+
|order_id|               items|prediction|
+--------+--------------------+----------+
|       3|             [28465]|        []|
|       5|[35633, 35842, 35...|        []|
|       1|[6069, 13176, 32839]|        []|
|       4|             [47766]|        []|
|       2|      [21137, 13176]|        []|
+--------+--------------------+----------+



In [19]:
# Use the first model to predict items for the sample baskets.
model.transform(df_sample).show(n=20, truncate=True)

+--------+--------------------+----------+
|order_id|               items|prediction|
+--------+--------------------+----------+
|       3|             [28465]|        []|
|       5|[35633, 35842, 35...|        []|
|       1|[6069, 13176, 32839]|        []|
|       4|             [47766]|        []|
|       2|      [21137, 13176]|        []|
+--------+--------------------+----------+

