In [1]:
import pyspark
conf = pyspark.SparkConf().setAppName("FpGrowth")
sc = pyspark.SparkContext(conf=conf)
sc

In [2]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

df = sqlContext.read.format('com.databricks.spark.csv').options(header='true',
                                                                inferschema='true').load('transactional_tshirt.csv')
df.show()

+--------------+------------+
|id_transaction|product_name|
+--------------+------------+
|             0|     T-shirt|
|             0|    Trousers|
|             0|        Belt|
|             1|     T-shirt|
|             1|      Jacket|
|             2|      Jacket|
|             2|      Gloves|
|             3|     T-shirt|
|             3|    Trousers|
|             3|        Belt|
|             4|     T-shirt|
|             4|    Trousers|
|             4|    Sneakers|
|             4|      Jacket|
|             4|        Belt|
|             5|      Jacket|
|             5|        Belt|
|             6|    Trousers|
|             6|    Sneakers|
|             6|        Belt|
+--------------+------------+
only showing top 20 rows



In [3]:
# Organize the data by shopping basket
from pyspark.sql.functions import collect_set, col, count
baskets = df.groupBy('id_transaction').agg(collect_set('product_name').alias('items'))
baskets.show(truncate=False)

+--------------+-------------------------------------------+
|id_transaction|items                                      |
+--------------+-------------------------------------------+
|1             |[T-shirt, Jacket]                          |
|6             |[Sneakers, Belt, Trousers]                 |
|3             |[Belt, T-shirt, Trousers]                  |
|5             |[Belt, Jacket]                             |
|4             |[Sneakers, Belt, T-shirt, Trousers, Jacket]|
|8             |[Belt, T-shirt, Trousers, Jacket]          |
|7             |[Sneakers, Belt, Trousers]                 |
|2             |[Gloves, Jacket]                           |
|0             |[Belt, T-shirt, Trousers]                  |
+--------------+-------------------------------------------+



In [4]:
from pyspark.ml.fpm import FPGrowth
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.4, minConfidence=0.6)
model = fpGrowth.fit(baskets)

In [5]:
# Exibe os itensfrequentes
model.freqItemsets.show()

# Exibe as association rules geradas.
model.associationRules.show()

# transforma e examina os antecedentes, consequentes, confidence e lift.
model.transform(baskets).show()


+--------------------+----+
|               items|freq|
+--------------------+----+
|              [Belt]|   7|
|          [Trousers]|   6|
|    [Trousers, Belt]|   6|
|           [T-shirt]|   5|
| [T-shirt, Trousers]|   4|
|[T-shirt, Trouser...|   4|
|     [T-shirt, Belt]|   4|
|            [Jacket]|   5|
+--------------------+----+

+-------------------+----------+------------------+------------------+
|         antecedent|consequent|        confidence|              lift|
+-------------------+----------+------------------+------------------+
|    [T-shirt, Belt]|[Trousers]|               1.0|               1.5|
|[T-shirt, Trousers]|    [Belt]|               1.0|1.2857142857142856|
|         [Trousers]|    [Belt]|               1.0|1.2857142857142856|
|         [Trousers]| [T-shirt]|0.6666666666666666|               1.2|
|   [Trousers, Belt]| [T-shirt]|0.6666666666666666|               1.2|
|             [Belt]|[Trousers]|0.8571428571428571|1.2857142857142858|
|          [T-shirt]|[Tr

## References and other contents

Official Documentation Spark - Frequent Pattern Mining - Fp-Growth <br>
https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html#fp-growth<br>
How to set up PySpark for your Jupyter notebook<br>
https://opensource.com/article/18/11/pyspark-jupyter-notebook<br>
MLlib is Apache Spark's scalable machine learning library.<br>
https://spark.apache.org/mllib/<br>
Frequent Itemsets via the FP-Growth Algorithm (comp: Apriori mlxtend)<br>
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/#example-1-generating-frequent-itemsets<br>
An example demonstrating FPGrowth<br>
https://github.com/apache/spark/blob/master/examples/src/main/python/ml/fpgrowth_example.py<br>
Extracting, transforming and selecting features<br>
https://spark.apache.org/docs/latest/ml-features<br>
FPGrowth.train<br>
https://spark.apache.org/docs/2.1.0/mllib-frequent-pattern-mining.html<br>
Simplify Market Basket Analysis using FP-growth on Databricks:<br>
https://databricks.com/blog/2018/09/18/simplify-market-basket-analysis-using-fp-growth-on-databricks.html<br>
Market Basket Analysis on 3 million orders from Instacart using Spark.<br>
https://medium.com/analytics-vidhya/market-basket-analysis-on-3-million-orders-from-instacart-using-spark-24cc6469a92e
<br>
R-Apriori: An Efficient Apriori based Algorithm on Spark:<br>
https://iith.ac.in/~mkaul/papers/pikm09-rathee.pdf<br>
Optimize conversion between Apache Spark and pandas DataFrames<br>
https://docs.databricks.com/spark/latest/spark-sql/spark-pandas.html<br>
FP-growth algorithm (implementation sample and comments)<br>
http://qtdynamics.com/index.php/content/index/pid/276/cid/6533.html<br>


