## FP-Growth 

In [1]:
from pyspark.ml.fpm import FPGrowth

# Toy market-basket data: one (id, items) pair per transaction.
transactions = [
    (0, ["Apple", "Orange", "Banana"]),
    (1, ["Apple", "Orange"]),
    (2, ["Butter", "Banana"]),
    (3, ["Apple", "Peach", "Orange", "Banana"]),
    (4, ["Peach", "Orange"]),
    (5, ["Banana", "Apple"]),
    (6, ["Orange"]),
    (7, ["Apple", "Orange"]),
]

# Column names are given explicitly; the `items` column is what FP-Growth mines later.
df = spark.createDataFrame(transactions, schema=["id", "items"])
df.show()

+---+--------------------+
| id|               items|
+---+--------------------+
|  0|[Apple, Orange, B...|
|  1|     [Apple, Orange]|
|  2|    [Butter, Banana]|
|  3|[Apple, Peach, Or...|
|  4|     [Peach, Orange]|
|  5|     [Banana, Apple]|
|  6|            [Orange]|
|  7|     [Apple, Orange]|
+---+--------------------+



### Frequent Itemsets

In [2]:
# Configure the miner on the `items` column: minSupport is the minimum fraction of
# rows an itemset must appear in, minConfidence the minimum confidence for a rule.
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=0.2,
    minConfidence=0.6,
)
model = fpGrowth.fit(df)

# Show every frequent itemset together with its occurrence count (`freq`).
model.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|            [Banana]|   4|
|     [Banana, Apple]|   3|
|[Banana, Apple, O...|   2|
|    [Banana, Orange]|   2|
|            [Orange]|   6|
|             [Apple]|   5|
|     [Apple, Orange]|   4|
|             [Peach]|   2|
|     [Peach, Orange]|   2|
+--------------------+----+



### Association Rules

In [3]:
# Show the rules the fitted model derived: antecedent, consequent, confidence and lift.
association_rules = model.associationRules
association_rules.show()

+----------------+----------+------------------+------------------+
|      antecedent|consequent|        confidence|              lift|
+----------------+----------+------------------+------------------+
|        [Orange]|   [Apple]|0.6666666666666666|1.0666666666666667|
|         [Apple]|  [Banana]|               0.6|               1.2|
|         [Apple]|  [Orange]|               0.8|1.0666666666666667|
| [Banana, Apple]|  [Orange]|0.6666666666666666|0.8888888888888888|
|         [Peach]|  [Orange]|               1.0|1.3333333333333333|
|[Banana, Orange]|   [Apple]|               1.0|               1.6|
|        [Banana]|   [Apple]|              0.75|               1.2|
+----------------+----------+------------------+------------------+



### Predictions

In [4]:
# transform() checks each row's items against all the association rules and
# summarizes the consequents of the matching rules in a new `prediction` column.
predictions = model.transform(df)
predictions.show()

+---+--------------------+----------+
| id|               items|prediction|
+---+--------------------+----------+
|  0|[Apple, Orange, B...|        []|
|  1|     [Apple, Orange]|  [Banana]|
|  2|    [Butter, Banana]|   [Apple]|
|  3|[Apple, Peach, Or...|        []|
|  4|     [Peach, Orange]|   [Apple]|
|  5|     [Banana, Apple]|  [Orange]|
|  6|            [Orange]|   [Apple]|
|  7|     [Apple, Orange]|  [Banana]|
+---+--------------------+----------+



## PrefixSpan

In [5]:
from pyspark.mllib.fpm import PrefixSpan

# Four example sequences; each sequence is an ordered list of itemsets.
data = [
    [[1], [1, 2, 3], [1, 3], [4], [3, 6]],
    [[1, 4], [3], [2, 3], [1, 5]],
    [[5, 6], [1, 2], [4, 6], [3], [2]],
    [[5], [7], [1, 6], [3], [2], [3]],
]

# Take the SparkContext from the `spark` session used by the earlier cells instead of
# relying on a separate `sc` global; the RDD is split into 3 partitions as before.
rdd = spark.sparkContext.parallelize(data, 3)

# Bound to its own name so the FP-Growth `model` fitted earlier is not overwritten
# (re-running the association-rule or prediction cells would otherwise fail).
# minSupport=0.7 over 4 sequences keeps patterns found in at least 3 of them;
# maxPatternLength=3 caps the length of the reported patterns.
prefix_span_model = PrefixSpan.train(rdd, minSupport=0.7, maxPatternLength=3)
sorted(prefix_span_model.freqSequences().collect())

[FreqSequence(sequence=[[1]], freq=4),
 FreqSequence(sequence=[[1], [2]], freq=4),
 FreqSequence(sequence=[[1], [3]], freq=4),
 FreqSequence(sequence=[[1], [3], [2]], freq=3),
 FreqSequence(sequence=[[1], [3], [3]], freq=3),
 FreqSequence(sequence=[[2]], freq=4),
 FreqSequence(sequence=[[2], [3]], freq=3),
 FreqSequence(sequence=[[3]], freq=4),
 FreqSequence(sequence=[[3], [2]], freq=3),
 FreqSequence(sequence=[[3], [3]], freq=3),
 FreqSequence(sequence=[[4]], freq=3),
 FreqSequence(sequence=[[4], [3]], freq=3),
 FreqSequence(sequence=[[5]], freq=3),
 FreqSequence(sequence=[[6]], freq=3)]