# Association Rule

## Setting up PySpark in Colab

In [None]:
# Download the Spark 3.2.4 (Hadoop 3.2 build) tarball from the Apache archive; -q silences wget output.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
# Unpack the tarball into the current working directory.
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
# Install findspark, which the next cell imports.
!pip install -q findspark

In [None]:
import findspark
# Point findspark at the Spark distribution directory so that pyspark can be imported.
# NOTE(review): presumably this is where the tarball from the setup cell was unpacked
# (i.e. the working directory was /content) - confirm if running outside Colab.
findspark.init("/content/spark-3.2.4-bin-hadoop3.2")

In [None]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one; otherwise create one with default settings.
spark = SparkSession.builder.getOrCreate()

## Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive.
drive.mount('/gdrive')

In [None]:
# Data directory on the mounted Drive.
# NOTE(review): no cell visible in this notebook references gpath; the CSV files
# below are opened by relative path - confirm which location is intended.
gpath = '/gdrive/MyDrive/data/'

## FPGrowth

In [None]:
from pyspark.ml.fpm import FPGrowth

In [None]:
# Create a small example DataFrame: each row has an id and a list of item ids.
df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])
# Print the rows to check the contents.
df.show()

In [None]:
# Configure FP-Growth on the "items" column: an itemset must reach a support of
# at least 0.5 to count as frequent, and a rule a confidence of at least 0.6.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
# Fit the model on the example DataFrame.
model = fpGrowth.fit(df)

In [None]:
# Display the frequent itemsets found by the fitted model (minSupport=0.5 above).
model.freqItemsets.show()

In [None]:
# Display the association rules generated by the fitted model (minConfidence=0.6 above).
model.associationRules.show()

In [None]:
# transform() checks each row's items against all the association rules and
# summarizes the consequents of the applicable rules as the prediction.
model.transform(df).show()

## 실습 : market_basket.csv

In [None]:
import csv

# Load every row of the market-basket CSV (header row included) into a list of
# lists of strings.
# newline="" is what the csv module documents for file objects handed to
# csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open("association_rule_market_basket.csv", newline="") as f:
    raw_data = list(csv.reader(f))

In [None]:
# The first CSV row is the header; every following row is one basket.
header = raw_data[0]

# Convert each basket row into [basket_id, item_column_indices]:
#   - basket_id is the 1-based position of the row after the header,
#   - an item is present when its cell is not the empty string, and is
#     identified by its column index (look the name up via header[index]).
# Only baskets with more than 10 items are kept.
parsed_data = []
for basket_id, row in enumerate(raw_data[1:], start=1):
    item_ids = [col for col, cell in enumerate(row) if cell != ""]
    if len(item_ids) > 10:
        parsed_data.append([basket_id, item_ids])

# Number of baskets that passed the size filter.
len(parsed_data)

In [None]:
# Distribute the parsed [id, items] records as a Spark RDD.
data_rdd = spark.sparkContext.parallelize(parsed_data)

In [None]:
# Convert the RDD into a DataFrame with columns "id" and "items".
data_df = spark.createDataFrame(data_rdd, ["id", "items"])
# Fetch the first 2 rows to check the result.
data_df.take(2)

In [None]:
# Fit FP-Growth on the basket DataFrame: minimum support 0.1 for frequent
# itemsets, minimum confidence 0.7 for association rules.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.7)
model = fpGrowth.fit(data_df)

# Display frequent itemsets.
model.freqItemsets.show()

In [None]:
# read data
import csv

# Read the whole market-basket CSV (header row included) into a list of rows,
# each row being a list of strings.
# The file is opened with newline="" as the csv module documentation requires
# for csv.reader, so quoted fields containing newlines are handled correctly.
with open("association_rule_market_basket.csv", newline="") as f:
    raw_data = list(csv.reader(f))

In [None]:
# format data
# Row 0 of the CSV is the header row.
header = raw_data[0]

# For every basket row, collect the column indices whose cell is non-empty.
present_columns = (
    [column for column, cell in enumerate(row) if cell != ""]
    for row in raw_data[1:]
)

# Number the baskets from 1 (in file order) and keep only those with more
# than 10 items, as [basket_number, column_indices] pairs.
parsed_data = [
    [basket_number, columns]
    for basket_number, columns in enumerate(present_columns, start=1)
    if len(columns) > 10
]

# Number of baskets kept.
len(parsed_data)

In [None]:
# Inspect the first three [id, items] records.
parsed_data[:3]

In [None]:
# Convert the parsed records directly into a Spark DataFrame with columns "id" and "items".
data_df = spark.createDataFrame(parsed_data, schema=["id", "items"])
# Print the first 3 rows to check the result.
data_df.show(3)

In [None]:
# Build the model: FP-Growth on the "items" column with minimum support 0.1
# for frequent itemsets and minimum confidence 0.7 for association rules.
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.7)
model = fpGrowth.fit(data_df)

In [None]:
# Display frequent itemsets, most frequent first.
model.freqItemsets.sort("freq", ascending=False).show()

In [None]:
# Items are stored as CSV column indices, so look up the names of items 141
# and 132 in the header row.
print(header[141])
print(header[132])

In [None]:
# Display generated association rules, highest confidence first.
model.associationRules.sort("confidence", ascending=False).show()

In [None]:
# transform() checks each basket's items against all the association rules and
# summarizes the consequents of the applicable rules as the prediction.
model.transform(data_df).show()

## Apriori Algorithm (using pandas and mlxtend)

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Read the online retail CSV into a pandas DataFrame.
# Note: this rebinds df, which earlier held the Spark example DataFrame.
df = pd.read_csv("Online Retail.csv")
# Preview the first rows.
df.head()

In [None]:
# format data

# Some of the descriptions have leading/trailing spaces that need to be removed
df["Description"] = df["Description"].str.strip()

# Drop the rows that don't have invoice numbers (modifies df in place)
df.dropna(axis=0, subset=["InvoiceNo"], inplace=True)

# Cast InvoiceNo to string so the .str accessor can be used on it below
df["InvoiceNo"] = df["InvoiceNo"].astype("str")

# Remove the credit transactions (those with invoice numbers containing "C")
df = df[~df["InvoiceNo"].str.contains("C")]

In [None]:
# Display the cleaned DataFrame.
df

In [None]:
# Build one row per invoice and one column per product description, where each
# cell holds the total quantity of that product on that invoice (0 if absent).
# To keep the data set small, only sales for France are used.
france_sales = df[df["Country"] == "France"]
quantity_per_item = france_sales.groupby(["InvoiceNo", "Description"])["Quantity"].sum()
basket = (
    quantity_per_item.unstack()
    .reset_index()
    .fillna(0)
    .set_index("InvoiceNo")
)

In [None]:
# Display the invoice-by-product quantity table.
basket

In [None]:
def encode_units(x):
    """Binarise a purchased quantity for one-hot basket encoding.

    Args:
        x: Quantity of one product on one invoice (any real number).

    Returns:
        1 if ``x`` is positive, otherwise 0. Zero, negative and NaN
        quantities all map to 0.
    """
    # A single comparison guarantees an int result for every input, including
    # fractional quantities between 0 and 1 and NaN (NaN > 0 is False).
    return 1 if x > 0 else 0

# One-hot encode the quantity table cell by cell: quantities of 1 or more
# become 1, and zero or negative quantities become 0.
# NOTE(review): DataFrame.applymap is deprecated in favour of DataFrame.map in
# recent pandas releases (2.1+) - switch once the minimum pandas version allows.
encoded = basket.applymap(encode_units)

# POSTAGE is not a product, so leave it out of the item columns.
basket_sets = encoded.drop(columns="POSTAGE")

In [None]:
# Display the one-hot encoded basket table.
basket_sets

In [None]:
# Mine frequent itemsets with Apriori, keeping those with support of at least 0.07.
# use_colnames=True reports itemsets by column (product) name instead of column index.
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)

In [None]:
# Display the frequent itemsets and their support.
frequent_itemsets

In [None]:
# Derive association rules from the frequent itemsets, keeping rules whose lift is at least 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Preview the first rules.
rules.head()

In [None]:
# Keep only the strong rules: lift of at least 6 and confidence of at least 0.8.
rules[(rules["lift"] >= 6) & (rules["confidence"] >= 0.8)]

In [None]:
# Total quantity of this product across the French invoices
# (taken from the quantity table, not the one-hot encoded one).
basket["SET/6 RED SPOTTY PAPER PLATES"].sum()