In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the raw transaction CSV (the path is relative to the notebook's working directory)
df = pd.read_csv("../input/online-retail-ii-uci/online_retail_II.csv")

In [None]:
# Display first 5 rows/transactions
df.head(5)

In [None]:
# Structural overview: column names, dtypes and non-null counts
df.info()

# Data cleaning

In [None]:
# checking for duplicate transactions
df.duplicated().sum()

In [None]:
print("Number of transactions before duplicates removal : %d " % df.shape[0])
# Keep the first occurrence of every fully identical row. drop_duplicates
# filters by position, so it does not rely on the index labels being unique
# (dropping by index label would also remove unrelated rows sharing a label).
df = df.drop_duplicates()
print("Number of transactions after duplicates removal  : %d " % df.shape[0])

In [None]:
# Preview the last few cancelled transactions, i.e. rows whose invoice
# number (read as text) begins with the letter 'C'
is_cancelled = df['Invoice'].astype(str).str.startswith('C')
df[is_cancelled].tail()

In [None]:
print("Number of transactions before dropping the cancelled transactions : %d " % df.shape[0])
# Remove every row whose invoice number (read as text) begins with 'C'
cancelled_mask = df['Invoice'].astype(str).str.startswith('C')
df = df.drop(index=df.loc[cancelled_mask].index)
print("Number of transactions after dropping the cancelled transactions  : %d " % df.shape[0])

In [None]:
# Checking for missing values
df.isnull().sum()

In [None]:
# Discard the rows that have no product description
missing_description = df['Description'].isna()
df = df.drop(index=df.loc[missing_description].index)
# Re-count the missing values per column to confirm 'Description' is now complete
df.isna().sum()

In [None]:
# Drop transactions whose quantity is zero or negative
non_positive_quantity = df['Quantity'].le(0)
df = df.drop(index=df.loc[non_positive_quantity].index)

In [None]:
# Summary stats for feature 'Country'
df['Country'].describe()

In [None]:
# transactions count by country
df['Country'].value_counts()

# We will analyse transactions from 'Japan' 

In [None]:
# Restrict the analysis to the transactions of a single country
country = 'Japan'
in_country = df['Country'].eq(country)
df_country = df.loc[in_country]

In [None]:
# Distinct invoices and distinct product descriptions (missing values are not counted)
print("Number of unique invoices : %d " % df_country['Invoice'].nunique())
print("Number of unique products : %d " % df_country['Description'].nunique())

In [None]:
# Total quantity sold of each product within each invoice
per_invoice_product = df_country.groupby(['Invoice', 'Description'])
freq = per_invoice_product['Quantity'].sum()
freq.head()

In [None]:
# Pivot to an invoice x product matrix; a product absent from an invoice gets quantity 0
quantity_matrix = freq.unstack().fillna(0)
prod_freq = quantity_matrix.reset_index().set_index('Invoice')
prod_freq.head(33)

In [None]:
# Binarise the matrix: 1 where the quantity is positive, 0 for anything else.
# A vectorised comparison replaces DataFrame.applymap, which is deprecated in
# recent pandas and calls a Python lambda once per cell.
product_set = (prod_freq > 0).astype(int)
product_set.head(33)

In [None]:
# item dictionary = {Description: Ordinal}
# Columns are numbered from 1 in their existing order: 'item_1', 'item_2', ...
item_dict = {
    description: 'item_' + str(position)
    for position, description in enumerate(product_set.keys(), start=1)
}
pd.DataFrame.from_dict(item_dict, orient='index').rename(columns={0:'Ordinal'}).head(300)

In [None]:
# Replace the long product descriptions in the column labels by their short 'item_N' ids
product_simple_set = product_set.rename(columns=item_dict)
product_simple_set.head(33)

In [None]:
# One row per invoice, so the row count is the number of transactions
total = len(product_simple_set)
print ("Total numer of transactions: %d" % total)

# Frequent Productsets via Apriori Algorithm

## Provide Mini-support = 10%

In [None]:
# Minimum support threshold: an itemset is frequent when it occurs in at least 10% of transactions
mini_sup = 0.1
# '%.2f' prints two decimals; the previous '%2f' only set a minimum width and printed 0.100000
print("Mini Support : %.2f " % mini_sup)

In [None]:
# Generate C1: every single item is a candidate 1-itemset whose support is
# the share of transactions that contain it
C1_dict = product_simple_set.sum(axis=0).to_dict()
C1_tmp = {frozenset([item]): count / total for item, count in C1_dict.items()}
C1 = pd.DataFrame.from_dict(C1_tmp, orient='index').rename(columns={0:'Support'})
print("Total number of candidate 1-itemsets  : %d " % C1.shape[0])
C1.head(300)

In [None]:
# Generate L1: keep the candidate 1-itemsets whose support reaches the minimum support
is_frequent = C1['Support'] >= mini_sup
L1 = C1.loc[is_frequent]
print("Total number of frequent 1-itemsets  : %d " % L1.shape[0])
L1.head(15)

In [None]:
# L starts as an independent copy of L1; later cells add L2 and L3 to it
L = L1.copy()

In [None]:
# Decide whether a candidate itemset must be pruned
def prun(candidates, l):
    """Return True when the candidate itemset should be pruned.

    Parameters
    ----------
    candidates : frozenset
        The candidate itemset under test.
    l : container of frozenset
        The itemsets known to be frequent one level below.

    Returns
    -------
    bool
        True if removing any single item from ``candidates`` yields a subset
        that is not in ``l``; False when every such subset is present
        (and also for an empty candidate, which has no subsets to check).
    """
    return any(
        candidates - frozenset([dropped]) not in l
        for dropped in candidates
    )

In [None]:
# Calculate Support
def calc_support(candidates):
    """Return the support of an itemset.

    The support is the fraction of rows of the notebook-level
    ``product_simple_set`` in which every item of ``candidates`` has a value
    greater than 0, divided by the notebook-level ``total``.

    Parameters
    ----------
    candidates : iterable of column labels
        The items (columns of ``product_simple_set``) forming the itemset.

    Returns
    -------
    float
        Matching row count divided by ``total``. An empty itemset matches
        every row.
    """
    # A boolean mask replaces the string-built ``query`` expression: nothing is
    # parsed per candidate, and it works for any column label rather than only
    # labels that happen to be valid identifiers.
    contains_all = (product_simple_set[list(candidates)] > 0).all(axis=1)
    return int(contains_all.sum()) / total
    

In [None]:
# Join
def generate_ck(Lksub1, k):
    """Build the candidate k-itemsets from the frequent (k-1)-itemsets.

    Two itemsets are joined when their sorted forms share the first ``k-2``
    items. The union is kept unless ``prun`` rejects it, and its support is
    computed with ``calc_support``.

    Parameters
    ----------
    Lksub1 : pandas.DataFrame
        Frame whose index holds the frequent (k-1)-itemsets as frozensets.
    k : int
        Size of the itemsets to generate.

    Returns
    -------
    pandas.DataFrame
        Indexed by candidate itemset with a single 'Support' column. The
        column is present even when no candidate is produced, so callers can
        always filter on ``Ck['Support']``.
    """
    itemsets = list(Lksub1.index)
    # Sort each itemset once instead of on every inner-loop iteration.
    prefixes = [sorted(itemset)[0:k-2] for itemset in itemsets]

    Ck = {}
    for i in range(len(itemsets)):
        # Start at i + 1: the union is symmetric, so visiting (j, i) again or
        # pairing an itemset with itself only repeated the support computation.
        for j in range(i + 1, len(itemsets)):
            if prefixes[i] != prefixes[j]:
                continue
            new_candidates = itemsets[i] | itemsets[j]
            if new_candidates in Ck:
                continue
            if not prun(new_candidates, itemsets):
                Ck[new_candidates] = calc_support(new_candidates)

    if not Ck:
        # from_dict on an empty dict yields a frame without any column.
        return pd.DataFrame({'Support': pd.Series(dtype=float)})
    return pd.DataFrame.from_dict(Ck, orient='index').rename(columns={0:'Support'})

In [None]:
# Generate C2 by L1
C2 = generate_ck(L1, 2)
print("Total number of candidate 2-itemsets  : %d " % C2.shape[0])
C2.head(105)

In [None]:
# Generate L2 by C2
L2 = C2[C2['Support']>=mini_sup]
print("Total number of frequent 2-itemset  : %d " % L2.shape[0])
L2.head(7)

In [None]:
# Add the frequent 2-itemsets to L (DataFrame.append was removed in pandas 2.0)
L = pd.concat([L, L2])

In [None]:
# Generate C3 by L2
C3 = generate_ck(L2, 3)
print("Total number of candidate 3-itemsets  : %d " % C3.shape[0])
C3.head()

In [None]:
# Generate L3 by C3
L3 = C3[C3['Support']>=mini_sup]
print("Total number of frequent 3-itemsets  : %d " % L3.shape[0])
L3.head()

In [None]:
# Add the frequent 3-itemsets to L (DataFrame.append was removed in pandas 2.0)
L = pd.concat([L, L3])

In [None]:
# Generate C4 by L3
C4 = generate_ck(L3, 4)
print("Total number of candidate 4-itemsets : %d " % C4.shape[0])

## Define My Apriori Function

In [None]:
def generate_C1(data_frame):
    """Build the candidate 1-itemsets of a binary transaction matrix.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        One row per transaction, one column per item; the column sums are
        used as the item occurrence counts.

    Returns
    -------
    (int, pandas.DataFrame)
        The number of rows of ``data_frame`` and a frame indexed by
        single-item frozensets with a 'Support' column (column sum divided by
        the row count).
    """
    total = data_frame.shape[0]
    # Count on the frame that was passed in; the previous version ignored the
    # argument and always read the notebook-level product_simple_set.
    C1_dict = data_frame.sum(axis=0).to_dict()
    C1_tmp = {frozenset([item]): count / total for item, count in C1_dict.items()}
    C1 = pd.DataFrame.from_dict(C1_tmp, orient='index').rename(columns={0:'Support'})
    print("Total number of candidate 1-itemsets  : %d " % C1.shape[0])
    return total, C1

def generate_Lk (mini_sup, Ck):
    """Return the rows of ``Ck`` whose 'Support' is at least ``mini_sup``."""
    is_frequent = Ck['Support'].ge(mini_sup)
    return Ck.loc[is_frequent]

# Define my apriori functions
def my_apriori(mini_sup, data_frame, max_k):
    """Run the level-wise Apriori search up to itemsets of size ``max_k``.

    Parameters
    ----------
    mini_sup : float
        Minimum support an itemset needs to be kept as frequent.
    data_frame : pandas.DataFrame
        Transaction matrix handed to ``generate_C1``.
    max_k : int
        Largest itemset size to generate.

    Returns
    -------
    (list, list)
        ``C_ALL`` holds the candidate frames and ``L_ALL`` the frequent
        frames, one entry per level reached. The search stops early at the
        first level that yields no candidate.
    """
    total, C1 = generate_C1(data_frame)
    frequent_prev = generate_Lk(mini_sup, C1)
    print("Total number of frequent 1-itemsets  : %d " % frequent_prev.shape[0])

    C_ALL, L_ALL = [C1], [frequent_prev]
    level = 2
    while level <= max_k:
        candidates_k = generate_ck(frequent_prev, level)
        if candidates_k.shape[0] == 0:
            # Nothing left to join: larger itemsets cannot exist either.
            break
        print("Total number of candidate %d-itemsets  : %d " % (level, candidates_k.shape[0]))
        frequent_prev = generate_Lk(mini_sup, candidates_k)
        print("Total number of frequent %d-itemsets  : %d " % (level, frequent_prev.shape[0]))
        C_ALL.append(candidates_k)
        L_ALL.append(frequent_prev)
        level += 1
    return C_ALL, L_ALL

## Calculate different mini-support

In [None]:
# Test mini_sup = 18%
C_18, L_18 = my_apriori(0.18, product_simple_set, 5)

In [None]:
# Test mini_sup = 15%
C_15, L_15 = my_apriori(0.15, product_simple_set, 5)

In [None]:
# Test mini_sup = 10%
C_10, L_10 = my_apriori(0.1, product_simple_set, 5)

In [None]:
# Test mini_sup=8%
C_8, L_8 = my_apriori(0.08, product_simple_set, 5)

# Association Rules generation from Frequent Productsets


In [None]:
# Stack the frequent itemsets of every size found at mini-support 0.18 into one frame.
# pd.concat replaces the DataFrame.append loop (append was removed in pandas 2.0).
L18 = pd.concat(L_18)
print("There are %d frequent itemsets with mini-support=0.18" % L18.shape[0])
L18.head()

In [None]:
# Stack the frequent itemsets of every size found at mini-support 0.15 into one frame.
# pd.concat replaces the DataFrame.append loop (append was removed in pandas 2.0).
L15 = pd.concat(L_15)
print("There are %d frequent itemsets with mini-support=0.15" % L15.shape[0])
L15.head()

In [None]:
# Stack the frequent itemsets of every size found at mini-support 0.1 into one frame.
# pd.concat replaces the DataFrame.append loop (append was removed in pandas 2.0).
L10 = pd.concat(L_10)
print("There are %d frequent itemsets with mini-support=0.1" % L10.shape[0])
L10.head(24)

In [None]:
# Stack the frequent itemsets of every size found at mini-support 0.08 into one frame.
# pd.concat replaces the DataFrame.append loop (append was removed in pandas 2.0).
L8 = pd.concat(L_8)
print("There are %d frequent itemsets with mini-support=0.08" % L8.shape[0])
L8.head(803)

In [None]:
# Convert item to product
def convert_to_product(items):
    """Map item ids back to the keys of the notebook-level ``item_dict``.

    Returns a frozenset holding every key of ``item_dict`` whose value equals
    one of ``items``; ids that match no value are silently ignored.
    """
    return frozenset(
        product
        for product, item_id in item_dict.items()
        if any(item_id == item for item in items)
    )

In [None]:
# Calculate confidences
def calculate_confident (frequent_df):
    """Derive association rules and their confidence from frequent itemsets.

    The itemsets are visited in the order of ``frequent_df``. For every
    itemset ``P`` and every previously visited itemset ``S`` that is a subset
    of ``P``, the rule ``(P - S) -> S`` is emitted with confidence
    ``support(P) / support(P - S)``. Item ids are translated with
    ``convert_to_product``.

    Parameters
    ----------
    frequent_df : pandas.DataFrame
        Indexed by frozenset itemsets with a 'Support' column. ``P - S`` must
        itself be present in the index, otherwise a KeyError is raised.

    Returns
    -------
    (int, pandas.DataFrame)
        The number of rules and a frame with the columns 'Occurrence'
        (antecedent), 'Co-Occurrence' (consequent) and 'Confident'.
    """
    frequent_products = frequent_df.to_dict()['Support']
    rules = []
    # A set makes the duplicate check constant-time; scanning the growing
    # ``rules`` list for every new rule was quadratic in the number of rules.
    seen_rules = set()
    sub_set_list = []
    for product, product_support in frequent_products.items():
        for sub_set in sub_set_list:
            if not sub_set.issubset(product):
                continue
            antecedent = product - sub_set
            conf = product_support / frequent_products[antecedent]
            rule = (convert_to_product(antecedent), convert_to_product(sub_set), conf)
            if rule not in seen_rules:
                seen_rules.add(rule)
                rules.append(rule)
        sub_set_list.append(product)
    confident_df = pd.DataFrame(rules, columns=['Occurrence', 'Co-Occurrence', 'Confident'])
    total = confident_df.shape[0]
    return total, confident_df

In [None]:
confident_10_total, confident_10 = calculate_confident(L10)
confident_10.head(confident_10_total)

In [None]:
confident_8_total, confident_8 = calculate_confident(L8)
confident_8.head(confident_8_total)

## Minimum confidence = 1

In [None]:
mini_conf = 1

In [None]:
rules_10 = confident_10[confident_10['Confident']>=mini_conf]
print ("Total number of rules: %d" % rules_10.shape[0])
rules_10.head(20)

In [None]:
rules_8 = confident_8[confident_8['Confident']>=mini_conf]
print ("Total number of rules: %d" % rules_8.shape[0])
rules_8.head(11632)

# Conclusion
1. When the minimum support is too high, the frequent itemsets contain only 1-itemsets.
2. For the same minimum confidence, a lower minimum support yields many more association rules.
3. When the minimum support is too low, there are too many association rules, and it becomes hard to find the ones we are interested in.

# For Project 3

In [None]:
rules_10

In [None]:
%%time
# Keep only the one-to-one rules and export them as (antecedent, consequent) pairs
proj3Data = [
    [next(iter(antecedent)), next(iter(consequent))]
    for antecedent, consequent, _confidence in rules_8.to_numpy()
    if len(antecedent) == 1 and len(consequent) == 1
]
pd.DataFrame(proj3Data).to_csv('for_proj3.csv', index = False, header = False)