In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr', False)
!pip install openpyxl

from mlxtend.frequent_patterns import apriori, association_rules

# Read the "Year 2010-2011" sheet of the Online Retail II workbook into df_.
df_ = pd.read_excel("../input/online-retail-ii-data-set-from-ml-repository/online_retail_II.xlsx", sheet_name= "Year 2010-2011")

# Keep the frame read from disk (df_) as-is; later steps operate on this copy (df).
df = df_.copy()

In [None]:
# Outlier handling: compute capping thresholds and clamp extreme values to them
def outlier_thresholds(dataframe, variable, q1=0.01, q3=0.99):
    """Compute lower and upper capping limits for one column.

    The limits follow the IQR-style rule ``low = Q_low - 1.5 * spread`` and
    ``up = Q_high + 1.5 * spread`` where ``spread = Q_high - Q_low``. By
    default the two quantiles are the 1st and 99th percentiles (not the
    25th/75th quartiles), so only very extreme values fall outside the limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to analyse.
    variable : str
        Name of the column in ``dataframe``.
    q1 : float, default 0.01
        Lower quantile, between 0 and 1.
    q3 : float, default 0.99
        Upper quantile, between 0 and 1; must not be smaller than ``q1``.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If ``q1`` is greater than ``q3``.
    """
    if q1 > q3:
        raise ValueError(f"q1 ({q1}) must not be greater than q3 ({q3})")
    lower_quantile = dataframe[variable].quantile(q1)
    upper_quantile = dataframe[variable].quantile(q3)
    # Distance between the two quantiles; 1.5 times this is added on each side.
    interquantile_range = upper_quantile - lower_quantile
    up_limit = upper_quantile + 1.5 * interquantile_range
    low_limit = lower_quantile - 1.5 * interquantile_range
    return low_limit, up_limit
def replace_with_thresholds(dataframe, variable):
    """Cap one column of ``dataframe`` at its outlier thresholds, in place.

    Values below the lower limit returned by ``outlier_thresholds`` are set
    to that limit and values above the upper limit are set to the upper limit.
    Missing values are left as they are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column ``variable`` is replaced on this object.
    variable : str
        Name of the column to cap.

    Returns
    -------
    None
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() upcasts when needed (e.g. an integer column capped at a fractional
    # limit). Writing a float into an integer column through .loc is deprecated
    # since pandas 2.1, so the masked .loc assignments are avoided.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

In [None]:
# preprocessing
def retail_data_prep(dataframe):
    """Return a cleaned copy of the retail transactions.

    Steps, in order:
      1. drop rows with any missing value;
      2. drop rows whose ``Invoice`` contains the letter "C"
         (presumably cancelled invoices - confirm against the dataset docs);
      3. keep only rows with positive ``Quantity`` and positive ``Price``;
      4. cap ``Quantity`` and ``Price`` with ``replace_with_thresholds``.

    The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Invoice``, ``Quantity`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned rows.
    """
    # Non-in-place dropna: the caller's frame keeps its rows.
    without_missing = dataframe.dropna()
    # astype(str) keeps the .str accessor usable even if the column holds
    # non-string values (e.g. purely numeric invoice numbers).
    has_c = without_missing["Invoice"].astype(str).str.contains("C", na=False)
    keep = ~has_c & (without_missing["Quantity"] > 0) & (without_missing["Price"] > 0)
    # Explicit copy so the in-place capping below acts on an independent frame
    # rather than on a slice of the input (avoids SettingWithCopyWarning).
    prepared = without_missing.loc[keep].copy()
    replace_with_thresholds(prepared, "Quantity")
    replace_with_thresholds(prepared, "Price")
    return prepared

# Run the preprocessing and rebind df to the frame it returns.
df = retail_data_prep(df)
# Preview the first rows of the result.
df.head()

In [None]:
# Restrict the analysis to rows whose Country is exactly "Germany".
df_ger = df.loc[df['Country'].eq("Germany")]

In [None]:
# Total quantity per (Invoice, Description) pair; show the first 20 pairs.
(
    df_ger
    .groupby(['Invoice', 'Description'])
    .agg({"Quantity": "sum"})
    .head(20)
)

In [None]:
# Pivot descriptions into columns (one row per invoice); show a 5x5 corner.
(
    df_ger
    .groupby(['Invoice', 'Description'])
    .agg({"Quantity": "sum"})
    .unstack()
    .iloc[0:5, 0:5]
)


In [None]:
# Same pivot, with missing invoice/product combinations filled with 0.
(
    df_ger
    .groupby(['Invoice', 'Description'])
    .agg({"Quantity": "sum"})
    .unstack()
    .fillna(0)
    .iloc[0:5, 0:5]
)

In [None]:
# Binarise the pivot: 1 if the product appears on the invoice, 0 otherwise.
# A vectorised comparison is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; the result is still an int64 0/1 table.
(
    df_ger
    .groupby(['Invoice', 'Description'])
    .agg({"Quantity": "sum"})
    .unstack()
    .fillna(0)
    .gt(0)
    .astype("int64")
    .iloc[0:5, 0:5]
)

In [None]:
def create_invoice_product_df(dataframe, id=False):
    """Build a 0/1 invoice-by-product matrix.

    Quantities are summed per (Invoice, product) pair and pivoted so that
    each row is an invoice and each column a product. A cell is 1 when the
    summed quantity is greater than 0, and 0 otherwise (including pairs that
    never occur together).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``Invoice``, ``Quantity`` and the chosen product column.
    id : bool, default False
        If truthy, products are identified by ``StockCode``; otherwise by
        ``Description``.

    Returns
    -------
    pandas.DataFrame
        int64 frame of 0/1 flags, indexed by invoice.
    """
    # Only the product key differs between the two modes.
    product_col = "StockCode" if id else "Description"
    quantities = (
        dataframe.groupby(["Invoice", product_col])["Quantity"]
        .sum()
        .unstack()
        .fillna(0)
    )
    # Vectorised replacement for applymap(lambda x: 1 if x > 0 else 0);
    # DataFrame.applymap is deprecated since pandas 2.1.
    return quantities.gt(0).astype("int64")

In [None]:
# Invoice x product matrix keyed by StockCode (id=True). The earlier
# Description-keyed call was overwritten immediately by this assignment, so
# only this one is kept.
ger_inv_pro_df = create_invoice_product_df(df_ger, id=True)

In [None]:
# Look up the product name (Description) for a given StockCode
def check_id(dataframe, stock_code):
    """Print the description of the first row matching ``stock_code``.

    The description is printed as a one-element list, e.g. ``['SOME NAME']``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``StockCode`` and ``Description``.
    stock_code : object
        Value compared for equality against the ``StockCode`` column.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If no row has the given ``stock_code``.
    """
    matches = dataframe.loc[dataframe["StockCode"] == stock_code, "Description"]
    if matches.empty:
        # Same exception type as indexing an empty result, but the message
        # names the missing code.
        raise IndexError(f"StockCode {stock_code!r} not found in dataframe")
    product_name = matches.iloc[:1].tolist()
    print(product_name)

In [None]:
# Print the description stored for StockCode 21987.
check_id(df_ger, 21987)  # author's note: PACK OF 6 SKULL PAPER CUPS


In [None]:
# Print the description stored for StockCode 23235.
check_id(df_ger, 23235)  # author's note: STORAGE TIN VINTAGE LEAF


In [None]:
# Print the description stored for StockCode 22747.
check_id(df_ger, 22747)  # author's note: POPPY'S PLAYHOUSE BATHROOM

**Association Rules**

Antecedents : the item(s) already in the basket (the "if" side of a rule)

Consequents : the item(s) the rule predicts will be bought as well (the "then" side of a rule)

Antecedent support : the proportion of transactions that contain the antecedent item(s)

Consequent support : the proportion of transactions that contain the consequent item(s)

Support : the proportion of transactions that contain both the antecedent and the consequent

Confidence : probability of getting y when x is taken

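Lift : how many times more likely y is to be bought when x is bought, compared with y's baseline probability (confidence / consequent support); values above 1 indicate a positive association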

In [None]:
# Mine frequent itemsets from the invoice x product matrix with a minimum
# support of 0.01; use_colnames=True labels items by column name.
frequent_itemsets = apriori(ger_inv_pro_df, min_support=0.01, use_colnames=True)
# NOTE(review): if these lines share one cell, only the cell's last expression
# is rendered, so this preview (and the one after `rules`) shows nothing.
frequent_itemsets.sort_values("support", ascending=False).head(20)

# Derive association rules, filtering on the "support" metric (>= 0.01).
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.01)
rules.sort_values("support", ascending=False).head(100)

# Top 100 rules ranked by lift.
rules.sort_values("lift", ascending=False).head(100)

Product recommendation for users

In [None]:
def arl_recommender(rules_df, product_id, rec_count=1):
    """Recommend products for ``product_id`` from association rules.

    Rules are ranked by descending lift. For every rule whose antecedents
    contain ``product_id``, the first item of its consequents is taken as a
    candidate. Each product is recommended at most once, even when several
    matching rules lead to it.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Rules with the columns ``antecedents``, ``consequents`` (collections
        of product ids) and ``lift``.
    product_id : object
        Product to find recommendations for.
    rec_count : int, default 1
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``rec_count`` distinct product ids, best lift first; empty if
        no rule has ``product_id`` among its antecedents.
    """
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    # Walk antecedents and consequents together instead of re-indexing the
    # frame with iloc on every match.
    for antecedent, consequent in zip(sorted_rules["antecedents"], sorted_rules["consequents"]):
        if product_id not in antecedent:
            continue
        candidate = list(consequent)[0]
        # Skip products already recommended by a higher-lift rule.
        if candidate not in recommendation_list:
            recommendation_list.append(candidate)

    return recommendation_list[0:rec_count]

In [None]:
# Recommend up to 2 products for a user who has StockCode 21987 in their cart.
arl_recommender(rules, 21987,2)

In [None]:
# Print the description of StockCode 21086
# (presumably one of the products recommended above - cell output not shown).
check_id(df_ger, 21086)

In [None]:
# Try another product: a single recommendation for StockCode 22747.
arl_recommender(rules, 22747,1)

In [None]:
# Print the description of StockCode 22746
# (presumably the product recommended above - cell output not shown).
check_id(df_ger, 22746)