# <a>Contents</a>

- <a href="#import">Importing libraries</a>
- <a href="#eda">Exploratory data analysis</a>
 - <a href="#eda1">Customer distribution among age groups and gender</a>
 - <a href="#eda2">Customer distribution among occupation</a>
 - <a href="#eda3">Customer distribution among other features</a>
 - <a href="#eda4">Top selling products</a>
 - <a href="#eda5">Top selling product categories</a>
- <a href="#bs">Market basket analysis</a>
 - <a href="#bs1">Data transformation</a>
 - <a href="#bs2">Association rules</a>
 - <a href="#bs3">RCD of association rules</a>
- <a href="#ref">References</a>

<a id="import"></a>
# Importing libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import warnings

# Apply seaborn's default theme to every figure drawn below.
sns.set()
# Suppress all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas/seaborn/mlxtend -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

<a id="eda"></a>
# Exploratory data analysis
### Load dataset

In [None]:
# Load the Black Friday training data; the summary below treats every row as
# one transaction (a single product bought by a single customer).
dst = pd.read_csv('/kaggle/input/black-friday/train.csv')

# Quick size summary of the raw data.
print(f'Data dimensions: {dst.shape}')
print(f'Number of transactions: {dst.shape[0]}')
print(f'Number of customers: {dst["User_ID"].nunique()}')
print(f'Number of products: {dst["Product_ID"].nunique()}')

# Preview the first rows as the cell output.
dst.head()

### Extract customer related information

In [None]:
# Build one row per customer: demographic attributes plus spending totals.
customer_columns = ['Gender', 'Age', 'Occupation', 'City_Category',
                    'Stay_In_Current_City_Years', 'Marital_Status']

# Group once and reuse the grouping for every aggregate.
transactions_by_user = dst.groupby('User_ID')

# The demographic attributes are taken from each customer's first row.
user_data = transactions_by_user[customer_columns].first()

# Select 'Purchase' *before* aggregating: summing/counting the whole frame
# would also aggregate every other column (including non-numeric ones such as
# the product identifiers), which is slow and fragile across pandas versions.
user_data['Purchase'] = transactions_by_user['Purchase'].sum()
user_data['Products_Purchased'] = transactions_by_user['Purchase'].count()

# Turn User_ID back into a regular column.
user_data = user_data.reset_index()
user_data.head()

<a id="eda1"></a>
### Customer distribution among age groups and gender

In [None]:
# Three stacked charts per age group, split by gender: customer count,
# median total purchase amount, median number of purchased products.
customers_by_age = user_data.sort_values('Age')  # sorted once, shared by all charts

fig, axes = plt.subplots(3, 1, figsize=(15, 15))
count_ax, purchase_ax, products_ax = axes

sns.countplot(ax=count_ax, x='Age', hue='Gender', data=customers_by_age)
count_ax.set_ylabel('Number of customers')

# The two bar charts differ only in the plotted column and the y-axis label.
median_charts = (
    (purchase_ax, 'Purchase', 'Median purchase amount'),
    (products_ax, 'Products_Purchased', 'Median number of purchased products'),
)
for ax, column, ylabel in median_charts:
    sns.barplot(ax=ax, x='Age', y=column, hue='Gender', data=customers_by_age,
                estimator=np.median, capsize=.2)
    ax.set_ylabel(ylabel)

plt.show()

<a id="eda2"></a>
### Customer distribution among occupation

In [None]:
# Three stacked charts per occupation code: customer count, median total
# purchase amount, median number of purchased products.
fig, axes = plt.subplots(3, 1, figsize=(15, 15))
count_ax, purchase_ax, products_ax = axes

# Only the count plot is fed the occupation-sorted frame.
sns.countplot(ax=count_ax, x='Occupation', data=user_data.sort_values('Occupation'))
count_ax.set_ylabel('Number of customers')

median_charts = (
    (purchase_ax, 'Purchase', 'Median purchase amount'),
    (products_ax, 'Products_Purchased', 'Median number of purchased products'),
)
for ax, column, ylabel in median_charts:
    sns.barplot(ax=ax, x='Occupation', y=column, data=user_data,
                estimator=np.median, capsize=.2)
    ax.set_ylabel(ylabel)

plt.show()

<a id="eda3"></a>
### Customer distribution among other features

In [None]:
# 3x3 grid: one column of charts per customer feature, with rows showing
# customer count, median purchase amount and median number of products.
other_features = ('City_Category', 'Stay_In_Current_City_Years', 'Marital_Status')

fig, axes = plt.subplots(3, 3, figsize=(15, 15))
for column_idx, feature in enumerate(other_features):
    ordered_customers = user_data.sort_values(feature)
    count_ax, purchase_ax, products_ax = axes[:, column_idx]

    sns.countplot(ax=count_ax, x=feature, data=ordered_customers)
    count_ax.set_ylabel('Number of customers')

    sns.barplot(ax=purchase_ax, x=feature, y='Purchase', data=ordered_customers,
                estimator=np.median, capsize=.2)
    purchase_ax.set_ylabel('Median purchase amount')

    sns.barplot(ax=products_ax, x=feature, y='Products_Purchased', data=ordered_customers,
                estimator=np.median, capsize=.2)
    products_ax.set_ylabel('Median number of purchased products')

# Extra padding keeps the axis labels of neighbouring charts from overlapping.
fig.tight_layout(pad=3.0)

<a id="eda4"></a>
### Top selling products

In [None]:
# Two stacked bar charts: the ten products bought most often and the ten
# products with the largest total purchase amount.
fig, axes = plt.subplots(2, 1, figsize=(16, 12))
count_ax, sum_ax = axes

# Top 10 by number of transactions (value_counts is already sorted descending).
top_products_count = dst['Product_ID'].value_counts().to_frame().head(10)
sns.barplot(ax=count_ax,
            x=top_products_count.index,
            y=top_products_count.to_numpy().ravel())
count_ax.set_xlabel('Product ID')
count_ax.set_ylabel('Purchase count')

# Top 10 by summed purchase amount.
purchase_per_product = dst.groupby('Product_ID')['Purchase'].sum()
top_products_sum = purchase_per_product.sort_values(ascending=False).head(10)
sns.barplot(ax=sum_ax,
            x=top_products_sum.index,
            y=top_products_sum.to_numpy())
sum_ax.set_xlabel('Product ID')
sum_ax.set_ylabel('Purchase sum')

plt.show()

<a id="eda5"></a>
### Top selling product categories

In [None]:
# Two stacked bar charts per product category: how often the category appears
# and the total purchase amount attributed to it. A transaction contributes to
# every category column it has a value in (Product_Category_1/2/3).
fig, axes = plt.subplots(2, 1, figsize=(16, 12))

# --- Purchase count per category -------------------------------------------
cat_1_count = dst['Product_Category_1'].value_counts()
cat_2_count = dst['Product_Category_2'].value_counts()
# The index is cast to int so it aligns with the Product_Category_1 labels
# (presumably categories 2 and 3 are stored as floats because of missing values).
cat_2_count.index = cat_2_count.index.astype(int)
cat_3_count = dst['Product_Category_3'].value_counts()
cat_3_count.index = cat_3_count.index.astype(int)
# fill_value=0 keeps categories that are absent from one of the columns.
cat_count = cat_1_count.add(cat_2_count, fill_value=0)
cat_count = cat_count.add(cat_3_count, fill_value=0)
sns.barplot(ax=axes[0], x=cat_count.index, y=cat_count)
axes[0].set_xlabel('Product category')
axes[0].set_ylabel('Purchase count')

# --- Purchase sum per category ---------------------------------------------
cat_1_sum = dst.groupby('Product_Category_1')['Purchase'].sum()
cat_2_sum = dst.groupby('Product_Category_2')['Purchase'].sum()
cat_2_sum.index = cat_2_sum.index.astype(int)
cat_3_sum = dst.groupby('Product_Category_3')['Purchase'].sum()
cat_3_sum.index = cat_3_sum.index.astype(int)
cat_sum = cat_1_sum.add(cat_2_sum, fill_value=0)
cat_sum = cat_sum.add(cat_3_sum, fill_value=0)
sns.barplot(ax=axes[1], x=cat_sum.index, y=cat_sum)
# Label the second chart (axes[1]); labelling axes[0] here would overwrite the
# 'Purchase count' label of the first chart.
axes[1].set_xlabel('Product category')
axes[1].set_ylabel('Purchase sum')

plt.show()

<a id="bs"></a>
# Market basket analysis using association rules (FP-Growth Algorithm)

<a id="bs1"></a>
### Perform data transformation

In [None]:
# One basket per customer: the list of Product_IDs found in that customer's rows.
products_per_user = dst.groupby('User_ID')['Product_ID'].apply(list)
basket = products_per_user.tolist()

# One-hot encode the baskets (rows = customers, columns = products) into a
# sparse matrix and wrap it in a sparse DataFrame.
te = TransactionEncoder()
te.fit(basket)
onehot_matrix = te.transform(basket, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(onehot_matrix, columns=te.columns_)

<a id="bs2"></a>
### Find association rules
**Thresholds:** minimum support = 0.02, minimum lift = 1, minimum confidence = 0.5

In [None]:
# Mine frequent itemsets with FP-Growth, derive rules with lift >= 1, then
# keep only rules with confidence >= 0.5, strongest (lift, confidence) first.
frequent_itemsets = fpgrowth(sparse_df, min_support=0.02, use_colnames=True)
candidate_rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
is_confident = candidate_rules['confidence'] >= 0.5
rules = (
    candidate_rules[is_confident]
    .sort_values(by=['lift', 'confidence'], ascending=False)
    .reset_index(drop=True)
)

print(f'Total number of rules found: {rules.shape[0]}\n')

# Report the rules grouped by the number of items in the antecedent.
antecedent_lengths = rules['antecedents'].apply(len)
for n in np.unique(antecedent_lengths):
    rules_with_n = rules[antecedent_lengths == n]
    print(f'\nNumber of rules with {n} antecedent(s): {rules_with_n.shape[0]}')
    print('Rules with highest lift value:')
    # `rules` is already sorted by lift, so the head holds the strongest rules.
    display(rules_with_n.reset_index(drop=True).head())

# Support/confidence scatter of all retained rules.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(rules['support'], rules['confidence'], alpha=0.2)
ax.set_title('Distribution of association rules')
ax.set_xlabel('Support')
ax.set_ylabel('Confidence')
plt.show()

<a id="bs3"></a>
### Reverse cumulative distribution of association rules based on different values for the minimum support and confidence thresholds

In [None]:
# For each minimum-support threshold, plot how many rules remain as the
# minimum-confidence threshold increases.
support_vals = (0.02, 0.025, 0.03, 0.035, 0.04)
confidence_vals = np.arange(0.1, 1, 0.1)

fig, ax = plt.subplots(figsize=(12, 8))
for s in support_vals:
    # Frequent itemsets depend only on the support threshold, so they are
    # mined once per curve and reused for every confidence threshold.
    itemsets_at_support = fpgrowth(sparse_df, min_support=s, use_colnames=True)
    n_rules = [
        len(association_rules(itemsets_at_support, metric='confidence', min_threshold=c))
        for c in confidence_vals
    ]
    ax.plot(confidence_vals, n_rules, label=f'min_support={s}')

ax.set_title('RCD')
ax.set_xlabel('Confidence')
ax.set_ylabel('Number of rules')
ax.legend()
plt.show()

<a id="ref"></a>
# References

- https://en.wikipedia.org/wiki/Association_rule_learning
- https://towardsdatascience.com/association-rules-2-aa9a77241654
- https://www.softwaretestinghelp.com/fp-growth-algorithm-data-mining/