In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import required libraries
import plotly.graph_objects as go
import plotly.express as px

In [None]:
#loading dataset
# NOTE(review): the Date column of this CSV appears to be written day-first (DD-MM-YYYY) - confirm
# against the file. Without dayfirst=True an ambiguous value such as 05-01-2015 can be read as
# 1 May instead of 5 January, which would silently corrupt any date-based analysis.
df = pd.read_csv(
    '../input/groceries-dataset/Groceries_dataset.csv',
    parse_dates=['Date'],
    dayfirst=True,
)
df.head()

Exploratory Data Analysis

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Print the column dtypes, non-null counts and memory usage
df.info()

How many individual items do we have?

In [None]:
# Number of purchase rows per product, most frequent first. Stored under its own name because
# `all_products` is bound to the array of distinct product names in a later cell; reusing one
# name for a Series and then an array makes out-of-order execution error-prone.
product_counts = df['itemDescription'].value_counts()
product_counts

Looking at which items are in this dataset

In [None]:
# Distinct product names found in the itemDescription column (one entry per product)
all_products = df['itemDescription'].unique()
all_products

 Data preparation and visualization


1. Most purchased items
2. Least purchased items
3. Top customers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import permutations
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-<name>" and later
# releases removed the old names, so 'seaborn-pastel' raises on a current install. Use whichever
# name this matplotlib ships, falling back to the default style if neither is present.
pastel_style = next(
    (name for name in ('seaborn-v0_8-pastel', 'seaborn-pastel') if name in plt.style.available),
    'default',
)
plt.style.use(pastel_style)

In [None]:
#Chart 1 - Most purchased items
# Rank products by number of purchase rows and keep the 20 most frequent
top_items = df['itemDescription'].value_counts().head(20)
# pandas returns the Axes it drew on, so label that Axes directly
ax = top_items.plot.bar(figsize=(8, 6), alpha=0.8, color='violet')
ax.set_title('20 most purchased items', size=15)
ax.set_ylabel('Quantity')

In [None]:
#Chart 2 - Least purchased items
# value_counts sorts descending, so the tail holds the 20 rarest products
rare_items = df['itemDescription'].value_counts().tail(20)
# pandas returns the Axes it drew on, so label that Axes directly
ax = rare_items.plot.bar(figsize=(8, 6), alpha=0.8, color='lightseagreen')
ax.set_title('20 least purchased items', size=15)
ax.set_ylabel('Quantity')

In [None]:
#Chart 3 - Top Customers
# The 20 member numbers with the most purchase rows, in descending order; used to order the bars
top_customers = df['Member_number'].value_counts().head(20).index
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='Member_number', data=df, order=top_customers, palette='winter', alpha=0.6)
# Rotate the tick labels through the tick parameters instead of re-setting the label objects
ax.tick_params(axis='x', labelrotation=90)
# Labels previously read "Costumers"/"Costumer", a misspelling of "Customers"/"Customer"
plt.title('Top Customers', size=15)
plt.xlabel('Customer')
plt.ylabel('Quantity Purchased')

Data modelling for Association Rules and Apriori

In [None]:
#Grouping by customer and date to create transactions
# One row per (member, date) pair; itemDescription holds the distinct items of that pair
transactions = (
    df.groupby(['Member_number', 'Date'])['itemDescription']
    .unique()
    .reset_index()
)

In [None]:
#Taking a look at the number of transactions
# shape[0] is the number of transactions, shape[1] the number of columns
print(transactions.shape)
# First rows, rendered as the cell's output
transactions.head()

In [None]:
#Separating the transactions as a list of lists and taking a look
# Convert each transaction's item array into a plain list, the input format used for encoding below
trsct = [list(basket) for basket in transactions['itemDescription']]
# Preview only the first few transactions; echoing the whole list floods the cell output
trsct[:5]

In [None]:
#one hot encoding and creating the encoded Dataframe
# Fit the encoder on the transactions, then encode those same transactions
encoder = TransactionEncoder()
encoder.fit(trsct)
onehot = encoder.transform(trsct)
# Label the encoded columns with the names the fitted encoder exposes
dfonehot = pd.DataFrame(data=onehot, columns=encoder.columns_)
dfonehot.head()

The Apriori algorithm

In [None]:
#Applying the apriori algorithm with a min_support of 0.002
# use_colnames=True reports itemsets by item name instead of column index
frequent_itemsets = apriori(dfonehot, use_colnames=True, min_support=0.002)
# Report how many frequent itemsets were found
print(frequent_itemsets.shape[0])

In [None]:
#Compute association rules with a lift threshold of 1
# Rules are derived from the frequent itemsets found above and filtered on the lift metric;
# presumably rules with lift below min_threshold are dropped - confirm against the mlxtend docs
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)

In [None]:
#Printing final rules
# Bare last expression, so the notebook renders the rules table as the cell output
rules