Importing relevant libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import tensorflow as tf
import math
from scipy import special #comb, factorial
from keras import backend as K
from scipy.stats import uniform
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler,LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score,f1_score





import os

# Kaggle boilerplate: print the full path of every file found under the input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Use seaborn's 'darkgrid' style for the plots drawn in the cells below.
sns.set_style('darkgrid')

In [None]:
# Location of the raw transactions file on Kaggle.
DATA_PATH = '/kaggle/input/groceries-dataset/Groceries_dataset.csv'

# Load the transactions and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Basic info about the dataset

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

Do we have nulls?

In [None]:
# Number of missing values in each column.
df.isna().sum()

No, we don't

# How many unique shopping items do we have?

In [None]:
# Count the distinct values of the item column.
len(df['itemDescription'].unique())

# When were the purchases made?

In [None]:
# Parse the date column once; later cells rely on it being datetime64.
# NOTE(review): no `format`/`dayfirst` is given, so pandas infers the day/month order.
# If the CSV stores dates day-first (e.g. DD-MM-YYYY), ambiguous dates may be read
# month-first -- confirm against the raw file and pass `dayfirst=True` or an explicit
# `format=` if so.
df['Date'] = pd.to_datetime(df['Date'])

# Number of purchased items per calendar year.
# The values are passed as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, not as the vector to count.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(x=df['Date'].dt.year, ax=ax)
ax.set_title('When were the purchases made?')
ax.set_xlabel('Year')
plt.show()

We see that the dataset contains only purchases made in 2014 and 2015. Now let's look at a more detailed distribution.

In [None]:
# Daily sales: one point per calendar date, counting the item rows recorded on that date.
df3 = df.groupby('Date')['itemDescription'].count().reset_index()

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(df3['Date'], df3['itemDescription'])
ax.set_xlabel('Date')
ax.set_ylabel('Number of items bought')
ax.set_title('Number of items sold (each day)')
plt.show()

Let's reduce the noise by considering the total count of items bought **each month**.

In [None]:
# Monthly sales: snap every purchase date to the first day of its month and count
# the item rows per month. The vectorized `.dt` accessors replace the original
# per-row string formatting + re-parsing, and no full copy of `df` is needed.
# Requires `df['Date']` to already be datetime64 (converted in an earlier cell).
month_start = df['Date'].dt.to_period('M').dt.to_timestamp()
df3 = df.groupby(month_start)['itemDescription'].count().reset_index()

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(df3['Date'], df3['itemDescription'])
ax.set_xlabel('Date')
ax.set_ylabel('Number of items bought')
ax.set_title('Number of items sold (each month)')
plt.show()

To get a better idea of whether there is some yearly trend, we will adjust the graph as follows:

In [None]:
# Item count per (Year, Month). Year and Month come from the vectorized `.dt`
# accessors instead of a per-row `apply`, and only the counted column is selected,
# so no in-place column drops are needed.
# Requires `df['Date']` to already be datetime64 (converted in an earlier cell).
df3 = (
    df.assign(Year=df['Date'].dt.year, Month=df['Date'].dt.month)
      .groupby(['Year', 'Month'])['itemDescription']
      .count()
      .reset_index()
)

# One frame per year so the two curves can share the same month axis.
d_2014 = df3[df3['Year'] == 2014]
d_2015 = df3[df3['Year'] == 2015]

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(d_2014['Month'], d_2014['itemDescription'], label='2014')
ax.plot(d_2015['Month'], d_2015['itemDescription'], label='2015')
ax.set_title('Number of items sold (each month)')
ax.set_xlabel('Month')
ax.set_ylabel('item count')
ax.legend()
plt.show()

In [None]:
# Align the two years month by month, then read the off-diagonal entry of the
# 2x2 correlation matrix of their monthly item counts.
both_years = d_2014.merge(right=d_2015, on='Month')
count_cols = ['itemDescription_x', 'itemDescription_y']
corr = both_years[count_cols].corr().loc['itemDescription_x', 'itemDescription_y']
print(f'Correlation between sales in 2014 and 2015: {corr}')

A couple of notes can be made here:
1. Besides February, sales in each month of 2015 were higher than in the same month of 2014.
2. There doesn't seem to be any yearly trend: the correlation between the monthly sales in 2014 and 2015 is roughly 0.01.

Now let's look at the sales per each item

In [None]:
# Number of rows (sales) recorded for each item, most sold first.
df1 = (
    df.groupby('itemDescription')
      .count()
      .sort_values(by='Member_number', ascending=False)
      .reset_index()
      .rename(columns={'itemDescription': 'Item',
                       'Member_number': 'Number of sales'})
      .drop(columns=['Date'])
)
df1

Summary of the `Number of sales`

In [None]:
# Descriptive statistics of the per-item sales counts computed in the previous cell.
df1['Number of sales'].describe()

Let's look at the histogram plotting the distribution of the `Number of sales`

In [None]:
# Distribution of the per-item sales counts.
fig, ax = plt.subplots(figsize=(10, 7))
df1['Number of sales'].hist(alpha=0.6, ax=ax)
ax.set_xlabel('sales count')
ax.set_ylabel('item count')
ax.set_title("How many times each item was sold?")
plt.show()

We see that half of the items were purchased fewer than $86$ times. However, there are some outliers. Let's have a look at what the outliers are.

In [None]:
# The ten items with the most recorded sales, as an interactive bar chart.
df1 = (
    df.groupby('itemDescription')
      .count()
      .sort_values(by='Member_number', ascending=False)
      .head(10)
      .reset_index()
      .drop(columns=['Date'])
      .rename(columns={'itemDescription': 'Item',
                       'Member_number': 'Number of sales'})
)
fig = px.bar(df1, x='Item', y='Number of sales', title='Most purchased items')
fig.show()

Now let's have a look at the customers that bought the most items.

In [None]:
# The ten customers with the most purchased item rows, as an interactive bar chart.
# The customer ID is cast to str before plotting.
df1 = (
    df.groupby('Member_number')
      .count()
      .sort_values(by='itemDescription', ascending=False)
      .head(10)
      .reset_index()
      .drop(columns=['Date'])
      .rename(columns={'itemDescription': 'Item count',
                       'Member_number': 'Customer ID'})
      .astype({'Customer ID': str})
)
fig = px.bar(df1, x='Customer ID', y='Item count',
             title='Customers that purchased the most items')
fig.show()

As things stand, our data is organized in such a way that we cannot determine how many items (and which items exactly) each customer bought PER VISIT to the store. For example, have a look at the following table:

In [None]:
# First three purchases (ordered by date) of the customer with ID 1000.
is_member_1000 = df['Member_number'] == 1000
df.loc[is_member_1000].sort_values(by='Date').head(3)

We see that the customer with an ID 1000 bought 3 items on July 4, 2014. The problem is though, we don't know how many times he went to the store, and what he bought per each visit. For example, it is possible that he went to the store 3 times, each time buying single item. It is also possible that he went to the store 2 times, the first time he bought pastry and snack, and the second time he bought milk. Or maybe he went to the store only once, and bought all 3 items at once. Which one is it? We don't know. But to perform any meaningful association analysis, we need to know. Since no information was provided, we will make the following assumption:

> **Assumption**. On any given day, each customer went to the store ONLY ONCE.


With this assumption in mind, we can reorganize the dataset in the following way:


In [None]:
# One row per (customer, day): collect every item bought by that customer on that
# day into a single list. `agg(list)` gathers the items directly, replacing the
# original approach of wrapping each item in a one-element list and concatenating
# the lists with the builtin `sum` (quadratic, and dependent on pandas' deprecated
# remapping of builtin callables in `agg`).
df1 = (
    df.groupby(['Member_number', 'Date'])['itemDescription']
      .agg(list)
      .reset_index()
      .rename(columns={'itemDescription': 'Items bought'})
)
df1.head()

`Items bought` now represents the set of all items which were bought during a single visit to the store.

Let's see how many items customers purchase per each visit to the store.

In [None]:
# Basket size = number of entries in each visit's item list.
df1['Basket size'] = df1['Items bought'].map(len)

In [None]:
# Descriptive statistics of the per-visit basket sizes.
df1['Basket size'].describe()

In [None]:
# Distribution of basket sizes. Each row of `df1` is one (customer, day) visit,
# so the y-axis counts visits rather than customers.
basket_sizes = df1['Basket size']

# Basket sizes are integers (they come from `len`), so centre one bin on each
# integer value instead of letting the default 10 float bins straddle several sizes.
bin_edges = np.arange(basket_sizes.min(), basket_sizes.max() + 2) - 0.5

fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(basket_sizes, bins=bin_edges, alpha=0.6)
ax.set_xlabel('item count')
ax.set_ylabel('visit count')
ax.set_title("How many items are being purchased each visit?")
plt.show()

We see that most customers purchase 2 items per each visit to the store.

Now we will use [Association rule learning](https://en.wikipedia.org/wiki/Association_rule_learning) to check whether there are some patterns in the customers' purchasing behavior. To generate a set of relevant rules, we will use the [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm). Before proceeding, make sure that you are familiar with the following concepts: association rule, support, confidence, lift (information regarding all of these concepts can be found [here](https://en.wikipedia.org/wiki/Association_rule_learning), see section "Useful Concepts").


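We will only generate rules where support exceeds $0.1\%$ and confidence exceeds $10\%$. Note that `association_rules` is called below with `metric="lift"` and its default threshold, so rules whose lift is below $0.8$ are discarded as well.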
    

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import mlxtend as ml

# Mining thresholds.
MIN_SUPPORT = 0.001  # keep itemsets that occur in at least 0.1% of the baskets
# `association_rules` filters on the chosen metric with `min_threshold`, whose
# library default is 0.8. It is spelled out here so that the implicit
# "lift >= 0.8" filter is visible to the reader.
MIN_LIFT = 0.8

# One basket per (customer, day): the list of items bought on that visit.
df1 = (
    df.groupby(['Member_number', 'Date'])['itemDescription']
      .agg(list)
      .reset_index()
      .rename(columns={'itemDescription': 'Items bought'})
)

all_items = df['itemDescription'].unique()

# One-hot encode the baskets: one row per (customer, day), one boolean column per
# item (True when the item appears in the basket). The cross-tabulation replaces
# the nested Python loops and yields the boolean dtype that mlxtend's `apriori`
# prefers. Rows come out sorted by (Member_number, Date), i.e. in the same order
# as `df1`; columns are re-ordered to match `all_items`.
df2 = (
    pd.crosstab([df['Member_number'], df['Date']], df['itemDescription'])
      .gt(0)
      .reindex(columns=all_items)
      .reset_index(drop=True)
      .rename_axis(index='Transcation ID', columns=None)
)

frequent_itemsets = apriori(df2, min_support=MIN_SUPPORT, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=MIN_LIFT)
rules.sort_values('confidence', ascending = False, inplace = True)

In [None]:
# Keep only the rules whose confidence is above 10%.
confident_enough = rules['confidence'] > 0.1
rules = rules.loc[confident_enough].copy()
rules.head()

Let's see how many association rules we are dealing with:

In [None]:
# Number of rules that survived the filters.
rows = len(rules)
print(f'Number of rules: {rows}')

Statistical summary of `support`:

In [None]:
# Descriptive statistics of the `support` column of the retained rules.
rules['support'].describe()

We see that the support (i.e., the proportion of transactions that contain the items of both the antecedent and the consequent) is very low for all rules in our dataset. This may be problematic, because any results obtained from the analysis may not be statistically significant.

Let's see the rules with the highest lift

In [None]:
# Ten rules with the highest lift. Columns are selected by name only: the original
# positional `.iloc[:, :-2]` slice was redundant and would break if the column
# layout returned by `association_rules` changed.
rules.sort_values(by='lift', ascending=False).head(10)[['antecedents',
                                                       'consequents',
                                                       'consequent support',
                                                       'lift']]

We see that itemsets (yogurt, whole milk) and (sausage) have the highest lift. That means that once we know that a customer has bought yogurt and whole milk, it becomes 2.2 times more likely that he will also buy sausage. But as we've pointed out, due to the fact that the support is very low, we cannot really determine whether this is just a fluke or a real association.

Similarly, we will check the rules with the lowest lift (i.e., rules where the items in the antecedent and the consequent are unlikely to be bought together).

In [None]:
# Ten rules with the lowest lift. Columns are selected by name only: the original
# positional `.iloc[:, :-2]` slice was redundant and would break if the column
# layout returned by `association_rules` changed.
rules.sort_values(by='lift', ascending=True).head(10)[['antecedents',
                                                      'consequents',
                                                      'consequent support',
                                                      'lift']]

Let's visualize the relation between support, confidence and lift

In [None]:
# Support vs. confidence for every retained rule, coloured by lift.
sup, conf, lift = (rules[col].values for col in ('support', 'confidence', 'lift'))

fig, ax = plt.subplots(figsize=(10, 6))
sc = ax.scatter(sup, conf, c=lift)
fig.colorbar(sc, ax=ax, label='Lift')
ax.set_xlabel('support')
ax.set_ylabel('confidence')
plt.show()

As we see, most rules have both low confidence and low support. However, there is one rule that we may want to single out: the rule with the confidence of about $26\%$:

In [None]:
# Rules whose confidence is above 24%.
rules.loc[rules['confidence'] > 0.24]

We see that the probability of a customer buying milk is roughly $16\%$. But given that the customer has bought yogurt and sausage, the probability of buying milk increases to $25\%$ (this also signifies that the rule has a high lift). But again, support is very low (only $0.1\%$). So while this rule seems to be the most promising of the rules our dataset contains, because the support is very low, one should not draw any rash conclusions about the association between the two itemsets without further investigation. How could one investigate? One option is to come up with a way to directly (or indirectly) ask those customers who bought yogurt, sausage and whole milk why they buy these items together.

Based on the graph above, one could also see some rules with a relatively decent support (i.e., support exceeds $1\%$), let's check what those rules are:

In [None]:
# Rules whose support is above 1%.
rules.loc[rules['support'] > 0.01]

Not only is the confidence low for both of these rules, but the lift is actually less than 1. For example, the probability of a customer buying whole milk is about $16\%$. But if we know that the customer bought yogurt, then the probability of buying whole milk drops to $13\%$. The upshot is: these rules are not of much interest to us.

### Conclusions:

1. The dataset contains transactions made in 2014 and 2015.
2. In each month of 2015 (besides February), the sales (i.e., total count of items sold) grew from a year earlier. Furthermore, the correlation between the monthly sales of 2014 and 2015 is low (around 0.01), which suggests there is no yearly sales trend.
3. The dataset contains 167 unique shopping items.
4. Over the span of two years, half of the shopping items were bought fewer than 85 times each. The most popular items (i.e., those items that were bought more than 1k times) are:

    - whole milk (bought 2502 times)
    - other vegetables (bought 1898 times)
    - rolls/buns (bought 1716 times)
    - soda (bought 1514 times)
    - yogurt (bought 1334 times)
    - root vegetables (bought 1071 times)
    - tropical fruit (bought 1032 times)
5. Per visit to the store, half of the customers purchase 2 items or fewer, and 95% of the customers purchase 5 items or fewer.

6. Using the Apriori algorithm (AA), we've found that most rules have very low support (which implies that even if one finds a rule with a strong association, the association might not be statistically significant). Furthermore, using AA we've found that most rules have low confidence and low lift (which signifies a weak association). The only rule which may have a meaningful association is $$\text{(yogurt, sausage)} \implies \text{(whole milk)}$$
This rule may be meaningful because it has the highest confidence out of all rules ($\approx 26\%$) and one of the highest lifts ($\approx 1.6$). This means that, once we know the customer has purchased yogurt and sausage, the probability of the customer also buying whole milk significantly increases. However, given that the support is low ($\approx 0.1\%$), one should be careful before drawing any conclusions about whether the association is significant.