### Questions to Answer:
1. What is the best time to sell?

2. What are the popular items?

3. What items are usually bought together? (Association Rule Mining)

4. What is the trend?

# Data Auditing

In [None]:
#pip install apriori
#pip install mlxtend
#pip install seaborn

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# list every file below the Kaggle input directory to confirm the dataset's path
# (`os` is imported with the other modules in the import cell at the top)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# load the raw CSV into a DataFrame
csv_path = '/kaggle/input/the-bread-basket/bread basket.csv'
df = pd.read_csv(csv_path)

In [None]:
# first look: preview the first five rows to see the columns and how values are formatted
df.head()

In [None]:
# dataset dimensions as (rows, columns)
print(df.shape)
print('----------------------------')
# count missing values per column and only claim "no NULLs" when that is actually true
null_counts = df.isna().sum()
if null_counts.sum() == 0:
    print('There are no NULL values:')
else:
    print('NULL values found:')
print(null_counts)
print('----------------------------')
# info() prints its own report and returns None, so wrapping it in print() would add a stray "None"
df.info()

In [None]:
# summary figures (count, unique, top, freq) for the object-dtype, i.e. categorical/text, columns
df.describe(include = ['O'])

In [None]:
# list the distinct values that appear in the Item column
df.Item.unique()

## Preparing data for analysis

In [None]:
# each source row records one unit of one item, so give it an explicit quantity of 1
df['Quantity'] = 1

# columns that identify a single transaction
basket_keys = ['Transaction', 'date_time', 'period_day', 'weekday_weekend']

# Pivot items to columns to run the Apriori algorithm later:
# one row per transaction, one column per item, cell = quantity bought (0 when not bought).
# aggfunc='sum' adds up repeated rows of the same item within a transaction, which is what the
# earlier groupby(...).sum() followed by the default (mean) pivot produced in two steps.
df1 = (
    df.pivot_table(values='Quantity', index=basket_keys, columns='Item', aggfunc='sum')
      .reset_index()
      .rename_axis(None, axis=1)
      .fillna(0)
)
df1.head(2)

In [None]:
# Basket size = number of item rows recorded for each transaction in the raw data.
# df1 holds exactly one row per transaction, so counting duplicates of df1['date_time']
# would count transactions that share a timestamp rather than items per transaction.
# NOTE(review): the write-up further down quotes "about 3%" multi-item baskets from the
# earlier calculation; re-check that figure against this output.
bought = (
    df.groupby('Transaction')['Item']
      .count()
      .rename_axis('Transaction')
      .reset_index(name='number')
)

# share of transactions for every basket size, largest basket first
basket_share = bought['number'].value_counts(normalize=True).sort_index(ascending=False)
for size, share in basket_share.items():
    print(f"{size} items:", round(share * 100, 2), '%', sep='')

In [None]:
# split date_time into Date, Time, Day, Month and Year columns for easier analysis later.
# Parse df1's own column: df has one row per item while df1 has one row per transaction,
# so assigning from df would align on the row index and attach the wrong timestamps.
# NOTE(review): no format/dayfirst is given here — confirm whether the CSV writes dates
# day-first or month-first and pass the matching option.
df1['date_time'] = pd.to_datetime(df1['date_time'])

# new column name -> strftime pattern used to derive it
date_parts = {
    'Date': '%d-%m-%Y',
    'Time': '%H:%M:%S',
    'Day': '%a',
    'Month': '%b',
    'Year': '%Y',
}
for part, pattern in date_parts.items():
    df1[part] = df1['date_time'].dt.strftime(pattern)

# Move the date/time parts to the front (right after Transaction) so they are easier to read.
# date_time itself is kept; the resulting order is Transaction, Date, Time, Day, Month, Year,
# date_time, period_day, weekday_weekend, then the item columns starting at position 9.
leading = ['Transaction'] + list(date_parts)
df1 = df1[leading + [column for column in df1.columns if column not in leading]]
df1.head(2)

## The number of transactions by day: Friday, Saturday and Monday are the busiest.

In [None]:
# order the weekdays Mon..Sun so the bars follow the calendar rather than the counts
df1['Day'] = pd.Categorical(df1['Day'], categories=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], ordered=True)

# rename_axis + reset_index(name=...) gives the columns Day/Transactions on every pandas version;
# renaming 'index' and 'Day' after reset_index() relied on how pandas < 2.0 named the
# value_counts() result and produces the wrong columns on pandas >= 2.0
day = (
    df1['Day'].value_counts()
              .sort_index()
              .rename_axis('Day')
              .reset_index(name='Transactions')
)
ax = sns.barplot(x="Day", y="Transactions", data=day)
ax.set_title('Number of transactions by day of week')
plt.show()

## The number of transactions by month: November and December are the busiest.

In [None]:
# order the months Jan..Dec so the bars follow the calendar rather than the counts
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
df1['Month'] = pd.Categorical(df1['Month'], categories=month_order, ordered=True)

# rename_axis + reset_index(name=...) gives the columns Month/Transactions on every pandas version
# (the old rename of 'index'/'Month' only worked with the pandas < 2.0 value_counts() naming)
month = (
    df1['Month'].value_counts()
                .sort_index()
                .rename_axis('Month')
                .reset_index(name='Transactions')
)
ax = sns.barplot(x="Month", y="Transactions", data=month)
ax.set_title('Number of transactions by month')
plt.show()

## The number of transactions by period of day: Morning and Afternoon are the busiest.

In [None]:
# order the periods as they occur during the day
df1['period_day'] = pd.Categorical(df1['period_day'], categories=['morning', 'afternoon', 'evening', 'night'], ordered=True)

# rename_axis + reset_index(name=...) gives the columns period_day/Transactions on every pandas
# version (the old rename of 'index'/'period_day' only worked with pandas < 2.0)
period_day = (
    df1['period_day'].value_counts()
                     .sort_index()
                     .rename_axis('period_day')
                     .reset_index(name='Transactions')
)
ax = sns.barplot(x="period_day", y="Transactions", data=period_day)
ax.set_title('Number of transactions by period of day')
plt.show()

## The number of transactions by year: Sales in 2017 seem to have plunged compared to 2016.

In [None]:
# transactions per year; sort_index() keeps the bars in chronological order, and
# rename_axis + reset_index(name=...) gives the columns Year/Transactions on every pandas
# version (the old rename of 'index'/'Year' only worked with pandas < 2.0)
year = (
    df1['Year'].value_counts()
               .sort_index()
               .rename_axis('Year')
               .reset_index(name='Transactions')
)
ax = sns.barplot(x="Year", y="Transactions", data=year)
ax.set_title('Number of transactions by year')
plt.show()

## % change compared to previous year:

In [None]:
# select the rows of each year to form two new dataframes (Year holds strings such as '2016')
df_2016 = df1[df1['Year'] == '2016']
df_2017 = df1[df1['Year'] == '2017']

# number of transactions by month for each year.
# rename_axis + reset_index(name=...) names the columns explicitly, so the result does not
# depend on how the installed pandas version names the value_counts() output.
monthly_sales_2016 = df_2016['Month'].value_counts().rename_axis('Month').reset_index(name='2016')
monthly_sales_2017 = df_2017['Month'].value_counts().rename_axis('Month').reset_index(name='2017')

# merge sales data for both years; the key is spelled out instead of relying on
# "whatever columns happen to be shared"
monthly_sales = pd.merge(monthly_sales_2016, monthly_sales_2017, on='Month')

# sort by calendar month
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_sales['Month'] = pd.Categorical(monthly_sales['Month'], categories=month_order, ordered=True)
monthly_sales = monthly_sales.sort_values('Month').reset_index(drop=True)

# % change compared to previous year
# (a month with 0 transactions in 2016 divides by zero and shows up as 'inf%' or 'nan%')
print('% change compared to previous year:')
pct_change = (monthly_sales['2017'] - monthly_sales['2016']) / monthly_sales['2016'] * 100
monthly_sales['% Change'] = pct_change.round(2).astype(str) + '%'
monthly_sales

## Compare month-by-month for 2016 & 2017

In [None]:
# grouped bar chart: one 2016/2017 pair of bars per month
month_labels = monthly_sales['Month']
bar_width = 0.35  # width of each individual bar
positions = np.arange(len(month_labels))  # x position of each month's pair

fig, ax = plt.subplots()
# shift each year's bar half a width left/right so the pair is centred on its tick
rects1 = ax.bar(positions - bar_width/2, monthly_sales['2016'], bar_width, label='2016')
rects2 = ax.bar(positions + bar_width/2, monthly_sales['2017'], bar_width, label='2017')

# axis label, title, month names on the x ticks, and the year legend
ax.set_ylabel('No. of Transactions')
ax.set_title('No. of Transactions by Month')
ax.set_xticks(positions)
ax.set_xticklabels(month_labels)
ax.legend()

fig.tight_layout()
plt.show()

## What are the popular items?

In [None]:
# an item counts as "popular" when it was sold at least support * (number of transactions) times
total_transactions = len(df1)  # df1 has one row per transaction
support = 0.02
min_items = total_transactions*support

# sales count per item; rename_axis + reset_index(name=...) gives the columns Item/Count on
# every pandas version (the old rename of 'index'/'Item' only worked with pandas < 2.0)
itemset_1 = df['Item'].value_counts().rename_axis('Item').reset_index(name='Count')
itemset_1 = itemset_1[itemset_1['Count'] >= min_items]
itemset_1

## Visualise popular items on a barplot

In [None]:
# bar chart of the popular items and how often each was sold
sns.set_color_codes("pastel")
t = sns.barplot(x="Item", y="Count", data=itemset_1,
            label="Item", color="b")

# Attach the legend to this chart's own axes (`t`); `ax` at this point still refers to the
# axes of an earlier figure, so calling ax.legend() would modify the wrong plot.
t.legend(ncol=2, loc="lower right", frameon=True)
t.set_title('Popular items by number of sales')
sns.despine(left=True, bottom=True)
# item names are long, so turn the tick labels vertical
t.tick_params(axis='x', labelrotation=90)

## Frequent Pattern Mining via Apriori Algorithm

### Although only a small number of customers buy more than 1 item, it is still worthwhile to look for associations between items bought together in order to upsell. Another benefit is the possibility of creating a 'deal', say Coffee + Scone at a cheaper price, which would encourage customers to buy multiple items instead of just one.

#### Apriori Algorithm reference: https://www.geeksforgeeks.org/implementing-apriori-algorithm-in-python/

In [None]:
def hot_encode(x):
    """Map a purchased quantity to a 0/1 presence flag.

    Args:
        x: quantity of one item in one transaction.

    Returns:
        1 for any positive quantity, 0 otherwise (zero, negative or NaN).
        Every input gets an explicit flag; with the two separate range checks
        used before, a value strictly between 0 and 1 matched neither and
        returned None.
    """
    return 1 if x > 0 else 0

# the item columns start at position 9, after Transaction, Date, Time, Day, Month, Year,
# date_time, period_day and weekday_weekend
basket = df1.iloc[:, 9:]
# apply hot_encode to every cell; column-wise Series.map does the same element-wise work as
# DataFrame.applymap, which is deprecated since pandas 2.1
basket_encoded = basket.apply(lambda column: column.map(hot_encode))
basket_encoded.head(1)

In [None]:
# mine the itemsets whose support is at least 0.01, labelled by item name
frequent_items = apriori(basket_encoded, use_colnames=True, min_support=0.01)

# derive the association rules with lift >= 1 and list them from lowest to highest
# confidence (ties broken by lift)
rules = association_rules(frequent_items, min_threshold=1, metric="lift").sort_values(
    by=['confidence', 'lift'], ascending=[True, True]
)
print(rules)

In [None]:
# same lift >= 1 rule set, this time ordered from highest to lowest confidence
rules = association_rules(frequent_items, min_threshold=1, metric="lift").sort_values(
    by='confidence', ascending=False
)
rules

#### Drawing from the fact that only about 3% of customers buy more than 1 item, the associations between items purchased are not expected to be strong, as shown in the lift, support and confidence figures. However, they are still worth noting as opportunities to upsell and increase revenue.

## Visualisation on number of transactions over time

In [None]:
def _label_months(monthly_counts, year):
    """Return a new frame of monthly counts sorted Jan..Dec with year-tagged month labels.

    Args:
        monthly_counts: frame with a 'Month' column and a count column named after `year`.
        year: the year as a string, e.g. '2016'; also the name of the count column.

    Returns:
        A new DataFrame with columns 'Month' (e.g. 'Jan_2016') and 'Transactions'.
        The input frame is left untouched, so this cell can be re-run safely.
    """
    month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    labelled = monthly_counts.rename(columns={year: 'Transactions'})
    # sort by calendar month before the labels become plain strings
    labelled['Month'] = pd.Categorical(labelled['Month'], categories=month_order, ordered=True)
    labelled = labelled.sort_values('Month')
    labelled['Month'] = labelled['Month'].astype(str) + '_' + year
    return labelled


# stack 2016 on top of 2017 with a fresh 0..n-1 index
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
merged = pd.concat(
    [_label_months(monthly_sales_2016, '2016'), _label_months(monthly_sales_2017, '2017')],
    ignore_index=True,
)
merged

In [None]:
# bar chart of the monthly transaction counts for both years, in the order held by `merged`
sns.set_style("whitegrid")
s = sns.barplot(data=merged, x='Month', y='Transactions')
# the Month_Year labels are long, so draw them vertically
s.set_xticklabels(merged['Month'], rotation=90)

#### In 2016, business seemed to be pretty good and peaked at the end of the year. However, starting in Feb 2017, the number of transactions plunged to zero. There were some transactions in the subsequent months, but they never recovered to the 2016 level.