

##  Market Basket Analysis Using the Apriori Algorithm

### Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the raw transactions from market.csv, decoding the file as ISO-8859-1 (Latin-1).
data = pd.read_csv('market.csv', encoding='ISO-8859-1')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.columns.values

In [None]:
data.info()

In [None]:
data.describe()

### Data Cleaning

In [None]:
# Store InvoiceNo as text so the .str accessor can be used on it in later cells.
data['InvoiceNo'] = data['InvoiceNo'].astype(str)

In [None]:
data.sort_values('InvoiceNo',ascending=False)

##### Invoice numbers that start with "C" are cancelled transactions. We remove them because we only want successful transactions.

In [None]:
# Cancelled invoices are identified by a leading 'C' in InvoiceNo; keep every other row.
# startswith() tests the prefix literally, whereas str.contains('C') would match a
# 'C' anywhere in the value and interprets its argument as a regular expression.
data = data[~data['InvoiceNo'].str.startswith('C')]

In [None]:
data.shape

In [None]:
def missingvalue(data):
    """Summarise missing values per column of a DataFrame.

    Args:
        data: DataFrame to inspect.

    Returns:
        DataFrame with one row per column of ``data`` and the columns
        ``variable`` (column name), ``number_of_missing`` (null count) and
        ``percentage_of_missing`` (null count as a percentage of the row
        count), ordered from the highest percentage to the lowest with a
        fresh 0..n-1 index.
    """
    null_counts = data.isnull().sum(axis=0)
    summary = null_counts.rename_axis('variable').reset_index(name='number_of_missing')

    row_count = data.shape[0]
    summary['percentage_of_missing'] = summary['number_of_missing'] / row_count * 100

    ranked = summary.sort_values('percentage_of_missing', ascending=False)
    return ranked.reset_index(drop=True)
missingvalue(data)

##### CustomerID and Description contain missing values. Because the dataset is large, we can drop those rows without causing problems.

In [None]:
# Keep only rows where both CustomerID and Description are present.
data=data.dropna(subset=['CustomerID','Description'])

In [None]:
missingvalue(data)

In [None]:
# Strip leading and trailing whitespace from the product descriptions.
data['Description'] = data['Description'].str.strip()

In [None]:
data.shape

In [None]:
# Print the column summary, then store CustomerID as an integer column.
data.info()
data['CustomerID'] = data['CustomerID'].astype(int)

In [None]:
data.describe()

## EDA(Exploratory Data Analysis)

##### Analyze the sales performance of this company,and providing insights regarding the same

In [None]:
# Total quantity of items purchased per country, largest first.
# Note: despite the variable name, this sums Quantity (units), not a monetary amount.
country_price = data.groupby('Country')['Quantity'].sum().sort_values(ascending = False)
country_price

In [None]:
# Bar chart of the five countries with the largest total purchased quantity.
country_price.head(5).plot(kind='bar')

In [None]:
# Bar chart of the five countries with the smallest total purchased quantity.
# country_price is sorted in descending order, so the last five rows are the
# lowest; tail(5) works for any number of countries, unlike the fixed offset [33:].
country_price.tail(5).plot(kind='bar')

#### What products do customers buy the most?

In [None]:
# Horizontal count plot of the most frequently occurring product descriptions.
# The title is derived from top_n so it always matches the number of bars drawn
# (it previously read "Top 5" while ten products were plotted).
top_n = 10
top_products = data["Description"].value_counts().iloc[:top_n].index

plt.figure(figsize=(14, 7))
ax = sns.countplot(y=data['Description'], order=top_products, palette="GnBu_d")
ax.set_title(f"Top {top_n} Frequently Purchased Products", size=20, pad=15)
ax.set_xlabel("Count", size=15)
ax.set_ylabel("Product", size=15)
ax.xaxis.set_tick_params(labelsize=11)
ax.yaxis.set_tick_params(labelsize=11)
plt.show()

#### How many orders(per month)

In [None]:

# Make sure InvoiceDate is a datetime column before using the .dt accessor.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

# Distinct invoices per calendar month number. The year is not part of the
# grouping key, so the same month from different years is pooled together.
order_month = data['InvoiceNo'].groupby(data['InvoiceDate'].dt.month).nunique()

# Bar chart: month number on the x axis, number of distinct invoices on the y axis.
plt.figure(figsize=(8, 4))
ax = sns.barplot(x=order_month.index, y=order_month.values, palette="Set2")
ax.set_xlabel('Month')
ax.set_ylabel('Number of Orders')
plt.show()


#### How many orders (per day of the week)

In [None]:

# Make sure InvoiceDate is a datetime column before using the .dt accessor.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

# Distinct invoices per weekday number (0 = Monday ... 6 = Sunday).
order_day = data['InvoiceNo'].groupby(data['InvoiceDate'].dt.dayofweek).nunique()

# Replace the weekday numbers in the index with their English names.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
order_day.index = list(map(day_names.__getitem__, order_day.index))

# Bar chart: weekday on the x axis, number of distinct invoices on the y axis.
plt.figure(figsize=(8, 4))
ax = sns.barplot(x=order_day.index, y=order_day.values, palette="Set2")
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Orders')
ax.tick_params(axis='both', labelsize=11)
plt.show()


#### How many orders(per hour)

In [None]:
# Distinct invoices for each hour of the day.
order_hour = data['InvoiceNo'].groupby(data['InvoiceDate'].dt.hour).nunique()

# After reset_index() the hour is in column 'InvoiceDate' and the count in 'InvoiceNo'.
plt.figure(figsize=(8, 4))
ax = sns.barplot(
    data=order_hour.reset_index(),
    x='InvoiceDate',
    y='InvoiceNo',
    palette="colorblind",
)
ax.set_title('Number of Transactions Occured Each Hour', size=20)
ax.set_xlabel('Hour', size=14)
ax.set_ylabel('Number of Transaction', size=14)
ax.tick_params(axis='both', labelsize=11)
plt.show()

In [None]:
# Add a 'Year' column holding the calendar year of each invoice date.

# Year component of InvoiceDate (requires InvoiceDate to be a datetime column).
timest = data['InvoiceDate'].dt.year

data['Year'] = timest

# Preview the frame with the new column.
data.head()

In [None]:
# Line total for each row: quantity multiplied by unit price.
TotalAmount = data['Quantity'] * data['UnitPrice']
# Insert it as the column at position 5 (0-based) rather than appending it at the end.
data.insert(loc=5,column='TotalAmount',value=TotalAmount)

In [None]:
# Total spend per customer.
# The aggregation is named by the string 'sum' instead of passing np.sum:
# recent pandas versions emit a FutureWarning for NumPy callables here and
# recommend the string alias to keep the current behaviour.
price_cust = pd.pivot_table(data, index='CustomerID', values='TotalAmount', aggfunc='sum')
print('The following are 5 customers who spend the most money on Online Retail:')
price_cust.sort_values('TotalAmount', ascending=False)[:5]

In [None]:
# Number of distinct invoices per customer.
order_cust = pd.pivot_table(
    data,
    values='InvoiceNo',
    index='CustomerID',
    aggfunc=pd.Series.nunique,
)
print('The following are the 5 customers who most frequently shop at Online Retail:')
# Show the five customers with the most distinct invoices.
order_cust.sort_values('InvoiceNo', ascending=False).head(5)

In [None]:
# Bar chart of the summed TotalAmount for each value of the 'Year' column.
data.groupby('Year')['TotalAmount'].sum().plot(kind = 'bar')

In [None]:
# Monthly sales totals for every year present in the data (not only 2011).
# 'Mon' is the month number and 'month' the month name of each invoice date.
data['Mon'] = data['InvoiceDate'].dt.month
data['month'] = data['InvoiceDate'].dt.month_name()
# Group by (Year, Mon) rather than (Mon, Year) so the bars appear in
# chronological order when the data spans more than one calendar year.
data.groupby(['Year', 'Mon'])['TotalAmount'].sum().plot(kind='bar', title='Sales month wise')

In [None]:
# Earliest and latest invoice timestamps.
# Series.min()/max() are vectorised and skip missing timestamps (NaT), whereas
# the Python built-ins iterate the column element by element.
print('Min: {}\nMax: {}'.format(data['InvoiceDate'].min(), data['InvoiceDate'].max()))

In [None]:
# Number of distinct customers per country, largest first.
# nunique() counts each CustomerID once per country; count() would count every
# invoice line and therefore rank countries by row volume, not by customers.
cus_id = pd.DataFrame(data.groupby('Country')['CustomerID'].nunique().sort_values(ascending=False))
cus_id[:5].plot(kind='bar', title='Most Customers for country')

In [None]:
# Bar chart of the last five rows of cus_id, i.e. the countries at the bottom of the ranking.
cus_id.tail(5).plot(kind='bar', title='Least customers for country')

#### Support
Support is the fraction of all transactions that contain the itemset.

#### Confidence
Confidence measures how strong an association rule X → Y is: the fraction of transactions containing item X that also contain item Y.



#### Lift
Lift of the rule is defined as the ratio of observed support to the support expected in the case the elements of the rule were independent. Lift values > 1 are generally more “interesting” and could be indicative of a useful rule pattern.

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Five countries with the most rows in the data.
print('Top 5 Country in Online Retail:')
data['Country'].value_counts().head(5)

In [None]:
# Restrict the remaining analysis to rows whose Country is 'United Kingdom'.
data = data.loc[data['Country'].eq('United Kingdom')]
data.head()

In [None]:
# Invoice x product matrix: one row per InvoiceNo, one column per Description,
# each cell holding the summed Quantity (0 where the product is not on the invoice).
basket = (
    data.groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)
basket.head()

In [None]:
def encode_units(x):
    """Binarise a quantity: 1 if at least one unit was bought, otherwise 0.

    Args:
        x: Numeric quantity for a single invoice/product cell.

    Returns:
        int: 1 when ``x >= 1``, else 0. Inputs that satisfied neither branch
        of the earlier two-``if`` version (fractions between 0 and 1, NaN)
        now yield 0 instead of an implicit ``None``.
    """
    return 1 if x >= 1 else 0
# Binarise every cell of the invoice x product matrix with encode_units.
basket_sets = basket.applymap(encode_units)
# Exclude the POSTAGE column (presumably a shipping charge rather than a
# product - confirm). errors='ignore' keeps the cell from raising KeyError
# when the filtered data contains no POSTAGE column.
basket_sets = basket_sets.drop(columns='POSTAGE', errors='ignore')
basket_sets.head()

In [None]:
# Mine frequent itemsets with mlxtend's apriori: min_support=0.02 keeps itemsets
# found in at least 2% of the rows of basket_sets; use_colnames=True reports
# items by column name instead of column index.
frequent_itemsets = apriori(basket_sets, min_support = 0.02, use_colnames= True)

In [None]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric with a minimum threshold of 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Display the rules ordered by support, highest first.
rules.sort_values('support', ascending=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Scatter plot of rule support (x) against confidence (y); marker size encodes lift.
sns.scatterplot(data=rules, x="support", y="confidence", size="lift")
plt.show()

In [None]:
# Rules with lift of at least 6 and confidence of at least 0.8.
rules.loc[rules['lift'].ge(6) & rules['confidence'].ge(0.8)]

In [None]:
# Total quantity (summed over all invoices in basket) for the two alarm-clock products.
for product in ('ALARM CLOCK BAKELIKE GREEN', 'ALARM CLOCK BAKELIKE RED'):
    print(f"{product} : ", basket[product].sum())

#### Conclusion Market Basket Analysis
From the market basket analysis, we find that customers who buy PINK REGENCY TEACUP AND SAUCER also tend to buy GREEN REGENCY TEACUP AND SAUCER. The three most frequent associations with a lift of at least 6 and a confidence of at least 0.8 all involve teacup-and-saucer items. Perhaps we could offer more color variants of the teacup.