# <center>Online Retail Clustering</center>

### Agenda : 
We are required to cluster the customers using different attributes shared in the data
### Data used : 
We have used the data to define the frequency of purchase, recency of purchase and amount of purchase to cluster the customers

### This notebook has been created after multiple iterations:<br>
1. Iteration 1 : Implemented the K means algorithm using python
2. Iteration 2 : Created other data attributes which may help in Clustering
3. Iteration 3 : Scaled the data to improve the clustering technique
4. Iteration 4 : Lastly, realised there are anomalies and so implemented IQR to eliminate outliers


In [None]:
import os

# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Importing libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import datetime

## Reading dataframe

In [None]:
# Pulling invoice data
# Load the raw CSV from the Kaggle input directory into df_retail
df_retail = pd.read_csv("/kaggle/input/online-retail-customer-clustering/OnlineRetail.csv")

In [None]:
# Preview the first rows of the loaded data
df_retail.head()

## Null value treatment

In [None]:
# Summarise the columns (dtypes and non-null counts) to spot missing values
df_retail.info()

In [None]:
# Count missing CustomerID values (isna and isnull are reported separately)
customer_ids = df_retail['CustomerID']
print('Number of na in customer id =', customer_ids.isna().sum())
print('Number of null in customer id =', customer_ids.isnull().sum())

In [None]:
# Drop, in place, every row that has a missing value in ANY column
# (not only CustomerID), then re-check the non-null counts.
df_retail.dropna(inplace=True)
df_retail.info()

## Creating attributes for clustering

In [None]:
# First attribute : Total amount paid
# Line amount = quantity x unit price, then summed per customer
df_retail['Amount'] = df_retail['Quantity'].mul(df_retail['UnitPrice'])
amount_by_customer = df_retail.groupby('CustomerID', as_index=False)
df_amount = amount_by_customer['Amount'].sum()
df_amount.head()

In [None]:
# Second attribute : Purchase recency

# UDF to convert string to datetime
def convertDate(x):
    """Parse a 'dd-mm-YYYY HH:MM' string into a datetime.datetime.

    Raises ValueError (from strptime) when x does not match that format.
    """
    return datetime.datetime.strptime(x, '%d-%m-%Y %H:%M')

In [None]:
# UDF to split date and pick out recent purchase days 
def splitDate(y):
    """Return the whole number of days in a time difference.

    Parameters
    ----------
    y : timedelta-like or str
        An object exposing a ``days`` attribute (pandas ``Timedelta``,
        ``datetime.timedelta``), or anything whose string form starts with
        the day count (e.g. ``"12 days 00:00:00"``).

    Returns
    -------
    int
        The day component of ``y``.

    Raises
    ------
    ValueError
        If no integer day count can be extracted.
    """
    # Prefer the day component over parsing the string form: the string of
    # a sub-day datetime.timedelta ("5:00:00") has no leading day count.
    days = getattr(y, 'days', None)
    if days is not None:
        return int(days)
    # Fallback for plain strings / objects without a `days` attribute
    return int(str(y).split()[0])

In [None]:
# Parse each InvoiceDate string with convertDate, replacing the column
invoice_dates = df_retail['InvoiceDate'].map(convertDate)
df_retail['InvoiceDate'] = invoice_dates

In [None]:
# Use the latest invoice timestamp in the data as the reference point
max_date = df_retail['InvoiceDate'].max()
# Time elapsed between each invoice row and that reference point
df_retail['Recent_purchase_days'] = max_date - df_retail['InvoiceDate']

In [None]:
# Reduce each elapsed-time value to its day count
days_elapsed = df_retail['Recent_purchase_days'].map(splitDate)
df_retail['Recent_purchase_days'] = days_elapsed
# Recency per customer = smallest day count across their invoice rows
df_rec_purch = (
    df_retail
    .groupby('CustomerID', as_index=False)['Recent_purchase_days']
    .min()
)
df_rec_purch.head()

In [None]:
# Attribute : Frequency of purchase
# Counts the non-null InvoiceNo rows per customer and exposes the result as
# PurchaseCount. NOTE(review): this counts rows, not distinct invoice
# numbers - confirm that is the intended definition of frequency.
invoice_rows = df_retail.groupby('CustomerID', as_index=False)['InvoiceNo'].count()
df_purchase_freq = invoice_rows.rename(columns={'InvoiceNo': 'PurchaseCount'})
df_purchase_freq.head()

## Create Analytical data set for the clustering model

In [None]:
# Join the three per-customer attributes on CustomerID, keeping every
# customer present in df_amount (left joins).
df_amt_purch = pd.merge(df_amount, df_rec_purch, how='left', on='CustomerID')
df_model_inp = pd.merge(df_amt_purch, df_purchase_freq, how='left', on='CustomerID')
df_model_inp.head()

## Anomaly detection

In [None]:
# UDF to determine outliers in data for all the columns
def viewDistribution(df):
    """Draw one box plot per column of ``df`` to expose outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted side by side.

    Returns
    -------
    None
        The figure is rendered as a side effect.
    """
    sns.boxplot(data=df)
    # Rotate the column labels so long names do not overlap
    plt.xticks(rotation=90)
    # Call show() - referencing it without parentheses renders nothing
    plt.show()

In [None]:
# INSIGHTS: There are outliers in amount and frequency column
# Take the columns at positions 1-3 (the three attributes merged into
# df_model_inp), leaving out CustomerID at position 0.
df_anomaly = df_model_inp.iloc[:,1:4]
viewDistribution(df_anomaly)

In [None]:
# UDF to remove outlier
def remove_outlier_IQR(df):
    """Mask values lying outside the 1.5 x IQR fences of their column.

    The result has the same shape as ``df``: cells below
    ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` become NaN, every other
    cell is kept. No rows are dropped here.
    """
    q1, q3 = df.quantile(0.25), df.quantile(0.75)
    spread = q3 - q1
    lower_fence = q1 - 1.5 * spread
    upper_fence = q3 + 1.5 * spread
    # Keep a cell only when it is neither below nor above its column fences
    within_fences = ~(df < lower_fence) & ~(df > upper_fence)
    return df.where(within_fences)

In [None]:
# Removed the anomalies in all the columns
# remove_outlier_IQR blanks out-of-range cells to NaN. Drop the customers
# that have any such cell instead of zero-filling them: 0 is a meaningful
# value for amount, recency and frequency, so filling with it would create
# artificial points rather than remove the anomalies.
df_IQR = remove_outlier_IQR(df_anomaly).dropna()
viewDistribution(df_IQR)

## Performing scaling on the analytical data set

In [None]:
from sklearn.preprocessing import StandardScaler
# define standard scaler
scaler = StandardScaler()
# transform data (returns a plain array, column order unchanged)
df_scalar = scaler.fit_transform(df_IQR)

# Label the scaled columns in df_IQR's own order:
# Amount, Recent_purchase_days (recency), PurchaseCount (frequency).
# Keep df_IQR's index so scaled rows stay aligned with their customers.
df_scalar1 = pd.DataFrame(
    df_scalar,
    columns=['Amount', 'Recency', 'Frequency'],
    index=df_IQR.index,
)
df_scalar1.head()

## Mathematical implementation of K means clustering technique
1. Pick K points as the initial centroids from the data set, either randomly or the first K.<br>
2. Find the Euclidean distance of each point in the data set with the identified K points — cluster centroids.<br>
3. Assign each data point to the closest centroid using the distance found in the previous step.<br>
4. Find the new centroid by taking the average of the points in each cluster group.<br>
5. Repeat iteration till the centroids don’t change.<br>


In [None]:
# Number of clusters (and therefore of centroids)
k = 5

# Filtering the columns required for clustering
# Take an independent copy: a cluster-label column is added to `data`
# later, and writing into an iloc slice triggers SettingWithCopyWarning.
# NOTE(review): this clusters the unscaled df_IQR values; the scaled frame
# (df_scalar1) is not used here - confirm that is intended.
data = df_IQR.iloc[:,0:3].copy()
data.head()

In [None]:
# Initialise the centroids with k distinct rows sampled at random from
# the data (no seed is set, so the starting centroids differ between runs)
k_means = (data.sample(k, replace=False))    # current centroids
k_means2 = pd.DataFrame()                    # centroids of the previous pass (empty at start)
clusters = pd.DataFrame()                    # per-row distance to each centroid, filled in the loop

In [None]:
# Iterate assignment and centroid update until the centroids stop changing
feature_cols = list(k_means.columns)
while not k_means2.equals(k_means):
    # distance matrix (euclidean distance), one column per centroid.
    # Rebuilt on every pass so a centroid that disappeared (its cluster
    # became empty) cannot leave a stale distance column behind.
    clusters = pd.DataFrame(index=data.index)
    for cluster_id, (_, centroid) in enumerate(k_means.iterrows()):
        offsets = data[feature_cols] - centroid.to_numpy()
        clusters[cluster_id] = offsets.pow(2).sum(axis=1).pow(0.5)

    # update cluster: each row goes to its nearest centroid
    data['MDCluster'] = clusters.idxmin(axis=1)

    # store previous centroids, then recompute each centroid as the mean
    # of the feature columns of the rows assigned to it
    k_means2 = k_means
    k_means = data.groupby('MDCluster')[feature_cols].mean()

In [None]:
# Preview the data together with the assigned MDCluster label
data.head()

## Plotting clusters 

In [None]:
# Amount vs. recency, coloured by assigned cluster.
# Size the palette from the clusters actually present so the number of
# colours always matches the number of hue levels.
n_clusters = data['MDCluster'].nunique()
plt.figure(figsize=(15,5))
sns.scatterplot(
    x=data['Amount'],
    y=data['Recent_purchase_days'],
    hue=data['MDCluster'],
    palette=sns.color_palette('hls', n_clusters),
)
plt.show()

In [None]:
# Amount vs. purchase count, coloured by assigned cluster.
# Size the palette from the clusters actually present so the number of
# colours always matches the number of hue levels.
n_clusters = data['MDCluster'].nunique()
plt.figure(figsize=(15,5))
sns.scatterplot(
    x=data['Amount'],
    y=data['PurchaseCount'],
    hue=data['MDCluster'],
    palette=sns.color_palette('hls', n_clusters),
)
plt.show()

In [None]:
# Recency vs. purchase count, coloured by assigned cluster.
# Size the palette from the clusters actually present so the number of
# colours always matches the number of hue levels.
n_clusters = data['MDCluster'].nunique()
plt.figure(figsize=(15,5))
sns.scatterplot(
    x=data['Recent_purchase_days'],
    y=data['PurchaseCount'],
    hue=data['MDCluster'],
    palette=sns.color_palette('hls', n_clusters),
)
plt.show()