# **Steps involved in Clustering**
1. Importing Libraries
2. Data Exploration
3. Data Preparation
4. Outlier Detection
5. Clustering

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

# **Importing the .csv file**

In [None]:
# Location of the transactions CSV inside the Kaggle input mount.
csv_path = "/kaggle/input/online-retail-customer-clustering/OnlineRetail.csv"
df = pd.read_csv(csv_path)

# **DATA EXPLORATION**

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Column names, dtypes, and non-null counts per column.
df.info()

# **Checking for nulls**

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows per country, most frequent first.
df['Country'].value_counts()

In [None]:
# Number of distinct customer IDs (missing values are not counted).
df['CustomerID'].nunique()

In [None]:
# Number of distinct invoice numbers.
df['InvoiceNo'].nunique()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

# **Dropping the nulls**

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Row/column counts after removing incomplete rows.
df.shape

In [None]:
# Store customer IDs as strings so they are treated as labels, not numbers.
# NOTE(review): if the column was parsed as float, the resulting strings will
# look like "12345.0" — confirm this is acceptable.
df['CustomerID'] = df['CustomerID'].astype(str)

# **Calculating the amount spent by customers**

In [None]:
df['Amount'] = df['UnitPrice'] * df['Quantity']

In [None]:
df = df.reset_index()

In [None]:
df.head()

# **Getting the total amount for each customer "using the group by statement"**

In [None]:
df_amount = df.groupby('CustomerID')['Amount'].sum()

In [None]:
df = df.reset_index()

In [None]:
df_amount.head()

# **Calculating the number of transactions for each customer using count of invoice number**

In [None]:
df_frequency = df.groupby('CustomerID')['InvoiceNo'].count()

In [None]:
df_frequency.head()

In [None]:
df_f = pd.merge(df_amount,df_frequency ,how = 'inner', on='CustomerID')

In [None]:
df_f.head()

# **Calculating the recency of each customer**

In [None]:
# Parse InvoiceDate strings laid out as day-month-year hour:minute.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'],format='%d-%m-%Y %H:%M')

In [None]:
# Latest invoice timestamp in the data; used as the reference point below.
df_max = df['InvoiceDate'].max()
df_max

In [None]:
# Time elapsed between each row's invoice and the latest invoice in the data.
df['diff'] = df_max - df['InvoiceDate']

In [None]:
df.head()

In [None]:
df_diff= df.groupby('CustomerID')['diff'].min()

In [None]:
df_diff.reset_index()
df_diff.head()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# **Merging to get the final dataset**

In [None]:
df_final = pd.merge(df_f, df_diff , how = 'inner' , on = 'CustomerID')

In [None]:
df_final.head()

In [None]:
df_final.info()

In [None]:
df_final = df_final.rename({'InvoiceNo': 'Frequency', 'diff': 'Recency'}, axis=1)

In [None]:
df_final.head()

In [None]:
df_final['Recency'] = df_final['Recency'].dt.days

In [None]:
df_final.head()

In [None]:
attributes = ['Amount' , 'Frequency' , 'Recency' ]

# **Checking data for outliers**

In [None]:
import seaborn as sns
# Side-by-side box plots of the three features to eyeball extreme values.
feature_frame = df_final[attributes]
ax = sns.boxplot(data=feature_frame)

In [None]:
# Sum, across columns, of the number of distinct values in each column.
df_final.nunique().sum()

In [None]:
# Shape before outlier removal.
df_final.shape

# **Removing the outliers**

In [None]:
def remove_outliers(frame, column, lower_q=0.05, upper_q=0.95, whisker=1.5):
    """Return the rows of ``frame`` whose ``column`` value lies inside the fences.

    The fences are ``low - whisker * spread`` and ``high + whisker * spread``,
    where ``low`` and ``high`` are the ``lower_q`` and ``upper_q`` quantiles of
    the column and ``spread = high - low``. Both fences are inclusive.

    Note that with the default 5th/95th percentiles this is a much wider band
    than the classic quartile-based (25th/75th) IQR rule.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter.
    column : str
        Name of the numeric column to test.
    lower_q, upper_q : float
        Quantiles that define the reference band.
    whisker : float
        Multiple of the band width allowed beyond each quantile.

    Returns
    -------
    pandas.DataFrame
        The subset of ``frame`` within the fences; ``frame`` is not modified.
    """
    low = frame[column].quantile(lower_q)
    high = frame[column].quantile(upper_q)
    spread = high - low
    # between() is inclusive on both ends, matching the original >= / <= tests.
    keep = frame[column].between(low - whisker * spread, high + whisker * spread)
    return frame[keep]


# Filter one feature at a time. The order matters because each pass computes
# its quantiles on the rows that survived the previous pass.
for column in ['Amount', 'Recency', 'Frequency']:
    df_final = remove_outliers(df_final, column)

In [None]:
# Shape after outlier removal.
df_final.shape

In [None]:
df_final.head()

# **Scaling the data to get each column on a similar scale**

In [None]:
from sklearn.preprocessing import StandardScaler
# define standard scaler
scaler = StandardScaler()
# transform data
df1 = scaler.fit_transform(df_final)

In [None]:
df1.shape

In [None]:
import pandas as pd
df_f = pd.DataFrame(df1)
df_f.columns = ['Amount' , 'Frequency' , 'Recency']
df_f.head

In [None]:
df_f

# **Performing K-Means Clustering**

In [None]:

# defining the kmeans function with initialization as k-means++
kmeans = KMeans(n_clusters=2, init='k-means++')

# fitting the k means algorithm on scaled data
kmeans.fit(df_f)

In [None]:
# inertia on the fitted data
kmeans.inertia_

# **Checking the ideal number of clusters using an elbow plot**

In [None]:
# Fit k-means for k = 1..19 and record the inertia of each fit.
# n_jobs is no longer passed: the parameter was deprecated in scikit-learn 0.23
# and removed in 1.0, where supplying it raises TypeError.
# random_state / n_init make the curve reproducible from run to run.
cluster_range = range(1, 20)
SSE = []
for cluster in cluster_range:
    kmeans = KMeans(n_clusters=cluster, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(df_f)
    SSE.append(kmeans.inertia_)

# Collect the results in a DataFrame and plot inertia against k; the "elbow"
# where the curve flattens suggests a reasonable number of clusters.
frame = pd.DataFrame({'Cluster': cluster_range, 'SSE': SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xticks(list(cluster_range))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Inertia by number of clusters')
plt.show()

In [None]:
# Final model: k-means with 4 clusters and k-means++ initialisation.
# n_jobs is omitted because it was removed from KMeans in scikit-learn 1.0;
# random_state / n_init make the cluster assignment reproducible.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
kmeans.fit(df_f)
# Cluster index for every row of df_f, in row order.
pred = kmeans.predict(df_f)

In [None]:
# assign() returns a new frame, so the label column cannot leak back into
# df_f (the model input). pd.DataFrame(df_f) does not guarantee an independent
# copy and, depending on the pandas version, may share data with df_f.
frame = df_f.assign(cluster=pred)
# Number of customers in each cluster.
frame['cluster'].value_counts()

In [None]:
import matplotlib.pyplot as plt

In [None]:
pred

In [None]:
df_final['Label'] = pred

# 

# **Plotting the various clusters taking 2 features at a time**

In [None]:
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises TypeError there.
ax = sns.scatterplot(data=df_final, x='Amount', y='Recency', hue='Label',
                     palette=sns.color_palette('hls', 4))
ax.set_title('Amount vs Recency by cluster');

In [None]:
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises TypeError there.
ax = sns.scatterplot(data=df_final, x='Amount', y='Frequency', hue='Label',
                     palette=sns.color_palette('hls', 4))
ax.set_title('Amount vs Frequency by cluster');

In [None]:
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises TypeError there.
ax = sns.scatterplot(data=df_final, x='Frequency', y='Recency', hue='Label',
                     palette=sns.color_palette('hls', 4))
ax.set_title('Frequency vs Recency by cluster');

# **Plotting Clusters taking all features at a time**

In [None]:
# seaborn's scatterplot is two-dimensional, so a third positional Series is
# never a z axis: older releases bind it to `hue` (clashing with the hue=
# keyword), newer ones reject positional x/y outright. A matplotlib 3-D axes
# shows all three features at once, coloured by cluster label.
fig = plt.figure(figsize=(10, 8))
ax3d = fig.add_subplot(projection='3d')
points = ax3d.scatter(
    df_final['Amount'],
    df_final['Frequency'],
    df_final['Recency'],
    c=df_final['Label'],
    cmap='viridis',
)
ax3d.set_xlabel('Amount')
ax3d.set_ylabel('Frequency')
ax3d.set_zlabel('Recency')
ax3d.set_title('Clusters across Amount, Frequency and Recency')
# One legend entry per distinct cluster label.
ax3d.legend(*points.legend_elements(), title='Label')
plt.show()