In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;">Initial Data Analysis</h1>
</div>


In [None]:
### Loading the Online Retail workbook into a DataFrame

data = pd.read_excel(io='/kaggle/input/online-retail/Online Retail.xlsx')

In [None]:
data

<div style="background-color:#bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

**This DataFrame contains 8 variables that correspond to:**

- **InvoiceNo**  
  **Type:** Nominal  
  **Description:** Invoice number. A 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.

- **StockCode**  
  **Type:** Nominal  
  **Description:** Product (item) code. A 5-digit integral number uniquely assigned to each distinct product.

- **Description**  
  **Type:** Nominal  
  **Description:** Product (item) name.

- **Quantity**  
  **Type:** Numeric  
  **Description:** The quantities of each product (item) per transaction.

- **InvoiceDate**  
  **Type:** Numeric  
  **Description:** Invoice date and time. The day and time when each transaction was generated.

- **UnitPrice**  
  **Type:** Numeric  
  **Description:** Unit price. Product price per unit in sterling.

- **CustomerID**  
  **Type:** Nominal  
  **Description:** Customer number. A 5-digit integral number uniquely assigned to each customer.

- **Country**  
  **Type:** Nominal  
  **Description:** Country name. The name of the country where each customer resides.

</div>


In [None]:
### Understanding the structure of dataset

data.info()

In [None]:
### Percentage of null records in description column
# Derived from the DataFrame itself instead of hard-coded row counts, so the
# figure stays correct if the input file changes.

desc_null_pct = float(data['Description'].isna().mean() * 100)
print("Percentage of null records in description column",round(desc_null_pct,2),"%")

In [None]:
### Percentage of null records in Customer ID column
# Derived from the DataFrame itself instead of hard-coded row counts, so the
# figure stays correct if the input file changes.

customer_null_pct = float(data['CustomerID'].isna().mean() * 100)
print("Percentage of null records in CustomerID column",round(customer_null_pct,2),"%")

<div style="background-color: #bbf2ef; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Initial Inferences From Data:

* Dataset contains 541,909 entries and 8 columns
* **Invoice No:** 1 invoice number can have multiple products purchased
* **Description:** ~0.27% of the Description values are null
* **Customer ID:** ~25% of Customer IDs are nulls

</div>


In [None]:
### Descriptive Statistics

data.describe().T

In [None]:
### Summary statistics for categorical variables

data.describe(include='object').T

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

#### Quantity
* **Average Product Quantity per transaction:** 9.55
* **Negative Values:** Indicates cancelled orders.
* **High Standard Deviation (218.08):** Data is widely spread out.
* **Outliers:** Presence of outliers due to a significant gap between the 75th percentile and maximum value.

#### UnitPrice
* **Average Unit Price:** 4.61
* **Outliers:** Presence of outliers due to a significant gap between the 75th percentile and maximum value.

#### CustomerID
* **Missing Values:** 135,080 missing values (only 406,829 of the 541,909 rows have a CustomerID).

#### StockCode
* **Unique Stock Codes:** 4,070 unique stock codes.

#### Description
* **Unique Descriptions:** 4,223 unique descriptions.
* **Most Frequent Description:** "White hanging heart t-light holder" (2,369 times).
* **Missing Values:** Few missing values.

#### Country
* **Data Coverage:** Data from 38 different countries is available.

</div>


<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;">Data Cleaning and Transformation</h1>
</div>

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;">Checking for missing values and removing them</h1>
</div>


In [None]:
### Counting nulls per column, and the share (in %) for columns that have any

missing_data = data.isna().sum()
missing_per = missing_data[missing_data.gt(0)].div(data.shape[0]).mul(100)
missing_data

In [None]:
### Percentage of missing values, rounded to two decimals

missing_per.round(2)

In [None]:
### Listing the distinct descriptions recorded under each stock code
### (more than one entry means the code does not map to a single description)

descriptions_by_code = data.groupby(by='StockCode')['Description']
data_stockcodes = descriptions_by_code.unique()
data_stockcodes

<div style="background-color: #bbf2ef; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

#### Customer ID
* ~ 25% of the customer IDs are missing.
* Imputing data won't help as clustering is based on customer behavior and preferences. Accurate data is required.
* Removing them might be a good choice.

#### Description
* 0.27% of descriptions are missing.
* Each stock code does not correspond to a unique description.
* Removing this column might be a good choice as well.

</div>


In [None]:
### Keeping only the rows where both 'CustomerID' and 'Description' are present

required_columns = ['CustomerID', 'Description']
data = data[data[required_columns].notna().all(axis=1)]
data

In [None]:
data.isnull().sum()

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;">Handling Duplicate Values</h1>
</div>


In [None]:
### Checking for duplicate entries

data.duplicated().sum()

In [None]:
### Dropping fully duplicated rows (the first occurrence is kept)

data = data.drop_duplicates()

In [None]:
data

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> 'Invoice number' having 'C' as prefix signifies cancelled transactions </h1>
</div>


In [None]:
import pandas as pd
import re

### Creating a regular expression pattern
# Two alternatives: (1) one or more letters followed by digits, anchored at both ends
# (e.g. a 'C'-prefixed invoice number), or (2) digits followed by trailing letters.
pattern = r'^[a-zA-Z]+\d+$|\d+[a-zA-Z]+$'

In [None]:
### Selecting the rows whose invoice number matches the pattern
### (values that are not strings count as non-matching)
matches_pattern = data['InvoiceNo'].str.match(pattern, na=False)
cancelled_df = data[matches_pattern]
cancelled_df

In [None]:
cancelled_df[['Quantity', 'UnitPrice']].describe()

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* Unit prices have a high variance factor. 
* Keeping these records might help in clustering as this would efficiently capture the segment of customers that cancel their orders often
* Recommendation system will not recommend these products

</div>


In [None]:
### Labelling every row as 'Cancelled' or 'Completed' based on the invoice prefix

data['Transaction_Status'] = np.where(
    data['InvoiceNo'].astype(str).str.startswith('C'),
    'Cancelled',
    'Completed',
)
data

In [None]:
data[data['Transaction_Status']=="Cancelled"]

In [None]:
### Share of rows that belong to cancelled transactions

n_cancelled = data['Transaction_Status'].eq('Cancelled').sum()
cancelled_percent = round(n_cancelled / data.shape[0] * 100, 2)
print("The percentage of cancelled transactions is: ",cancelled_percent,"%")

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Analysing the Stockcode Column  </h1>
</div>


In [None]:
### Unique Stock Codes

stock_codes_distinct=data['StockCode'].nunique()
stock_codes_distinct

In [None]:
### Value counts for each Stock Code

data['StockCode'].value_counts().head(10)

In [None]:
### Plotting the top 10 stock codes sold

import matplotlib.pyplot as plt

# The ten stock codes that appear on the most rows
top_stock_codes = data['StockCode'].value_counts().iloc[:10]

# Label the chart through the Axes object that pandas returns
ax = top_stock_codes.plot.bar(figsize=(10, 6))
ax.set_title('Top 10 Stock Codes')
ax.set_xlabel('Stock Code')
ax.set_ylabel('Frequency')
plt.xticks(rotation=0, fontsize=10)
plt.show()


<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* Not all stock codes are numeric 
* Need to dive further to understand how many non numeric stock codes exist

</div>

In [None]:
data.dtypes

In [None]:
### Distribution of the digit count across the unique stock codes

unique_stock_codes = data['StockCode'].unique()
digits_per_code = pd.Series(unique_stock_codes).map(
    lambda code: sum(map(str.isdigit, str(code)))
)
numeric_char_counts_in_unique_codes = digits_per_code.value_counts()
numeric_char_counts_in_unique_codes

In [None]:
unique_stock_codes

In [None]:
### Collecting and printing the stock codes that contain at most one digit

anomalous_stock_codes = []
for candidate in unique_stock_codes:
    digit_total = sum(1 for ch in str(candidate) if ch.isdigit())
    if digit_total <= 1:
        anomalous_stock_codes.append(candidate)

print("Anomalous stock codes:")
for code in anomalous_stock_codes:
    print(code)

In [None]:
### Displaying the list of anomalous stock codes

anomalous_stock_codes

In [None]:
### Rows whose stock code is one of the anomalous codes

is_anomalous_code = data['StockCode'].isin(anomalous_stock_codes)
anomalous_data = data[is_anomalous_code]

In [None]:
### Calculate the sum of records

total_anomalous_records = len(anomalous_data)

In [None]:
### Share of rows that carry an anomalous stock code

anomalous_share = total_anomalous_records / data.shape[0]
per_anomalous_codes = round(anomalous_share * 100, 2)
print("Percentage of anomalous codes in the dataframe:", per_anomalous_codes,"%")

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* Majority of the codes have 5 digits
* The 7 anomalous codes appear in only 0.48% of the records and hence can be removed


</div>

In [None]:
anomalous_data

In [None]:
### Keeping only the rows whose stock code is not anomalous

has_normal_code = ~data['StockCode'].isin(anomalous_stock_codes)
data = data[has_normal_code]
data

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Analysing the Description Column </h1>
</div>


In [None]:
### The 20 product descriptions that appear on the most rows

top_product_desc = data['Description'].value_counts().head(20).sort_values(ascending=False)

# Label the chart through the Axes object that pandas returns
ax = top_product_desc.plot.barh(figsize=(10, 6))
ax.set_title('Top 20 Product Descriptions')
ax.set_xlabel('Frequency')
ax.set_ylabel('Product Description')
plt.show()


In [None]:
### Listing the unique descriptions that contain at least one lowercase character

lowercase_descriptions = [
    text for text in data['Description'].unique()
    if any(map(str.islower, text))
]
for desc in lowercase_descriptions:
    print(desc)

In [None]:
### Rows whose description is exactly "Next Day Carriage" or "High Resolution Image"

service_labels = ['Next Day Carriage', 'High Resolution Image']
filtered_data = data[data['Description'].isin(service_labels)]
filtered_data

In [None]:
### Measuring how large a share of the rows carries these non-product descriptions

percent_anom_pd = round(len(filtered_data) / len(data) * 100, 2)
print("Percentage of instances with 'Next Day Carriage' or 'High resolution Image' are",percent_anom_pd,"%")

In [None]:
filtered_data['Description'].unique()

In [None]:
### Dropping the rows whose description is one of the non-product labels in filtered_data

non_product_labels = filtered_data['Description'].unique()
data = data[~data['Description'].isin(non_product_labels)]
data

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Analysing the Unit Price Column </h1>
</div>

In [None]:
data['UnitPrice'].describe()

In [None]:
data[data['UnitPrice']==0].describe()[['Quantity']]

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* Unit Price=0 in 33 records (Might be an error)
* Deep dive: Qty is 12540 where Unit Price is 0. Doesn't seem right
* Removing these records should be a better approach to reduce noise

</div>

In [None]:
### Keeping only the rows with a non-zero unit price

has_price = data['UnitPrice'].ne(0)
data = data[has_price]
data

In [None]:
### Renumbering the rows from 0 after the filtering steps (the old index is discarded)

data = data.reset_index(drop=True)

In [None]:
data

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Feature Engineering</h1>
</div>

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Recency (Days Since Last Purchase) </h1>
</div>

In [None]:
data.dtypes

In [None]:
### Extracting date

# .dt.date keeps only the calendar date of each InvoiceDate timestamp (the time of day is dropped)
data['InvoiceDay'] = data['InvoiceDate'].dt.date

In [None]:
### Starting the customer-level feature table with each customer's latest purchase day

latest_day_per_customer = data.groupby(by='CustomerID')['InvoiceDay'].max()
customer_data = latest_day_per_customer.reset_index()
customer_data

In [None]:
### Extracting the most recent date of purchase and subtracting it 

most_recent_dateofpurchase=data['InvoiceDay'].max()

In [None]:
### Bringing the reference date and the per-customer dates to pandas datetime

most_recent_dateofpurchase = pd.to_datetime(most_recent_dateofpurchase)
customer_data['InvoiceDay'] = pd.to_datetime(customer_data['InvoiceDay'])

In [None]:
### Recency: whole days between the latest purchase in the data and each customer's last purchase

time_since_last_purchase = most_recent_dateofpurchase - customer_data['InvoiceDay']
customer_data['Days_Since_Last_Purchase'] = time_since_last_purchase.dt.days
customer_data

In [None]:
### Dropping the helper date column now that recency has been derived from it

customer_data = customer_data.drop(columns=['InvoiceDay'])
customer_data

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Frequency (Total Transaction And Total Products Purchased) </h1>
</div>

In [None]:
data['CustomerID'].nunique()

In [None]:
### Number of distinct invoices per customer

transactions_df = (
    data.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .reset_index(name='Total_Transactions')
)
transactions_df

In [None]:
### Summed quantity across all rows of each customer

products_purchased_df = (
    data.groupby('CustomerID')['Quantity']
    .sum()
    .reset_index(name='Total_Products_Purchased')
)
products_purchased_df

In [None]:
### Attaching both frequency features to customer_data (joined on CustomerID)

customer_data = (
    customer_data
    .merge(transactions_df, on='CustomerID')
    .merge(products_purchased_df, on='CustomerID')
)
customer_data

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Monetary (Total Spend, Average Order Value) </h1>
</div>

In [None]:
### Row-level spend (unit price x quantity), then summed per customer

data['TotalSpend'] = data['UnitPrice'].mul(data['Quantity'])
spend_per_customer = data.groupby('CustomerID')['TotalSpend'].sum()
total_spend_df = spend_per_customer.reset_index()
total_spend_df

In [None]:
### Attaching total spend to customer_data (joined on CustomerID)

customer_data = customer_data.merge(total_spend_df, on='CustomerID')

In [None]:
customer_data

In [None]:
### Average order value: total spend divided by the number of transactions

customer_data['AOV'] = customer_data['TotalSpend'].div(customer_data['Total_Transactions'])
customer_data

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Product Diversity (Unique Products Purchased) </h1>
</div>

In [None]:
### Number of distinct stock codes bought by each customer

unique_products_purchased = (
    data.groupby('CustomerID')['StockCode']
    .nunique()
    .reset_index(name='Unique_Products_Purchased')
)

In [None]:
### Attaching product diversity to customer_data (joined on CustomerID)

customer_data = customer_data.merge(unique_products_purchased, on='CustomerID')

In [None]:
### Displaying the first few rows of the customer_data dataframe

customer_data.head()

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Behavioral Features(Average Days Between Purchase, Favourite Shopping Day, Favourite Shopping Hour) </h1>
</div>

In [None]:
data

In [None]:
data.dtypes

In [None]:
### Deriving the weekday number and the hour of day from the invoice timestamp

invoice_timestamp = data['InvoiceDate'].dt
data['Day_Of_Week'] = invoice_timestamp.weekday
data['Hour'] = invoice_timestamp.hour

In [None]:
### Calculating the average number of days between consecutive purchases

# For each customer, take the row-to-row differences of InvoiceDay (in the frame's current
# row order) and convert each difference to a whole number of days.
# NOTE(review): the diff runs over every invoice line, so lines sharing a day contribute
# 0-day gaps, and it presumably relies on `data` already being in chronological order — confirm.
# NOTE(review): a customer with a single row produces no differences and is presumably
# absent from the result — verify before relying on the later merge.
days_between_purchases = data.groupby('CustomerID')['InvoiceDay'].apply(lambda x: (x.diff().dropna()).apply(lambda y: y.days))
# Average the gaps per customer and turn CustomerID back into a regular column
average_days_between_purchases = days_between_purchases.groupby('CustomerID').mean().reset_index()

In [None]:
### Naming the new feature and attaching it to customer_data
### (default inner join: only customers present in both frames are kept)

average_days_between_purchases = average_days_between_purchases.rename(
    columns={'InvoiceDay': 'Average_Days_Between_Purchases'}
)
customer_data = customer_data.merge(average_days_between_purchases, on='CustomerID')

In [None]:
### Favourite shopping day: the weekday with the most rows for each customer

favorite_shopping_day = (
    data.groupby(['CustomerID', 'Day_Of_Week'])
    .size()
    .reset_index(name='Count')
)
# Row label of the highest count within each customer
busiest_day_rows = favorite_shopping_day.groupby('CustomerID')['Count'].idxmax()
favorite_shopping_day = favorite_shopping_day.loc[busiest_day_rows, ['CustomerID', 'Day_Of_Week']]
customer_data = customer_data.merge(favorite_shopping_day, on='CustomerID')


In [None]:
### Favourite shopping hour: the hour of day with the most rows for each customer

favorite_shopping_hour = (
    data.groupby(['CustomerID', 'Hour'])
    .size()
    .reset_index(name='Count')
)
# Row label of the highest count within each customer
busiest_hour_rows = favorite_shopping_hour.groupby('CustomerID')['Count'].idxmax()
favorite_shopping_hour = favorite_shopping_hour.loc[busiest_hour_rows, ['CustomerID', 'Hour']]
customer_data = customer_data.merge(favorite_shopping_hour, on='CustomerID')

In [None]:
customer_data.head()

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Geographic Features(Country) </h1>
</div>

In [None]:
data['Country'].value_counts(normalize=True).head()

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* UK seems to have the highest number of records
* Creating a dominant country for each customer would be a good idea to indicate if a customer makes maximum number of transactions in UK or not

</div>

In [None]:
### Row count for every (customer, country) combination

rows_per_customer_country = data.groupby(['CustomerID', 'Country']).size()
customer_country = rows_per_customer_country.reset_index(name='Number_of_Transactions')
customer_country

In [None]:
### Checking for duplicate entries

customer_country[customer_country['CustomerID'].duplicated(keep=False)]

In [None]:
### For each customer, keeping the country with the highest row count
### (sort descending, then keep the first row seen per customer)

customer_main_country = (
    customer_country
    .sort_values(by='Number_of_Transactions', ascending=False)
    .drop_duplicates(subset='CustomerID', keep='first')
)
customer_main_country

In [None]:
customer_main_country[customer_main_country['CustomerID']==12370]

In [None]:
### Binary flag: 1 when the customer's main country is the United Kingdom, otherwise 0

customer_main_country['Is_UK'] = customer_main_country['Country'].eq('United Kingdom').astype('int64')
customer_main_country

In [None]:
### Attaching the Is_UK flag to customer_data (left join keeps every existing customer row)

uk_flag = customer_main_country[['CustomerID', 'Is_UK']]
customer_data = customer_data.merge(uk_flag, on='CustomerID', how='left')

In [None]:
customer_data.head()

In [None]:
customer_data['Is_UK'].value_counts()

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Cancellations( Cancellation Frequency, Cancellation Rate)  </h1>
</div>

In [None]:
customer_data

In [None]:
data

In [None]:
### Rows that belong to cancelled transactions

Cancelled_transactions = data[data['Transaction_Status'].eq("Cancelled")]
Cancelled_transactions.head(4)

In [None]:
### Number of distinct cancelled invoices per customer

cancelled_invoice_counts = Cancelled_transactions.groupby(by='CustomerID')['InvoiceNo'].nunique()
cancelled_frequency = cancelled_invoice_counts.reset_index()
cancelled_frequency

In [None]:
### Giving the count column its feature name

cancelled_frequency = cancelled_frequency.rename(columns={'InvoiceNo': 'Cancellation_Frequency'})

In [None]:
### Attaching the cancellation count (left join keeps customers without any cancellation)

customer_data = customer_data.merge(cancelled_frequency, on='CustomerID', how='left')
customer_data.head(4)

In [None]:
### Replacing NaN values with 0
# A NaN here means the customer has no cancelled invoice, so 0 is the right count.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas warns about and which may not write
# through to customer_data.

customer_data['Cancellation_Frequency'] = customer_data['Cancellation_Frequency'].fillna(0)
customer_data.head(4)

In [None]:
### Cancellation rate: cancelled invoices divided by all invoices of the customer

customer_data['Cancellation_Rate'] = customer_data['Cancellation_Frequency'].div(customer_data['Total_Transactions'])
customer_data.head(4)

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Seasonality (Monthly Spending Mean, Monthly Spending SD, Spending Trend) </h1>
</div>

In [None]:
### Splitting the invoice timestamp into calendar year and month

invoice_parts = data['InvoiceDate'].dt
data['Year'] = invoice_parts.year
data['Month'] = invoice_parts.month

In [None]:
data

In [None]:
# Total spend of each customer in every calendar month they were active
monthly_spend = (
    data.groupby(['CustomerID', 'Year', 'Month'])['TotalSpend']
    .sum()
    .reset_index()
)
monthly_spend

In [None]:
### Mean of the monthly spend totals per customer

monthly_spend_mean = (
    monthly_spend.groupby(['CustomerID'])['TotalSpend']
    .mean()
    .reset_index(name='Monthly_Spending_Mean')
)
monthly_spend_mean

In [None]:
### Standard deviation of the monthly spend totals per customer

monthly_spend_sd = (
    monthly_spend.groupby(['CustomerID'])['TotalSpend']
    .std()
    .reset_index(name='Monthly_Spending_SD')
)
monthly_spend_sd

In [None]:
### Replace NaN values with 0
# The standard deviation is NaN when a customer has a single monthly total; treat that as 0.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas warns about and which may not write
# through to monthly_spend_sd.

monthly_spend_sd['Monthly_Spending_SD'] = monthly_spend_sd['Monthly_Spending_SD'].fillna(0)
monthly_spend_sd.head(4)

In [None]:
### Attaching the monthly mean and standard deviation to customer_data

customer_data = (
    customer_data
    .merge(monthly_spend_mean, on='CustomerID')
    .merge(monthly_spend_sd, on='CustomerID')
)
customer_data

In [None]:
### Calculating Spending Trends using Linear Regression

from scipy.stats import linregress

def calculate_trend(spend_data):
    """Return the least-squares slope of spend_data against its position (0, 1, 2, ...).

    Inputs with fewer than two points have no trend and yield 0.
    """
    n_points = len(spend_data)
    if n_points <= 1:
        return 0
    positions = np.arange(n_points)
    return linregress(positions, spend_data).slope

In [None]:
### Slope of the monthly spend totals for each customer

trend_per_customer = monthly_spend.groupby('CustomerID')['TotalSpend'].apply(calculate_trend)
spending_trends = trend_per_customer.reset_index(name='Spending_Trend')
spending_trends

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* A positive spending trend suggests growing loyalty or a satisfied customer, while a negative trend hints at possible customer attrition

</div>

In [None]:
### Attaching the spending trend to customer_data (joined on CustomerID)

customer_data = customer_data.merge(spending_trends, on='CustomerID')
customer_data

In [None]:
### Changing the data type of 'CustomerID' to string as it is a unique identifier and not used in mathematical operations
# Cast to integer first: the column is numeric and held missing values in the raw data,
# so it is presumably stored as float — converting directly would give labels such as '12346.0'.
customer_data['CustomerID'] = customer_data['CustomerID'].astype('int64').astype(str)

In [None]:
### Convert data types of columns to optimal types

customer_data = customer_data.convert_dtypes()

In [None]:
customer_data.head(4)

In [None]:
customer_data.info()

## Customer Dataset Description (Updated):

<div style="background-color:#bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

**This DataFrame contains 16 variables that correspond to:**

- **CustomerID:** Identifier uniquely assigned to each customer, used to distinguish individual customers.

- **Days_Since_Last_Purchase:** The number of days that have passed since the customer's last purchase.

- **Total_Transactions:** The total number of transactions made by the customer.

- **Total_Products_Purchased:** The total quantity of products purchased by the customer across all transactions.

- **TotalSpend:** The total amount of money the customer has spent across all transactions.

- **AOV:** Average Order Value, calculated as the total spend divided by the number of transactions.

- **Unique_Products_Purchased:** The number of different products the customer has purchased.

- **Average_Days_Between_Purchases:** The average number of days between consecutive purchases made by the customer.

- **Day_Of_Week:** The day of the week when the customer prefers to shop, represented numerically (0 for Monday, 6 for Sunday).

- **Hour:** The hour of the day when the customer prefers to shop, represented in a 24-hour format.

- **Is_UK:** A binary variable indicating whether the customer is based in the UK (1) or not (0).

- **Cancellation_Frequency:** The total number of transactions that the customer has cancelled.

- **Cancellation_Rate:** The proportion of transactions that the customer has cancelled, calculated as cancellation frequency divided by total transactions.

- **Monthly_Spending_Mean:** The average monthly spending of the customer.

- **Monthly_Spending_SD:** The standard deviation of the customer's monthly spending, indicating the variability in their spending pattern.

- **Spending_Trend:** A numerical representation of the trend in the customer's spending over time. A positive value indicates an increasing trend, a negative value indicates a decreasing trend, and a value close to zero indicates a stable trend.

</div>


<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Outlier Detection And Treatment</h1>
</div>

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

Since K-Means is very sensitive to outliers, we'll use the **Isolation Forest algorithm** to deal with them. It isolates observations by randomly selecting a feature and then randomly selecting a split value between the minimum and maximum values of that feature.

**Strategy**: Flag Outliers in another column and exclude those from our dataset

</div>

In [None]:
### Initializing the IsolationForest model 

from sklearn.ensemble import IsolationForest

# contamination=0.05 sets the expected share of outliers to 5%; random_state=0 fixes the seed
model = IsolationForest(contamination=0.05, random_state=0)

In [None]:
### Fitting the model on our database
# Features are selected by name rather than by position: CustomerID is excluded explicitly,
# and the columns this section adds (Outlier_Scores, Is_Outlier) are excluded as well, so
# re-running the cell does not feed earlier predictions back into the model.
# fit_predict returns -1 for rows judged to be outliers and 1 for inliers.

feature_frame = customer_data.drop(columns=['CustomerID', 'Outlier_Scores', 'Is_Outlier'], errors='ignore')
customer_data['Outlier_Scores'] = model.fit_predict(feature_frame.to_numpy())

In [None]:
### Flagging outliers: Is_Outlier is 1 where Outlier_Scores is -1, otherwise 0

customer_data['Is_Outlier'] = customer_data['Outlier_Scores'].eq(-1).astype('int64')

In [None]:
### Displaying the first few rows of the customer_data dataframe

customer_data.head(4)

In [None]:
### Share (in %) of rows flagged as inlier (0) and outlier (1)

flag_shares = customer_data['Is_Outlier'].value_counts(normalize=True)
outlier_percentage = flag_shares.mul(100)
outlier_percentage

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* 5% outliers present in the data

</div>

In [None]:
### Rows flagged as outliers, kept aside for analysis

outliers_data = customer_data[customer_data['Is_Outlier'].eq(1)]

In [None]:
### Rows not flagged as outliers form the cleaned dataset

customer_data_cleaned = customer_data[customer_data['Is_Outlier'].eq(0)]

In [None]:
### Removing the two helper columns used for outlier detection

outlier_helper_columns = ['Outlier_Scores', 'Is_Outlier']
customer_data_cleaned = customer_data_cleaned.drop(columns=outlier_helper_columns)

In [None]:
### Renumbering the cleaned rows from 0 (the old index is discarded)

customer_data_cleaned = customer_data_cleaned.reset_index(drop=True)
customer_data_cleaned.head(4)

In [None]:
customer_data_cleaned.shape

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Correlation Analysis </h1>
</div>

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

We need to check for presence of **multicollinearity** before proceeding to the clustering

</div>

In [None]:
### Correlation matrix of every column except the 'CustomerID' identifier

feature_columns_only = customer_data_cleaned.drop(columns=['CustomerID'])
corr = feature_columns_only.corr()
corr

In [None]:
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import colors as mcolors

### Reset background style
sns.set_style('whitegrid')

### Defining a custom colormap
colors = ['#2662ed', '#26dced', '#087cbf','#dce3de','white']
my_cmap = LinearSegmentedColormap.from_list('custom_map', colors, N=256)

### Mask that hides the cells strictly above the diagonal (lower triangle and diagonal stay visible)
mask = np.triu(np.ones_like(corr), k=1)

### Plotting the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr,
    mask=mask,
    cmap=my_cmap,
    annot=True,
    center=0,
    fmt='.2f',
    linewidths=2,
)
plt.title('Correlation Matrix', fontsize=14)
plt.show()

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

### Inferences

* Highly correlated variables are as follows:
    * **Monthly Spending Mean and AOV** (0.80)
    * **Total_Spend and Total_Products_Purchased** (0.88)
    * **Total_Transactions and Total_Spend** (0.80)
    * **Total_Transactions and Total_Products_Purchased** (0.71)
    * **Cancellation_Rate and Cancellation_Frequency** (0.69)

</div>

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Feature Scaling </h1>
</div>

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* Before proceeding to PCA, it is important to scale all the features
* **Features to be ignored:** CustomerID, Is_UK, Day_Of_Week

</div>

In [None]:
from sklearn.preprocessing import StandardScaler

### Initialize the StandardScaler

scaler = StandardScaler()

In [None]:
### List of columns that don't need to be scaled

columns_to_exclude = ['CustomerID', 'Is_UK', 'Day_Of_Week']


In [None]:
### List of columns that need to be scaled

columns_to_scale = customer_data_cleaned.columns.difference(columns_to_exclude)

In [None]:
### Copying the cleaned dataset

customer_data_scaled = customer_data_cleaned.copy()

In [None]:
### Standardising the selected columns and writing the scaled values back

scaled_values = scaler.fit_transform(customer_data_scaled[columns_to_scale])
customer_data_scaled[columns_to_scale] = scaled_values

In [None]:
### Displaying the first few rows of the scaled data

customer_data_scaled.head(4)

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Dimensionality Reduction (PCA) </h1>
</div>

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

In [None]:
from sklearn.decomposition import PCA

### Setting CustomerID as the index column, so the identifier is not passed to PCA as a feature.
### The guard keeps this cell idempotent: on a re-run CustomerID is already the index,
### and an unconditional set_index('CustomerID') would raise a KeyError.
if 'CustomerID' in customer_data_scaled.columns:
    customer_data_scaled = customer_data_scaled.set_index('CustomerID')

### Applying PCA with every component kept, to inspect how the variance is distributed
pca = PCA().fit(customer_data_scaled)

### Calculating the Cumulative Sum of the Explained Variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

### Setting the optimal k value (based on our analysis, we can choose 6)
optimal_k = 6

In [None]:
### Setting seaborn plot style
sns.set(rc={'axes.facecolor': '#dce3de'}, style='darkgrid')

### Plot the cumulative explained variance against the number of components
plt.figure(figsize=(20, 10))

### Bar chart for the explained variance of each component
### (the bars are labelled with the component numbers 1..n)
barplot = sns.barplot(x=list(range(1, len(cumulative_explained_variance) + 1)),
                      y=explained_variance_ratio,
                      color='#2662ed',
                      alpha=0.8)

### Line plot for the cumulative explained variance
### (drawn at the 0-based x positions 0..n-1; presumably this matches where seaborn places
###  the categorical bars, so position i lines up with component i + 1 - TODO confirm)
lineplot, = plt.plot(range(0, len(cumulative_explained_variance)), cumulative_explained_variance,
                     marker='o', linestyle='--', color='#087cbf', linewidth=2)

### Plot optimal k value line
### (optimal_k - 1 converts the 1-based component count to the 0-based x position used above)
optimal_k_line = plt.axvline(optimal_k - 1, color='#26dced', linestyle='--', label=f'Optimal k value = {optimal_k}') 

# Set labels and title
plt.xlabel('Number of Components', fontsize=14)
plt.ylabel('Explained Variance', fontsize=14)
plt.title('Cumulative Variance vs. Number of Components', fontsize=18)

### Customize ticks and legend
### (one tick per x position; the legend is built from explicit handles: the first bar,
###  the cumulative line and the vertical optimal-k line)
plt.xticks(range(0, len(cumulative_explained_variance)))
plt.legend(handles=[barplot.patches[0], lineplot, optimal_k_line],
           labels=['Explained Variance of Each Component', 'Cumulative Explained Variance', f'Optimal k value = {optimal_k}'],
           loc=(0.62, 0.1),
           frameon=True,
           framealpha=1.0,  
           edgecolor='#000203')  

### Display the variance values for both graphs on the plots
### x_offset / y_offset shift the cumulative labels up and to the left of their markers
x_offset = -0.3
y_offset = 0.01
for i, (ev_ratio, cum_ev_ratio) in enumerate(zip(explained_variance_ratio, cumulative_explained_variance)):
    # Label on top of each bar: the variance explained by that single component
    plt.text(i, ev_ratio, f"{ev_ratio:.2f}", ha="center", va="bottom", fontsize=10)
    # The first cumulative value equals the first bar's value, so it is not labelled a second time
    if i > 0:
        plt.text(i + x_offset, cum_ev_ratio + y_offset, f"{cum_ev_ratio:.2f}", ha="center", va="bottom", fontsize=10)

plt.grid(axis='both')   
plt.show()

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

In [None]:
### Creating a PCA object that keeps the chosen number of components.
### optimal_k (set to 6 when the explained variance was computed) is used instead of a repeated
### literal, so the model always matches the value marked on the explained-variance plot.

pca = PCA(n_components=optimal_k)

In [None]:
### Fitting PCA on the scaled data and projecting the data onto the retained components
### (the result of fit_transform is an array; it is wrapped into a dataframe in the next step)

customer_data_pca = pca.fit_transform(customer_data_scaled)

In [None]:
### Wrapping the PCA output in a dataframe whose columns are named PC1, PC2, ...

customer_data_pca = pd.DataFrame(
    customer_data_pca,
    columns=[f'PC{number}' for number in range(1, pca.n_components_ + 1)],
)

In [None]:
### Adding the CustomerID index back to the new PCA dataframe
### (the rows are in the same order as customer_data_scaled, from which they were computed)

customer_data_pca.index = customer_data_scaled.index

In [None]:
### Displaying the PCA dataframe (indexed by CustomerID, one column per retained component)
customer_data_pca

In [None]:
### Defining a function to highlight the top 3 absolute values in each column of a dataframe

def highlight_top3(column):
    """Return one CSS style string per entry of ``column``.

    The three entries with the largest absolute value get a grey background,
    every other entry gets an empty style. Meant to be used with
    ``DataFrame.style.apply(..., axis=0)``, which passes one column at a time.
    """
    highlight_style = 'background-color:  #dce3de'

    # Index labels of the three largest values by magnitude
    top3 = column.abs().nlargest(3).index

    styles = []
    for label in column.index:
        if label in top3:
            styles.append(highlight_style)
        else:
            styles.append('')
    return styles

In [None]:
### Building the loadings table: pca.components_ is transposed so that each row is an
### original feature and each column is a principal component (PC1, PC2, ...)

pc_df = pd.DataFrame(
    pca.components_.T,
    index=customer_data_scaled.columns,
    columns=[f'PC{number}' for number in range(1, pca.n_components_ + 1)],
)

### Highlighting, column by column, the three loadings with the largest absolute value
pc_df.style.apply(highlight_top3, axis=0)

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> K-Means Clustering </h1>
</div>

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Optimal Number Of Clusters (Elbow Method) </h1>
</div>

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

In [None]:
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from sklearn.cluster import KMeans

### Set plot style, and background color
sns.set(style='darkgrid', rc={'axes.facecolor': '#dce3de'})

### Set the color palette for the plot
sns.set_palette(['#2662ed'])

### Instantiate the clustering model with the specified parameters
### (n_clusters is not given here - presumably the visualizer sets it for each k it tries;
###  random_state=0 fixes the seed of the k-means++ initialisation)
km = KMeans(init='k-means++', n_init=10, max_iter=100, random_state=0)

### Create a figure and axis with the desired size
fig, ax = plt.subplots(figsize=(12, 5))

### Instantiate the KElbowVisualizer with the model and range of k values, and disable the timing plot
### NOTE(review): per the Yellowbrick documentation a 2-tuple is treated as a half-open range,
### i.e. k = 2..14 here - confirm against the installed version
visualizer = KElbowVisualizer(km, k=(2, 15), timings=False, ax=ax)

### Fit the data to the visualizer
visualizer.fit(customer_data_pca)

### Finalize and render the figure
### (the trailing semicolon suppresses the textual repr of the returned object in the cell output)
visualizer.show();

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

## Inference
    
* The elbow method suggests k = 6 in this case; however, the result is somewhat ambiguous because the curve does not form a clear elbow

</div>

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Optimal Number Of Clusters (Silhouette Method) </h1>
</div>

In [None]:
import matplotlib.gridspec as gridspec
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def silhouette_analysis(df, start_k, stop_k, figsize=(15, 16)):
    """Plot the average silhouette score of K-Means for every k in [start_k, stop_k].

    For each k a KMeans model (k-means++ init, n_init=10, max_iter=100, random_state=0)
    is fitted on ``df`` and the mean silhouette score of its labels is recorded. The scores
    are drawn as a line plot and the k with the highest score is reported in a text box.

    Parameters
    ----------
    df : array-like or DataFrame
        Feature matrix to cluster (one row per observation).
    start_k, stop_k : int
        First and last number of clusters to evaluate, both inclusive.
        ``start_k`` must be at least 2 and ``stop_k`` must not be smaller than ``start_k``.
    figsize : tuple
        Size of the whole figure. The figure is divided into (stop_k - start_k + 1) rows
        and the score plot occupies the first row only.

    Returns
    -------
    None
        The figure is shown as a side effect.

    Raises
    ------
    ValueError
        If the requested range of k is empty or starts below 2.
    """
    ### Fail early with a clear message instead of deep inside matplotlib / scikit-learn
    if start_k < 2:
        raise ValueError(f"start_k must be at least 2 to compute a silhouette score, got {start_k}")
    if stop_k < start_k:
        raise ValueError(f"stop_k ({stop_k}) must not be smaller than start_k ({start_k})")

    k_values = range(start_k, stop_k + 1)

    ### Set the size of the figure
    plt.figure(figsize=figsize)

    ### Create a grid with (stop_k - start_k + 1) rows and 2 columns
    ### NOTE(review): only the first row is used below, the remaining rows stay empty
    ### (presumably reserved for per-k silhouette plots) - confirm whether they are still wanted
    grid = gridspec.GridSpec(stop_k - start_k + 1, 2)

    ### Assign the first plot to the first row and both columns
    first_plot = plt.subplot(grid[0, :])

    ### First plot: Silhouette scores for different k values
    sns.set_palette(['darkorange'])

    silhouette_scores = []

    ### Iterate through the range of k values
    for k in k_values:
        km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=0)
        # fit_predict fits the model and returns the training labels in a single call
        labels = km.fit_predict(df)
        silhouette_scores.append(silhouette_score(df, labels))

    ### The first maximum wins when several k share the highest score
    best_k = start_k + silhouette_scores.index(max(silhouette_scores))

    first_plot.plot(k_values, silhouette_scores, marker='o')
    first_plot.set_xticks(list(k_values))
    first_plot.set_xlabel('Number of clusters (k)')
    first_plot.set_ylabel('Silhouette score')
    first_plot.set_title('Average Silhouette Score for Different k Values', fontsize=15)

    ### Add the optimal k value text to the plot
    ### The box is anchored in axes coordinates (top-right corner), so it stays inside the plot
    ### whatever the range of k or the level of the scores
    optimal_k_text = f'The k value with the highest Silhouette score is: {best_k}'
    first_plot.text(0.98, 0.95, optimal_k_text, fontsize=12, transform=first_plot.transAxes,
                    verticalalignment='top', horizontalalignment='right',
                    bbox=dict(facecolor='#fcc36d', edgecolor='#ff6200', boxstyle='round, pad=0.5'))

    plt.tight_layout()
    plt.show()

In [None]:
### Evaluating k = 3..12 (both inclusive) on the PCA-transformed data
silhouette_analysis(customer_data_pca, 3, 12, figsize=(20, 50))

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Clustering Using K-Means </h1>
</div>

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

In [None]:
from collections import Counter

### Apply KMeans clustering using the chosen k.
### A 'cluster' column left behind by a previous run of this cell is dropped before fitting:
### the labels are written into customer_data_pca at the end of the cell, so on a re-run
### they would otherwise be clustered as if they were a feature.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=100, random_state=0)
kmeans.fit(customer_data_pca.drop(columns='cluster', errors='ignore'))

### Get the frequency of each cluster (raw KMeans labels)
cluster_frequencies = Counter(kmeans.labels_)

### Relabel the clusters with a fixed mapping: raw KMeans label -> reported label
### NOTE(review): the mapping is hard-coded for the raw labels 0, 1 and 2 of this particular fit
### (random_state=0); revisit it if the data or n_clusters changes, a missing label raises KeyError
label_mapping = {1: 2, 0: 1, 2: 0}

### Apply the mapping to get the new labels
new_labels = np.array([label_mapping[label] for label in kmeans.labels_])

### Append the new cluster labels back to the original dataset
### (assigned by position: the rows are in the same order as the PCA dataframe)
customer_data_cleaned['cluster'] = new_labels

### Append the new cluster labels to the PCA version of the dataset
customer_data_pca['cluster'] = new_labels

In [None]:
### Displaying the first 4 rows of the cleaned dataset, now including the 'cluster' column
customer_data_cleaned.head(4)

In [None]:
### Number of customers assigned to each cluster label
customer_data_cleaned['cluster'].value_counts()

In [None]:
### (rows, columns) of the cleaned dataset after the 'cluster' column was added
customer_data_cleaned.shape

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Clustering Evaluation </h1>
</div>

In [None]:
### Setting up the color scheme for the clusters (RGB order);
### the position in the list is the cluster label the color is used for

colors = [
    '#e8000b',  # red   - cluster 0
    '#1ac938',  # green - cluster 1
    '#023eff',  # blue  - cluster 2
]

In [None]:
import plotly.graph_objects as go

### Split the PCA dataframe into one dataframe per cluster label
cluster_0, cluster_1, cluster_2 = (
    customer_data_pca[customer_data_pca['cluster'] == cluster_label]
    for cluster_label in (0, 1, 2)
)

### Create a 3D scatter plot
fig = go.Figure()

### One trace per cluster over the first three principal components,
### each drawn in the color reserved for that cluster
fig.add_traces([
    go.Scatter3d(
        x=cluster_points['PC1'],
        y=cluster_points['PC2'],
        z=cluster_points['PC3'],
        mode='markers',
        marker=dict(color=colors[cluster_label], size=5, opacity=0.4),
        name=f'Cluster {cluster_label}',
    )
    for cluster_label, cluster_points in enumerate((cluster_0, cluster_1, cluster_2))
])

### Set the title and layout details
fig.update_layout(
    title={'text': '3D Visualization of Customer Clusters in PCA Space', 'x': 0.5},
    scene={
        'xaxis': {'backgroundcolor': "#fcf0dc", 'gridcolor': 'white', 'title': 'PC1'},
        'yaxis': {'backgroundcolor': "#fcf0dc", 'gridcolor': 'white', 'title': 'PC2'},
        'zaxis': {'backgroundcolor': "#fcf0dc", 'gridcolor': 'white', 'title': 'PC3'},
    },
    width=900,
    height=800,
)

### Show the plot
fig.show()

In [None]:
### Calculate the percentage of customers in each cluster, ordered by cluster label
cluster_percentage = (customer_data_pca['cluster'].value_counts(normalize=True) * 100).reset_index()
cluster_percentage.columns = ['Cluster', 'Percentage']
cluster_percentage = cluster_percentage.sort_values(by='Cluster')

### Create a horizontal bar plot (one bar per cluster, in the cluster's color)
plt.figure(figsize=(10, 4))
sns.barplot(x='Percentage', y='Cluster', data=cluster_percentage, orient='h', palette=colors)

### Adding percentages on the bars
### (index is the bar's position, which follows the sorted cluster order; the text is placed
###  just to the right of the end of the bar)
for index, value in enumerate(cluster_percentage['Percentage']):
    plt.text(value+0.5, index, f'{value:.2f}%')

plt.title('Distribution of Customers Across Clusters', fontsize=14)

### Ticks every 5%. The upper bound follows the data, so a cluster holding more than 45% of the
### customers still gets tick marks; the range never shrinks below the original 0-45.
tick_stop = max(50, 5 * int(np.ceil(cluster_percentage['Percentage'].max() / 5)) + 5)
plt.xticks(ticks=np.arange(0, tick_stop, 5))
plt.xlabel('Percentage (%)')

### Show the plot
plt.show()

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

<div style="background-color: #3296e3; padding: 8px 16px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: left;">
  <h1 style="color: white; font-size: 24px;"> Evaluation Metrics </h1>
</div>

In [None]:
from tabulate import tabulate

### Separate the cluster labels from the features (the principal components)
clusters = customer_data_pca['cluster']
X = customer_data_pca.drop('cluster', axis=1)

### Compute number of customers
num_observations = len(customer_data_pca)

### Compute the three internal validation metrics on the same features and labels
sil_score = silhouette_score(X, clusters)
calinski_score = calinski_harabasz_score(X, clusters)
davies_score = davies_bouldin_score(X, clusters)

### Create a table to display the metrics and the number of observations
### (one [name, value] row per metric)
table_data = [
    [metric_name, metric_value]
    for metric_name, metric_value in (
        ("Number of Observations", num_observations),
        ("Silhouette Score", sil_score),
        ("Calinski Harabasz Score", calinski_score),
        ("Davies Bouldin Score", davies_score),
    )
]

### Print the table
print(tabulate(table_data, headers=["Metric", "Value"], tablefmt='pretty'))


<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>

<div style="background-color: #0000ff; padding: 10px 20px; border-radius: 20px; box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); text-align: center;">
  <h1 style="color: white; font-size: 30px;"> Cluster Analysis </h1>
</div>

In [None]:
### Setting 'CustomerID' column as index and assigning it to a new dataframe
df_customer = customer_data_cleaned.set_index('CustomerID')

### Separate the features from the cluster labels, wherever the 'cluster' column is positioned
customer_features = df_customer.drop(columns=['cluster'])

### Standardize the data (excluding the cluster column)
### NOTE: this rebinds the notebook-level name `scaler` to a new StandardScaler fitted on every feature
scaler = StandardScaler()

### Create a new dataframe with standardized values and add the cluster column back.
### The column names are taken from the frame that was actually scaled, so they stay correct
### even when 'cluster' is not the last column of df_customer.
df_customer_standardized = pd.DataFrame(
    scaler.fit_transform(customer_features),
    columns=customer_features.columns,
    index=customer_features.index,
)
df_customer_standardized['cluster'] = df_customer['cluster']

### Calculate the centroids of each cluster (mean of every standardized feature per cluster)
cluster_centroids = df_customer_standardized.groupby('cluster').mean()

In [None]:
### Function to create a radar chart
def create_radar_chart(ax, angles, data, color, cluster):
    """Draw one cluster's radar polygon on ``ax`` and title the axes.

    ax      : axes to draw on (the notebook passes polar axes)
    angles  : angle of every vertex of the polygon
    data    : value plotted at each angle
    color   : color used for the fill, the outline and the title
    cluster : cluster label shown in the title
    """
    fill_style = dict(color=color, alpha=0.4)
    outline_style = dict(color=color, linewidth=2, linestyle='solid')

    ### Shaded area first, then the outline on top of it
    ax.fill(angles, data, **fill_style)
    ax.plot(angles, data, **outline_style)

    ### Title above the chart, in the cluster's color
    title_text = f'Cluster {cluster}'
    ax.set_title(title_text, size=20, color=color, y=1.1)

In [None]:
### Set data: one radar axis (spoke) per feature of the centroid table
labels=np.array(cluster_centroids.columns)
num_vars = len(labels)

### Compute angle of each axis (evenly spaced around the circle)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()

### The plot is circular, so we need to "complete the loop" and append the start to the end
labels = np.concatenate((labels, [labels[0]]))
angles += angles[:1]

### Initialize the figure: one polar subplot per cluster
fig, ax = plt.subplots(figsize=(20, 10), subplot_kw=dict(polar=True), nrows=1, ncols=3)

### Create radar chart for each cluster (cluster i is drawn in colors[i] on subplot i)
for i, color in enumerate(colors):
    data = cluster_centroids.loc[i].tolist()
    data += data[:1]  # Complete the loop
    create_radar_chart(ax[i], angles, data, color, i)

### Label the spokes with the feature names and style the grid.
### Done in one loop so that every subplot gets the same ticks, labels and grid
### (the duplicated closing point is left out of the ticks and labels).
for radar_ax in ax:
    radar_ax.set_xticks(angles[:-1])
    radar_ax.set_xticklabels(labels[:-1])
    radar_ax.grid(color='grey', linewidth=0.5)

### Display the plot
plt.tight_layout()
plt.show()

<div style="background-color: #bbf2ef ; border-radius: 10px; padding: 15px; border: 1px solid #ccc; font-size: 16px;">

* To deal with multicollinear features, we will go ahead with Principal Component Analysis
* This would help in forming better clusters

</div>