# 1. Import Libraries

In [20]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering


# 2. Load dataset 

In [11]:
dataset_path = "dataset/customer_segment.csv"
customer_df = pd.read_csv(dataset_path, encoding='ISO-8859-1')

# 3.  basic data exploration to understand the structure and contents of the dataset

In [12]:
print('Dataset/Dataframe dimensions:', customer_df.shape)

print("First few rows of the dataset:")
display(customer_df.head())

Dataset/Dataframe dimensions: (541909, 8)
First few rows of the dataset:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [13]:
print("\nSummary statistics of numerical columns:")
display(customer_df.describe())


Summary statistics of numerical columns:


Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


## 3.1 Calculate null percentange

In [14]:
tab_info_df=pd.DataFrame(customer_df.dtypes).T.rename(index={0:'column type'})
tab_info_df = pd.concat([tab_info_df, pd.DataFrame(customer_df.isnull().sum()).T.rename(index={0: 'null values (nb)'})])
tab_info_df=pd.concat([tab_info_df, pd.DataFrame(customer_df.isnull().sum()/customer_df.shape[0]*100).T.rename(index={0:'null values (%)'})])

display(tab_info_df)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
column type,object,object,object,int64,object,float64,float64,object
null values (nb),0,0,1454,0,0,0,135080,0
null values (%),0.0,0.0,0.268311,0.0,0.0,0.0,24.926694,0.0


## 3.2 Removing null customerid records because it might introduce noise and inaccuracies 

In [15]:
print('Dataframe dimensions:', customer_df.shape)
cleaned_customer_df = customer_df.copy()
cleaned_customer_df.dropna(axis = 0, subset = ['CustomerID'], inplace = True)
print('Dataframe dimensions:', cleaned_customer_df.shape)

tab_info_df=pd.DataFrame(cleaned_customer_df.dtypes).T.rename(index={0:'column type'})
tab_info_df = pd.concat([tab_info_df, pd.DataFrame(cleaned_customer_df.isnull().sum()).T.rename(index={0: 'null values (nb)'})])
tab_info_df=pd.concat([tab_info_df, pd.DataFrame(cleaned_customer_df.isnull().sum()/cleaned_customer_df.shape[0]*100).T.rename(index={0:'null values (%)'})])

display(tab_info_df)

Dataframe dimensions: (541909, 8)
Dataframe dimensions: (406829, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
column type,object,object,object,int64,object,float64,float64,object
null values (nb),0,0,0,0,0,0,0,0
null values (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3.3 Removing duplicates

In [16]:
print('Duplicate entries: {}'.format(cleaned_customer_df.duplicated().sum()))
cleaned_customer_df.drop_duplicates(inplace = True)

Duplicate entries: 5225


## 3.4 Converting *InvoiceDate* column to a date datatype

In [17]:
cleaned_customer_df['InvoiceDate'] = pd.to_datetime(cleaned_customer_df['InvoiceDate'])
display(cleaned_customer_df.dtypes)

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [18]:
print('Dataframe dimensions original:', customer_df.shape)
print('Dataframe dimensions after cleaning:', cleaned_customer_df.shape)

Dataframe dimensions original: (541909, 8)
Dataframe dimensions after cleaning: (401604, 8)


In [22]:


# Select features for clustering (e.g., Quantity, UnitPrice)
X = cleaned_customer_df[['Quantity', 'UnitPrice']]
X = X.astype(np.float32)
# Initialize hierarchical clustering model
model = AgglomerativeClustering(n_clusters=3)  # specify the number of clusters

# Fit the model to the data
model.fit(X)

# Add cluster labels to the dataframe
cleaned_customer_df['Cluster'] = model.labels_

# Display the dataframe with cluster labels
print(cleaned_customer_df.head())


MemoryError: Unable to allocate 601. GiB for an array with shape (80642685606,) and data type float64