In [None]:
# import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [None]:
# Load dataset
# Read the raw transactions CSV, decoding the file as ISO-8859-1
df = pd.read_csv('OnlineRetail.csv', encoding='ISO-8859-1')

# Preview the first five rows
print(df.head(5))


In [None]:
# Data Cleaning
# Keep only rows that have a CustomerID
df = df.dropna(subset=['CustomerID'])

# Keep only positive quantities (negative quantities are returns).
# .copy() turns the filtered result into an independent DataFrame, so the
# column assignment below is unambiguous and cannot raise pandas'
# SettingWithCopyWarning.
df = df[df['Quantity'] > 0].copy()

# Line total for each transaction row
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Display cleaned dataset info.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()


In [None]:
# Basic Statistics and Visualizations
# Descriptive statistics of the cleaned transactions
print(df.describe())

# Histogram (50 bins) of TotalPrice with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['TotalPrice'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Total Price')
plt.show()


In [None]:
# RFM (Recency, Frequency, Monetary) Analysis
# Parse invoice timestamps; the snapshot date is one day after the latest invoice
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
snapshot_date = df['InvoiceDate'].max() + pd.DateOffset(1)

# Row-level recency: whole days between the snapshot date and each invoice
df['Recency'] = (snapshot_date - df['InvoiceDate']).dt.days

# Aggregate per customer
by_customer = df.groupby('CustomerID')

# Recency: days since the customer's most recent purchase (smallest row-level value)
recency = by_customer['Recency'].min()

# Frequency: number of distinct invoices
frequency = by_customer['InvoiceNo'].nunique()

# Monetary: total spend
monetary = by_customer['TotalPrice'].sum()

# Assemble the three measures into one table indexed by CustomerID
rfm = pd.DataFrame({
    'Recency': recency,
    'Frequency': frequency,
    'Monetary': monetary,
})

# Show the first rows of the RFM table
print(rfm.head())


In [None]:
# Scaling the Data
# Standardise only the three RFM features. Selecting them by name keeps this
# cell safe to re-run after a 'Cluster' label column has been added to `rfm`:
# without the selection the label would be scaled and clustered as if it were
# a fourth feature.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

# Display the first five scaled rows (columns: Recency, Frequency, Monetary)
print(rfm_scaled[:5])


In [None]:
# Applying K-means Clustering
# Elbow method: fit K-means for k = 1..10 and record the within-cluster sum of
# squares (WCSS, exposed by scikit-learn as `inertia_`) for each k.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4 (and warns about it in 1.2-1.3), so leaving it
    # implicit makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(rfm_scaled)
    wcss.append(kmeans.inertia_)

# Plot the Elbow graph; the "elbow" is where adding clusters stops reducing
# WCSS substantially.
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, wcss, marker='o')
plt.xticks(list(cluster_counts))  # one tick per candidate k
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS (within-cluster sum of squares)')
plt.show()


In [None]:
# Cluster the Data
# Apply KMeans with the number of clusters chosen from the elbow plot (e.g., 4).
# n_init is set explicitly because its default differs between scikit-learn
# versions (10 before 1.4, 'auto' from 1.4), which would otherwise make the
# cluster assignment version-dependent.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Display cluster centroids converted back to the original units.
# The feature names are spelled out rather than taken positionally from
# rfm.columns, so the table stays correct even if further columns are later
# added to `rfm`.
centroids = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=['Recency', 'Frequency', 'Monetary'],
)
print(centroids)


In [None]:
# Visualizing the Clusters
# Pairwise plots of the RFM table, coloured by cluster label
sns.pairplot(data=rfm, hue='Cluster', palette='viridis')
plt.show()

# Summary of each cluster: mean of every RFM measure plus the customer count
summary_spec = {
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean',
    'Cluster': 'count',
}
cluster_summary = rfm.groupby('Cluster').agg(summary_spec)
cluster_summary = cluster_summary.rename(columns={'Cluster': 'Count'})
cluster_summary = cluster_summary.reset_index()

print(cluster_summary)


### Interpreting the Clusters

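    Note: Recency is the number of days since a customer's last purchase, so "High Recency" means the customer has not bought recently. K-means cluster numbers are arbitrary, so confirm the mapping below against the cluster summary table before acting on it.

    Cluster 0: High Recency, Low Frequency and Monetary - Lost Customers
    Cluster 1: Low Recency, High Frequency and Monetary - Loyal Customers
    Cluster 2: Medium Recency, Medium Frequency and Monetary - Potential Loyalists
    Cluster 3: Low Recency, Low Frequency and Monetary - New Customers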

### Recommendations

    Loyal Customers: Offer rewards and exclusive deals.
    Potential Loyalists: Engage with personalized marketing.
    New Customers: Provide introductory offers and welcome messages.
    Lost Customers: Send re-engagement campaigns.