In [1]:
# -------------------------
# üìå INSTALL DEPENDENCIES
# -------------------------
!pip install mlcroissant
!pip install plotly
!pip install scikit-learn

# -------------------------
# üìå IMPORT LIBRARIES
# -------------------------
import mlcroissant as mlc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# -------------------------
# üìå LOAD DATA USING CROISSANT
# -------------------------

print("‚è≥ Fetching Dataset...")
croissant_dataset = mlc.Dataset(
    "https://www.kaggle.com/datasets/abhishekrp1517/online-retail-transactions-dataset/croissant/download"
)

# Check dataset record sets
record_sets = croissant_dataset.metadata.record_sets
print("Available Record Sets:", record_sets)

# Fetch first record set
df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))

print("\nDataset Loaded Successfully!")
print(df.head())

# -------------------------
# üìå BASIC EXPLORATION
# -------------------------
print("\nDataset Info:")
print(df.info())

print("\nMissing Values:")
print(df.isnull().sum())

print("\nBasic Statistics:")
print(df.describe())

# -------------------------
# üìå DATA CLEANING
# -------------------------

# Remove negative or zero quantities/prices
if "Quantity" in df.columns:
    df = df[df["Quantity"] > 0]

if "UnitPrice" in df.columns:
    df = df[df["UnitPrice"] > 0]

# Remove missing customer IDs
if "CustomerID" in df.columns:
    df = df.dropna(subset=["CustomerID"])

# Ensure CustomerID is int
df["CustomerID"] = df["CustomerID"].astype(int)

# -------------------------
# üìå FEATURE ENGINEERING ‚Äî RFM ANALYSIS
# -------------------------

print("\n‚è≥ Creating RFM Features...")

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
max_date = df['InvoiceDate'].max()

# RFM:
rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (max_date - x.max()).days,  # Recency
    'InvoiceNo': 'nunique',                               # Frequency
    'UnitPrice': 'mean',                                  # Monetary (avg price)
})

rfm.columns = ['Recency', 'Frequency', 'Monetary']
print("\nRFM Table:")
print(rfm.head())

# -------------------------
# üìå STANDARDIZATION
# -------------------------

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm)

# -------------------------
# üìå K-MEANS CLUSTERING
# -------------------------

# Determine best k using silhouette method
scores = {}
for k in range(2, 10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(rfm_scaled)
    scores[k] = silhouette_score(rfm_scaled, km.labels_)

best_k = max(scores, key=scores.get)
print("\nBest K Value:", best_k)

# Train final model
kmeans = KMeans(n_clusters=best_k, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)

# -------------------------
# üìå CLUSTER EVALUATION
# -------------------------

sil_score = silhouette_score(rfm_scaled, rfm["Cluster"])
print("\nSilhouette Score:", sil_score)

# -------------------------
# üìå VISUALIZATION
# -------------------------

rfm_plot = rfm.reset_index()

fig = px.scatter_3d(
    rfm_plot,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    title="Customer Segmentation using K-Means (3D Plot)",
)
fig.show()

# -------------------------
# üìå CLUSTER PROFILES
# -------------------------

cluster_summary = rfm.groupby("Cluster").mean()
print("\nCluster Summary:")
print(cluster_summary)

# -------------------------
# üìå SAVE SEGMENTED CUSTOMERS
# -------------------------

rfm.to_csv("customer_segments.csv")
print("\nCustomer segments saved as customer_segments.csv !")


Collecting mlcroissant
  Downloading mlcroissant-1.0.22-py2.py3-none-any.whl.metadata (10 kB)
Collecting jsonpath-rw (from mlcroissant)
  Downloading jsonpath-rw-1.4.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdflib (from mlcroissant)
  Downloading rdflib-7.4.0-py3-none-any.whl.metadata (12 kB)
Downloading mlcroissant-1.0.22-py2.py3-none-any.whl (145 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m145.3/145.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdflib-7.4.0-py3-none-any.whl (569 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m569.0/569.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: jsonpath-rw
  Building wheel for jsonpath-rw (setup.py) ... [?25l[?25hdone
  Created wheel for json

  -  [Metadata(Online Retail transactions Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


Available Record Sets: [RecordSet(uuid="Online+Retail.csv")]


Downloading https://www.kaggle.com/api/v1/datasets/download/abhishekrp1517/online-retail-transactions-dataset?datasetVersionNumber=2...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29.0M/29.0M [00:00<00:00, 57.6MiB/s]



Dataset Loaded Successfully!
  Online+Retail.csv/InvoiceNo Online+Retail.csv/StockCode  \
0                   b'536365'                   b'85123A'   
1                   b'536365'                    b'71053'   
2                   b'536365'                   b'84406B'   
3                   b'536365'                   b'84029G'   
4                   b'536365'                   b'84029E'   

            Online+Retail.csv/Description  Online+Retail.csv/Quantity  \
0   b'WHITE HANGING HEART T-LIGHT HOLDER'                           6   
1                  b'WHITE METAL LANTERN'                           6   
2       b'CREAM CUPID HEARTS COAT HANGER'                           8   
3  b'KNITTED UNION FLAG HOT WATER BOTTLE'                           6   
4       b'RED WOOLLY HOTTIE WHITE HEART.'                           6   

  Online+Retail.csv/InvoiceDate  Online+Retail.csv/UnitPrice  \
0           2010-12-01 08:26:00                         2.55   
1           2010-12-01 08:26:00     

KeyError: 'CustomerID'

In [2]:
print(df.columns)


Index(['Online+Retail.csv/InvoiceNo', 'Online+Retail.csv/StockCode',
       'Online+Retail.csv/Description', 'Online+Retail.csv/Quantity',
       'Online+Retail.csv/InvoiceDate', 'Online+Retail.csv/UnitPrice',
       'Online+Retail.csv/CustomerID', 'Online+Retail.csv/Country'],
      dtype='object')


In [3]:
df.columns = df.columns.str.split("/").str[-1]

print("\nClean Column Names:")
print(df.columns)



Clean Column Names:
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')


In [4]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors="coerce")

df = df.dropna(subset=['CustomerID'])
df = df[df['Quantity'] > 0]
df = df[df['UnitPrice'] > 0]

df['CustomerID'] = df['CustomerID'].astype(int)


In [5]:
print("\n‚è≥ Creating RFM features...")

max_date = df['InvoiceDate'].max()

rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (max_date - x.max()).days,
    'InvoiceNo': 'nunique',
    'UnitPrice': 'mean'
})

rfm.columns = ['Recency', 'Frequency', 'Monetary']

print("\nRFM Sample:")
print(rfm.head())


‚è≥ Creating RFM features...

RFM Sample:
            Recency  Frequency  Monetary
CustomerID                              
12346           325          1  1.040000
12347             1          7  2.644011
12348            74          4  5.764839
12349            18          1  8.289041
12350           309          1  3.841176


In [6]:
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm)


In [7]:
scores = {}
for k in range(2, 10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(rfm_scaled)
    scores[k] = silhouette_score(rfm_scaled, km.labels_)

best_k = max(scores, key=scores.get)
print("\nBest K =", best_k)



Best K = 5


In [8]:
kmeans = KMeans(n_clusters=best_k, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)

sil = silhouette_score(rfm_scaled, rfm["Cluster"])
print("\nSilhouette Score:", sil)




Silhouette Score: 0.6914745332235603


In [9]:
fig = px.scatter_3d(
    rfm.reset_index(),
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    title="Customer Segmentation (K-Means Clusters)"
)
fig.show()


In [10]:
print("\nCluster Summary:")
print(rfm.groupby("Cluster").mean())



Cluster Summary:
            Recency    Frequency     Monetary
Cluster                                      
0         40.404784     5.149341     3.306290
1        247.192523     1.614953     4.197149
2          0.000000  1430.000000     6.287797
3         84.000000     1.000000  2033.100000
4        144.800000     1.800000   414.556000


In [11]:
rfm.to_csv("customer_segments.csv")
print("\n‚úî File saved as customer_segments.csv")


‚úî File saved as customer_segments.csv
