In [1]:

# Customer Segmentation Using Clustering

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
# The three raw tables are read from CSV files in the current working directory.
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Preprocess data
# Parse the date columns so that date arithmetic works in the later stages.
customers_df = customers_df.assign(SignupDate=pd.to_datetime(customers_df['SignupDate']))
transactions_df = transactions_df.assign(
    TransactionDate=pd.to_datetime(transactions_df['TransactionDate'])
)

# Left joins keep every transaction row: first attach the product columns,
# then the customer columns. Unmatched keys yield NaN in the joined columns.
transactions_merged = pd.merge(transactions_df, products_df, on="ProductID", how="left")
customer_transactions = pd.merge(transactions_merged, customers_df, on="CustomerID", how="left")

# Aggregate features
# Collapse the transaction-level table into one row per CustomerID.
customer_features = customer_transactions.groupby('CustomerID').agg(
    TotalSpend=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'count'),
    AverageQuantity=('Quantity', 'mean'),
    # NOTE: despite the column name, this is the number of whole days between
    # the customer's first and last transaction (0 for a single purchase).
    DaysBetweenPurchases=('TransactionDate', lambda dates: (dates.max() - dates.min()).days),
    Region=('Region', 'first'),
    SignupDate=('SignupDate', 'first'),
)

# Account age is measured against the most recent transaction in the data set,
# not against today's date; SignupDate itself is dropped once it has been used.
latest_date = customer_transactions['TransactionDate'].max()
signup_age = latest_date - customer_features['SignupDate']
customer_features = (
    customer_features
    .assign(DaysSinceSignup=signup_age.dt.days)
    .drop(columns=['SignupDate'])
)

# Normalize and encode features
# Columns that are standardised; the Region dummy columns are left unscaled.
numerical_features = [
    'TotalSpend',
    'TransactionCount',
    'AverageQuantity',
    'DaysBetweenPurchases',
    'DaysSinceSignup',
]

# One-hot encode Region; drop_first removes one level to avoid a redundant column.
customer_features_encoded = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Fit the scaler on the numeric columns and write the scaled values back in place.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(customer_features_encoded[numerical_features])
customer_features_encoded[numerical_features] = scaled_numeric

# Perform clustering
# Build the feature matrix once so that fitting and evaluation use exactly the
# same columns. A 'Cluster' column left over from a previous run is excluded so
# stale labels never become an input feature, and the cast to float turns the
# one-hot Region columns (boolean in recent pandas) into plain numeric values.
n_clusters = 4
clustering_features = (
    customer_features_encoded
    .drop(columns=['Cluster'], errors='ignore')
    .astype(float)
)
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
customer_features_encoded['Cluster'] = kmeans.fit_predict(clustering_features)

# Evaluate clustering
# Score the labels in the feature space K-Means was fitted on. Scoring on only
# the numeric columns, as before, measured separation in a different space from
# the one that produced the clusters. Lower Davies-Bouldin values are better.
db_index = davies_bouldin_score(clustering_features, customer_features_encoded['Cluster'])
print(f"Davies-Bouldin Index: {db_index:.3f}")

# Visualize clusters
# Scatter of the two scaled spend/frequency features, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=customer_features_encoded,
    x='TotalSpend',
    y='TransactionCount',
    hue='Cluster',
    palette='viridis',
    s=50,
    ax=ax,
)

# Both axes show standardised values, so the labels say "Normalized".
ax.set_title('Customer Clusters (K-Means)', fontsize=16)
ax.set_xlabel('Normalized Total Spend', fontsize=12)
ax.set_ylabel('Normalized Transaction Count', fontsize=12)
ax.legend(title='Cluster', fontsize=10)
ax.grid(True, alpha=0.3)
plt.show()


ModuleNotFoundError: No module named 'pandas'