In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [6]:
# Load customer and transaction data
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Merge customer and transaction data if needed (for example, based on CustomerID)
merged_df = pd.merge(customers_df, transactions_df, on='CustomerID',how = "inner")

# Ensure 'SignupDate' is in datetime format
merged_df['SignupDate'] = pd.to_datetime(merged_df['SignupDate'])

# Calculate days since joining
merged_df['DaysSinceJoining'] = (pd.to_datetime('today') - merged_df['SignupDate']).dt.days


In [7]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   CustomerID        1000 non-null   object        
 1   CustomerName      1000 non-null   object        
 2   Region            1000 non-null   object        
 3   SignupDate        1000 non-null   datetime64[ns]
 4   TransactionID     1000 non-null   object        
 5   ProductID         1000 non-null   object        
 6   TransactionDate   1000 non-null   object        
 7   Quantity          1000 non-null   int64         
 8   TotalValue        1000 non-null   float64       
 9   Price             1000 non-null   float64       
 10  DaysSinceJoining  1000 non-null   int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(6)
memory usage: 86.1+ KB


In [8]:
# Step 1: Preprocessing
features_df = merged_df[['CustomerID','Region','TotalValue','DaysSinceJoining']]

# One-hot encode Region as it's categorical
features_df = pd.get_dummies(features_df, columns=['Region','CustomerID'])

# Standardizing the numerical features (Quantity, TotalValue, Price)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df)

In [17]:
# Initialize a list to store the results
results = []

# Loop through different numbers of clusters for GridSearch
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(scaled_features)

    # Calculate Davies-Bouldin Index
    db_index = davies_bouldin_score(scaled_features, kmeans.labels_)

    # Store the results in the list
    results.append({'n_clusters': n_clusters, 'davies_bouldin_index': db_index})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

# Step 3: Output the best results
best_db_index = results_df['davies_bouldin_index'].min()
best_n_clusters = results_df.loc[results_df['davies_bouldin_index'] == best_db_index, 'n_clusters'].values[0]

print(f"Best number of clusters: {best_n_clusters}")
print(f"Best Davies-Bouldin Index: {best_db_index}")


# Display the DataFrame of all results
results_df



Best number of clusters: 2
Best Davies-Bouldin Index: 0.7596059938446725


NameError: name 'best_kmeans' is not defined

In [None]:
# Step 1: Initialize a DataFrame to store results
cluster_results = []

# Step 2: Loop through different numbers of clusters for GridSearch
best_db_index = float('inf')
best_kmeans = None
best_n_clusters = 0

# Try different numbers of clusters (from 2 to 10)
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(scaled_features)

    # Calculate Davies-Bouldin Index
    db_index = davies_bouldin_score(scaled_features, kmeans.labels_)

    # Store the result for each iteration
    cluster_results.append({
        'n_clusters': n_clusters,
        'davies_bouldin_index': db_index
    })

    # Track the best DBI (lower is better)
    if db_index < best_db_index:
        best_db_index = db_index
        best_kmeans = kmeans
        best_n_clusters = n_clusters

# Step 3: Convert results to a DataFrame for easier analysis
cluster_results_df = pd.DataFrame(cluster_results)

# Display the results of each trial
cluster_results_df

# Step 4: Add cluster labels to the dataframe
features_df['Cluster'] = best_kmeans.labels_

# Display the updated DataFrame with cluster labels
cluster_results_df

# Step 4: Add cluster labels to the dataframe
features_df['Cluster'] = best_kmeans.labels_

In [None]:
# Step 5: Visualization
# Visualize clusters using a pair plot
sns.pairplot(features_df, hue='Cluster', palette='Set1')
plt.savefig('pairplot.png')
plt.show()

# Reduce dimensions to 3D for visualization
pca = PCA(n_components=3)
pca_components = pca.fit_transform(scaled_features)

# 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_components[:, 0], pca_components[:, 1], pca_components[:, 2], c=best_kmeans.labels_, cmap='Set1')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.set_zlabel('PCA3')
plt.title('3D Cluster Visualization')
plt.savefig('3D Cluster Visualization.png', dpi=300)  
plt.show()
