In [9]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from matplotlib.backends.backend_pdf import PdfPages

# Load data
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach the customer profile columns to every transaction row. pd.merge
# defaults to an inner join, so only CustomerIDs present in both files survive.
customer_transactions = pd.merge(customers, transactions, on='CustomerID')

# Feature engineering: collapse the transaction rows to one row per customer
# holding the summed TotalValue, the mean TotalValue and the transaction count.
customer_features = customer_transactions.groupby('CustomerID').agg(
    TotalTransactionValue=('TotalValue', 'sum'),
    AverageTransactionValue=('TotalValue', 'mean'),
    TransactionCount=('TransactionID', 'count'),
).reset_index()

# Merging with customer profile data
customer_features = pd.merge(customer_features, customers, on='CustomerID')

# Move CustomerID into the index BEFORE filtering to numeric columns.
# Filtering with CustomerID still a regular column has two failure modes:
#   * string ids are silently discarded, so cluster labels assigned later can
#     no longer be traced back to a customer;
#   * numeric ids are kept and would be scaled and clustered as if the
#     identifier were a behavioural feature.
# As the index, the id is never part of the feature matrix and every row
# stays labelled. Non-numeric profile columns (e.g., 'CustomerName', 'Region',
# 'SignupDate') are still dropped by select_dtypes.
customer_features = (
    customer_features
    .set_index('CustomerID')
    .select_dtypes(include='number')
)

# Normalize data: StandardScaler rescales each feature column to zero mean and
# unit variance so that no single column dominates the distances K-Means uses.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features)

# Apply K-Means Clustering
n_clusters = 4  # You can adjust this
# n_init is pinned explicitly. Leaving it out emits a FutureWarning on
# scikit-learn 1.2/1.3, and from 1.4 the default changed from 10 restarts to
# 'auto', so the same cell could yield different clusters and metrics
# depending on the installed version. Ten restarts keeps the best-inertia run.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(customer_features_scaled)

# Evaluate Clustering (DB Index and Silhouette Score).
# Both metrics are computed on the scaled matrix the model was fitted on, not
# on the raw feature values. Lower Davies-Bouldin is better; silhouette lies
# in [-1, 1] and higher is better.
db_index = davies_bouldin_score(customer_features_scaled, customer_features['Cluster'])
silhouette_avg = silhouette_score(customer_features_scaled, customer_features['Cluster'])

# Print Clustering Metrics
print(f"Davies-Bouldin Index: {db_index}")
print(f"Silhouette Score: {silhouette_avg}")

# Reduce the standardized feature matrix to two principal components so the
# clusters can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(customer_features_scaled)

# Store each component as its own column next to the cluster labels.
for component_idx, column_name in enumerate(('PCA1', 'PCA2')):
    customer_features[column_name] = pca_result[:, component_idx]

# Save results to a PDF
# Each page is built on an explicit figure/axes pair and that same figure is
# passed to savefig()/close(), so no page depends on pyplot's implicit
# "current figure" state.
with PdfPages('Amjad_PV_Clustering.pdf') as pdf:
    # Page 1: headline numbers as plain text on an axis-free canvas.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.text(0.1, 0.8, f'Number of Clusters: {n_clusters}', fontsize=18, ha='left')
    ax.text(0.1, 0.6, f'Davies-Bouldin Index (DB Index): {db_index:.2f}', fontsize=18, ha='left')
    # The silhouette score is reported here as well so the text page agrees
    # with the printed metrics and the summary table.
    ax.text(0.1, 0.4, f'Silhouette Score: {silhouette_avg:.2f}', fontsize=18, ha='left')
    ax.axis('off')
    pdf.savefig(fig)  # Save the text page
    plt.close(fig)

    # Page 2: clusters drawn on the two PCA components.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x='PCA1', y='PCA2', hue='Cluster', data=customer_features,
        palette='viridis', s=100, ax=ax
    )
    ax.set_title('Customer Clusters (PCA Projection)')
    pdf.savefig(fig)  # Save the plot to the PDF
    plt.close(fig)

    # Page 3: summary table of the clustering metrics.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.axis('off')
    table_data = [
        ['Metric', 'Value'],
        ['Number of Clusters', n_clusters],
        ['Davies-Bouldin Index', round(db_index, 4)],
        ['Silhouette Score', round(silhouette_avg, 4)]
    ]
    table = ax.table(cellText=table_data, colLabels=None, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(12)
    table.scale(1, 2)  # double the row height so the 12pt text is not cramped
    pdf.savefig(fig)  # Save the table to the PDF
    plt.close(fig)

    # Page 4: first rows of the clustered frame.
    # Taking `.values` of the raw frame upcasts every column to float64, which
    # renders integer cluster labels and counts as "2.0" and prints floats at
    # full precision. Rounding and converting to strings column by column
    # keeps integers as integers and limits floats to two decimals.
    preview = customer_features.head(10)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axis('off')
    ax.table(
        cellText=preview.round(2).astype(str).values,
        colLabels=list(preview.columns),
        loc='center'
    )
    pdf.savefig(fig)  # Save the table of clustering results
    plt.close(fig)


Davies-Bouldin Index: 1.060424039992303
Silhouette Score: 0.3135106549790538
