# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from fpdf import FPDF

# Load the three raw tables. Columns read further down: CustomerID and Region
# (customers), ProductID and Category (products), and CustomerID, ProductID,
# TransactionID, Quantity and TotalValue (transactions).
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach each transaction's customer region and product category. Left joins
# keep every transaction even when a lookup key has no match.
merged_data = pd.merge(transactions, customers[['CustomerID', 'Region']], on='CustomerID', how='left')
merged_data = pd.merge(merged_data, products[['ProductID', 'Category']], on='ProductID', how='left')

# One row per customer (index), one column per category, holding the total
# quantity bought; 0 where the customer never bought from that category.
category_interactions = merged_data.groupby(['CustomerID', 'Category'])['Quantity'].sum().unstack(fill_value=0)

# Per-customer transaction totals. Region is aggregated here as well: without
# it the column never reached `final_data`, so the one-hot encoding below was
# skipped and the customer profile was silently left out of the features.
customer_summary = merged_data.groupby('CustomerID').agg(
    transaction_count=('TransactionID', 'count'),
    total_spent=('TotalValue', 'sum'),
    Region=('Region', 'first'),
).reset_index()

# reset_index() turns CustomerID into a regular column so this is a plain
# column-on-column merge rather than relying on index-level name matching.
final_data = pd.merge(category_interactions.reset_index(), customer_summary, on='CustomerID', how='left')

# One-hot encode the region; drop_first avoids a redundant (collinear) column
# and dtype=int keeps the indicators as 0/1 regardless of pandas version.
final_data = pd.get_dummies(final_data, columns=['Region'], drop_first=True, dtype=int)

# Everything except the identifier is a clustering feature. Standardize so
# that large-magnitude columns (e.g. total_spent) do not dominate distances.
features = final_data.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Clustering configuration, named once so it is easy to change.
N_CLUSTERS = 5
RANDOM_STATE = 42

# n_init is set explicitly: its default differs between scikit-learn releases,
# so leaving it implicit makes the resulting clusters version-dependent.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE)
final_data['Cluster'] = kmeans.fit_predict(scaled_features)

# Davies-Bouldin index of the partition on the scaled features (lower is better).
db_index = davies_bouldin_score(scaled_features, final_data['Cluster'])

# Project the scaled features onto two principal components for plotting.
# random_state makes the projection reproducible if the randomized SVD solver
# is selected automatically.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
pca_components = pca.fit_transform(scaled_features)

final_data['PCA1'] = pca_components[:, 0]
final_data['PCA2'] = pca_components[:, 1]

# Scatter the customers in the 2-D PCA projection, coloured by cluster label,
# and write the figure to disk.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=final_data,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('Customer Segmentation Based on Profile and Transactions')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.legend(title='Cluster')
fig.savefig('cluster_visualization.png')

# Persist the per-customer table (features, cluster label and PCA coordinates).
final_data.to_csv("Customer_Segmentation.csv", index=False)

# Build a one-page PDF report: title, clustering summary, cluster sizes and
# the PCA scatter plot saved earlier as 'cluster_visualization.png'.

# Take the cluster count from the assigned labels instead of hard-coding it,
# so the report cannot drift from the clustering that was actually run.
n_clusters_formed = final_data['Cluster'].nunique()

pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

# Title
pdf.set_font("Arial", size=16, style='B')
pdf.cell(200, 10, txt="Customer Segmentation and Clustering Report", ln=True, align='C')

# Clustering Summary
pdf.ln(10)
pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt=f"Number of clusters formed: {n_clusters_formed}", ln=True)
pdf.cell(200, 10, txt=f"Davies-Bouldin Index: {db_index:.4f}", ln=True)

# Cluster Distribution: number of customers per cluster label.
cluster_sizes = final_data['Cluster'].value_counts().to_string()
pdf.ln(10)
pdf.multi_cell(0, 10, txt="Cluster Sizes:\n" + cluster_sizes)

# Add the PCA Plot directly below the caption, at the current cursor height.
pdf.ln(10)
pdf.cell(200, 10, txt="Cluster Visualization (PCA Projection)", ln=True)
pdf.image('cluster_visualization.png', x=10, y=pdf.get_y(), w=180)


pdf.output("Customer_Segmentation_Report.pdf")

print("Clustering complete. Results saved to 'Customer_Segmentation_Report.pdf'.")


# Cell output: ModuleNotFoundError: No module named 'fpdf'
# (the `fpdf` package was not installed in the environment where this cell ran)