# TASK 3: Customer Segmentation / Clustering

**Import necessary libraries**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA

**File paths**

In [2]:
# Directory that holds the three input CSVs. Change this single value to run
# the notebook against a different copy of the dataset (e.g. outside Kaggle).
DATA_DIR = '/kaggle/input/dataset3'

# Full paths to the individual input files, all derived from DATA_DIR.
customer_file = f'{DATA_DIR}/Customers.csv'
product_file = f'{DATA_DIR}/Products.csv'
transaction_file = f'{DATA_DIR}/Transactions.csv'

**Load datasets**

In [3]:
# Read the three CSVs (customers, products, transactions — in that order)
# into one DataFrame each, using pandas' default parsing options.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (customer_file, product_file, transaction_file)
)

**Merge datasets**

In [4]:
# Build one row per transaction enriched with customer and product attributes:
# first join customers on 'CustomerID', then join products on 'ProductID'.
# No `how` is given, so both joins use pandas' default (inner) and transactions
# without a matching customer or product are dropped.
# NOTE(review): any non-key column name shared by two of the inputs will come
# out suffixed (_x / _y) rather than under its original name — confirm the
# input schemas before relying on such a column downstream.
merged_data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

**Feature engineering for clustering**

In [5]:
# Per-customer aggregations to compute: mean Price, total Quantity and total
# TotalValue. Any of these columns that is absent from merged_data is skipped
# silently; only when none is present does the cell fail.
# NOTE(review): if the merge suffixed 'Price' (e.g. Price_x / Price_y), it
# will be skipped here without warning — confirm against merged_data.columns.
agg_dict = {
    column: how
    for column, how in (('Price', 'mean'), ('Quantity', 'sum'), ('TotalValue', 'sum'))
    if column in merged_data.columns
}

if len(agg_dict) == 0:
    raise ValueError("No valid columns available for clustering.")

# One row per CustomerID; reset_index() turns the group key back into a column.
customer_features = merged_data.groupby('CustomerID').agg(agg_dict)
customer_features = customer_features.reset_index()

**Ensure 'CustomerID' is in customer_features**

In [6]:
# Sanity check: the customer key must be available as a regular column of
# customer_features (membership on a DataFrame tests its column labels).
if 'CustomerID' not in customer_features:
    raise KeyError("'CustomerID' column is missing in customer_features.")

**One-hot encode categorical variables (e.g., Region, Category)**

In [7]:
# Replace categorical columns with one indicator column per category value
# (named 'Region_<value>' / 'Category_<value>'); a missing column is skipped.
# NOTE(review): merged_data was built from customers before this cell, so the
# encoded 'Region' columns exist only on `customers`, not on merged_data —
# confirm that is what later cells expect.
if 'Region' in customers:
    customers = pd.get_dummies(data=customers, prefix='Region', columns=['Region'])
# NOTE(review): the presence check is made on `products`, but the encoding is
# applied to `merged_data` — presumably intentional; verify.
if 'Category' in products:
    merged_data = pd.get_dummies(data=merged_data, prefix='Category', columns=['Category'])