Integrate Cluster Labels and Train Random Forest


1. Load Clustered Dataset and One-Hot Encode Cluster Labels

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the customer table (it must already contain a `Cluster` column) and
# expand that column into indicator columns named `Cluster_<label>`, so each
# cluster becomes its own feature rather than a single label column.
df = pd.get_dummies(
    pd.read_csv('clustered_customers.csv'),
    prefix='Cluster',
    columns=['Cluster'],
)

2. Define Features & Target

In [14]:
# Binary target: 1 when a customer's Total_Spend is strictly above the
# median of Total_Spend over all rows, 0 otherwise.
median_spend = df['Total_Spend'].median()
df['High_Spender'] = (median_spend < df['Total_Spend']).astype(int)

# Features are every other column, in their existing order, except the
# target itself, the Total_Spend column it is derived from, and Income.
# NOTE(review): the reason Income is left out is not stated anywhere in this
# notebook — confirm that dropping it is intentional.
feature_cols = [
    name
    for name in df.columns
    if name not in ('Total_Spend', 'High_Spender', 'Income')
]
X = df.loc[:, feature_cols]
y = df.loc[:, 'High_Spender']

3. Train-Test Split

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# balance approximately equal in both partitions, and the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

4. Train Random Forest Classifier

In [22]:
# Step 4: Train Random Forest Classifier

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the customer table that carries the cluster assignment
df = pd.read_csv('clustered_customers.csv')

# One-hot encode the categorical columns. pd.get_dummies only converts
# object/string/category columns by default, so a numerically coded `Cluster`
# would otherwise pass through untouched and be treated as one ordinal
# number. Casting it to `category` first guarantees the cluster label is
# expanded into indicator columns, in line with the explicit Cluster encoding
# in section 1. `df` itself is left exactly as loaded.
df_encoded = pd.get_dummies(df.astype({'Cluster': 'category'}), drop_first=True)

# Binary target: 1 when Total_Spend is strictly above the median, else 0.
# NOTE(review): the median is taken over all rows, i.e. before the
# train/test split, so the threshold is informed by test rows too.
median_spend = df_encoded['Total_Spend'].median()
df_encoded['High_Spender'] = (df_encoded['Total_Spend'] > median_spend).astype(int)

# Features: every encoded column except the target and the Total_Spend column
# the target is derived from.
# NOTE(review): if any remaining column is a component of Total_Spend (for
# example per-category spend amounts), the target can be reconstructed from
# the features and the scores printed below will be inflated — check the
# column list. Also, unlike section 2, Income is kept as a feature here;
# confirm which of the two is intended.
feature_cols = [col for col in df_encoded.columns if col not in ['Total_Spend', 'High_Spender']]
X = df_encoded[feature_cols]
y = df_encoded['High_Spender']

# Stratified 80/20 split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Fit a 100-tree random forest; the seed makes the fitted model repeatable
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the held-out rows and report the confusion matrix plus the
# per-class precision / recall / F1
y_pred = rf.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[217   5]
 [  4 218]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       222
           1       0.98      0.98      0.98       222

    accuracy                           0.98       444
   macro avg       0.98      0.98      0.98       444
weighted avg       0.98      0.98      0.98       444

