# Customer Annual Spending Score Prediction

## E-commerce Customer Behavior Analysis and Predictive Modeling

**Objective:** Predict customer annual spending score to optimize targeted marketing campaigns

**Dataset:** Mall Customers (`Mall_Customers.csv`) - Contains customer demographic details (gender, age), annual income, and a spending score (1-100)


## 1. Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

# Set style for better visualizations.
# Try the current seaborn style-sheet name first, then the legacy name, and
# fall back to 'ggplot' when neither is shipped with the installed matplotlib.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    try:
        plt.style.use(style_name)
        break
    except OSError:
        # matplotlib signals an unknown style name with OSError. Anything else
        # (KeyboardInterrupt, a broken install, ...) should surface instead of
        # being silently swallowed by a bare `except:`.
        continue
else:
    plt.style.use('ggplot')
sns.set_palette("husl")

# Set random seed for reproducibility
np.random.seed(42)


## 2. Load and Explore Dataset


In [None]:
# Read the customer records from the CSV file in the working directory
df = pd.read_csv('Mall_Customers.csv')

# Report the table dimensions, then preview the leading rows
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
df.head()


In [None]:
# Column dtypes and non-null counts first ...
print("Dataset Info:")
df.info()

# ... then a separator line and the summary statistics of the numeric columns
print("\n" + "=" * 50, "\nDataset Statistics:", sep="\n")
df.describe()


In [None]:
# Count nulls per column, then express them as a percentage of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100

print("Missing Values:")
print(missing_values)
print("\nMissing Percentage:")
print(missing_percentage)


In [None]:
# Number of rows that are exact copies of an earlier row
duplicate_row_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_row_count}")

# Storage type of every column
print("\nData Types:")
print(df.dtypes)

# Frequency of each label in the only categorical column
gender_frequencies = df['Gender'].value_counts()
print("\nUnique values in Gender:")
print(gender_frequencies)


## 3. Exploratory Data Analysis (EDA)


In [None]:
# Distribution of Spending Score (Target Variable)
spending = df['Spending Score (1-100)']
title_style = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(15, 5))

# Left panel: histogram of the raw scores
plt.subplot(1, 3, 1)
plt.hist(spending, bins=20, edgecolor='black', color='skyblue')
plt.title('Distribution of Spending Score', **title_style)
plt.xlabel('Spending Score')
plt.ylabel('Frequency')

# Middle panel: box plot of the same values
plt.subplot(1, 3, 2)
plt.boxplot(spending)
plt.title('Box Plot of Spending Score', **title_style)
plt.ylabel('Spending Score')

# Right panel: violin plot of the same values
plt.subplot(1, 3, 3)
sns.violinplot(y=spending)
plt.title('Violin Plot of Spending Score', **title_style)

plt.tight_layout()
plt.show()

# Location and spread of the target
print(f"Mean Spending Score: {spending.mean():.2f}")
print(f"Median Spending Score: {spending.median():.2f}")
print(f"Std Spending Score: {spending.std():.2f}")


In [None]:
# Correlation Analysis
# Pairwise correlation of the numeric columns, target included
numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
correlation_matrix = df[numeric_cols].corr()

heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 2,
    'cbar_kws': {"shrink": 0.8},
}

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Features vs Spending Score', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Column of the matrix that belongs to the target, strongest positive first
target_correlations = correlation_matrix['Spending Score (1-100)'].sort_values(ascending=False)
print("\nCorrelation with Spending Score:")
print(target_correlations)


In [None]:
# Relationship between Age and Spending Score
target_col = 'Spending Score (1-100)'
bold_title = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(15, 5))

# Panel 1: raw scatter
plt.subplot(1, 3, 1)
plt.scatter(df['Age'], df[target_col], alpha=0.6, color='coral')
plt.xlabel('Age')
plt.ylabel('Spending Score')
plt.title('Age vs Spending Score', **bold_title)
plt.grid(True, alpha=0.3)

# Panel 2: scatter with a fitted regression line
plt.subplot(1, 3, 2)
sns.regplot(data=df, x='Age', y=target_col, scatter_kws={'alpha': 0.6})
plt.title('Age vs Spending Score (with regression line)', **bold_title)

# Panel 3: score spread inside five equal-width age bins.
# The bins are stored on df (column 'Age_Group') so seaborn can group by them.
plt.subplot(1, 3, 3)
age_bins = pd.cut(df['Age'], bins=5)
df['Age_Group'] = age_bins
sns.boxplot(data=df, x='Age_Group', y=target_col)
plt.title('Spending Score by Age Groups', **bold_title)
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Relationship between Annual Income and Spending Score
income_col = 'Annual Income (k$)'
target_col = 'Spending Score (1-100)'
bold_title = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(15, 5))

# Panel 1: raw scatter
plt.subplot(1, 3, 1)
plt.scatter(df[income_col], df[target_col], alpha=0.6, color='green')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score')
plt.title('Annual Income vs Spending Score', **bold_title)
plt.grid(True, alpha=0.3)

# Panel 2: scatter with a fitted regression line
plt.subplot(1, 3, 2)
sns.regplot(data=df, x=income_col, y=target_col, scatter_kws={'alpha': 0.6})
plt.title('Annual Income vs Spending Score (with regression line)', **bold_title)

# Panel 3: score spread inside five equal-width income bins.
# The bins are stored on df (column 'Income_Group') so seaborn can group by them.
plt.subplot(1, 3, 3)
income_bins = pd.cut(df[income_col], bins=5)
df['Income_Group'] = income_bins
sns.boxplot(data=df, x='Income_Group', y=target_col)
plt.title('Spending Score by Income Groups', **bold_title)
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Gender Analysis
target_col = 'Spending Score (1-100)'
bold_title = {'fontsize': 14, 'fontweight': 'bold'}

plt.figure(figsize=(15, 5))

# Panel 1: share of each gender in the data
plt.subplot(1, 3, 1)
gender_counts = df['Gender'].value_counts()
plt.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Gender Distribution', **bold_title)

# Panel 2: box plot of the score per gender
plt.subplot(1, 3, 2)
sns.boxplot(data=df, x='Gender', y=target_col)
plt.title('Spending Score by Gender', **bold_title)

# Panel 3: violin plot of the score per gender
plt.subplot(1, 3, 3)
sns.violinplot(data=df, x='Gender', y=target_col)
plt.title('Spending Score Distribution by Gender', **bold_title)

plt.tight_layout()
plt.show()

# Descriptive statistics of the score, one row per gender
gender_score_stats = df.groupby('Gender')[target_col].describe()
print("\nSpending Score Statistics by Gender:")
print(gender_score_stats)


In [None]:
# 3D Relationship Analysis
# Import kept exactly as in the original cell; the name itself is not referenced below.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# The spending score is both the z coordinate and the marker colour
spending_values = df['Spending Score (1-100)']
scatter = ax.scatter(
    df['Age'],
    df['Annual Income (k$)'],
    spending_values,
    c=spending_values,
    cmap='viridis',
    alpha=0.6,
    s=50,
)

axis_font = {'fontsize': 12}
ax.set_xlabel('Age', **axis_font)
ax.set_ylabel('Annual Income (k$)', **axis_font)
ax.set_zlabel('Spending Score', **axis_font)
ax.set_title('3D Relationship: Age, Income, and Spending Score', fontsize=14, fontweight='bold', pad=20)
plt.colorbar(scatter, label='Spending Score')
plt.show()


## 4. Feature Engineering


In [None]:
# Work on a copy so the frame used for the EDA plots stays untouched
df_processed = df.copy()

# Null counts per column before any filling
nulls_before = df_processed.isna().sum()
print("Missing values before handling:")
print(nulls_before)

# Forward-fill first, then back-fill whatever is still missing at the top
df_processed = df_processed.ffill()
df_processed = df_processed.bfill()

# Null counts per column after filling
nulls_after = df_processed.isna().sum()
print("\nMissing values after handling:")
print(nulls_after)


In [None]:
# Derive new features from the two numeric inputs (Age, Annual Income).
age_col = df_processed['Age']
income_col = df_processed['Annual Income (k$)']

# 1. Age groups.
#    The last edge is open-ended and the lowest edge is inclusive, so a value
#    outside the originally hard-coded 0-100 range still gets a label instead
#    of NaN. Labels for values inside (0, 100] are unchanged.
df_processed['Age_Group_Category'] = pd.cut(
    age_col,
    bins=[0, 30, 40, 50, np.inf],
    labels=['Young', 'Middle', 'Senior', 'Elderly'],
    include_lowest=True,
)

# 2. Income groups (same open-ended treatment; labels inside (0, 150] are unchanged)
df_processed['Income_Group_Category'] = pd.cut(
    income_col,
    bins=[0, 40, 70, 100, np.inf],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)

# 3. Income to Age ratio (spending power indicator); +1 keeps the divisor non-zero
df_processed['Income_Age_Ratio'] = income_col / (age_col + 1)

# 4. Age squared (non-linear relationship)
df_processed['Age_Squared'] = age_col ** 2

# 5. Income squared (non-linear relationship)
df_processed['Income_Squared'] = income_col ** 2

# 6. Interaction feature: Age * Income
df_processed['Age_Income_Interaction'] = age_col * income_col

# 7. Spending capacity (income min-max normalized to [0, 1]).
#    Guard the degenerate case where every income is identical: 0/0 would
#    otherwise fill the column with NaN and break model fitting.
#    NOTE(review): min/max are taken over the whole dataset, i.e. before the
#    train/test split - confirm that is acceptable for the evaluation.
income_min = income_col.min()
income_range = income_col.max() - income_min
if income_range == 0:
    df_processed['Spending_Capacity'] = 0.0
else:
    df_processed['Spending_Capacity'] = (income_col - income_min) / income_range

# 8. Is Young and High Income
df_processed['Young_High_Income'] = ((age_col < 35) & (income_col > 70)).astype(int)

# 9. Is Senior and Low Income
df_processed['Senior_Low_Income'] = ((age_col > 50) & (income_col < 50)).astype(int)

print("New features created:")
print(df_processed.columns.tolist())
print("\nDataset shape:", df_processed.shape)


In [None]:
# Preview the two raw inputs next to every feature derived from them
raw_inputs = ['Age', 'Annual Income (k$)']
derived_features = [
    'Age_Group_Category',
    'Income_Group_Category',
    'Income_Age_Ratio',
    'Age_Squared',
    'Income_Squared',
    'Age_Income_Interaction',
    'Spending_Capacity',
    'Young_High_Income',
    'Senior_Low_Income',
]
feature_cols = raw_inputs + derived_features

print("Sample of engineered features:")
df_processed[feature_cols].head(10)


## 5. Data Preprocessing


In [None]:
# Separate the target from the predictors.
# The identifier, the target itself and the two EDA-only bin columns are not
# predictors; errors='ignore' tolerates any of them being absent.
target_name = 'Spending Score (1-100)'
non_feature_cols = ['CustomerID', target_name, 'Age_Group', 'Income_Group']

y = df_processed[target_name]
X = df_processed.drop(columns=non_feature_cols, errors='ignore')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nFeature columns:")
print(X.columns.tolist())


In [None]:
# Encode categorical variables.
# `label_encoders` keeps the fitted LabelEncoder of every column that was
# label-encoded (the unordered, string-valued columns).
label_encoders = {}

for col in X.select_dtypes(include=['object', 'category']).columns:
    col_dtype = X[col].dtype
    if isinstance(col_dtype, pd.CategoricalDtype) and col_dtype.ordered:
        # Ordered categoricals (the pd.cut groups) already carry a meaningful
        # order. LabelEncoder on their string form would number the labels
        # alphabetically (e.g. 'Elderly' < 'Middle' < 'Senior' < 'Young') and
        # destroy that order, so use the category codes instead.
        # A missing value is encoded as -1.
        X[col] = X[col].cat.codes.astype(int)
    else:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
    print(f"Encoded {col}")

print("\nAll features are now numeric:")
print(X.dtypes)


In [None]:
# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train_rows, n_features = X_train.shape
print(f"Training set size: {n_train_rows}")
print(f"Test set size: {len(X_test)}")
print(f"Number of features: {n_features}")


In [None]:
# Standardize the features: statistics are learned from the training rows
# only and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames that keep the original column
# names and row labels
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print("Features scaled successfully!")
print("\nScaled training data statistics:")
print(X_train_scaled.describe())


## 6. Model Building - Simple Model


In [None]:
# Simple Model 1: Linear Regression
banner = "=" * 60
print(banner)
print("SIMPLE MODEL: Linear Regression")
print(banner)

# Fit ordinary least squares on the scaled training features
simple_model = LinearRegression()
simple_model.fit(X_train_scaled, y_train)

# Predictions for both splits
y_train_pred_simple = simple_model.predict(X_train_scaled)
y_test_pred_simple = simple_model.predict(X_test_scaled)

# Training-split metrics
train_rmse_simple = np.sqrt(mean_squared_error(y_train, y_train_pred_simple))
train_mae_simple = mean_absolute_error(y_train, y_train_pred_simple)
train_r2_simple = r2_score(y_train, y_train_pred_simple)

# Test-split metrics
test_rmse_simple = np.sqrt(mean_squared_error(y_test, y_test_pred_simple))
test_mae_simple = mean_absolute_error(y_test, y_test_pred_simple)
test_r2_simple = r2_score(y_test, y_test_pred_simple)

print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse_simple:.4f}")
print(f"  MAE:  {train_mae_simple:.4f}")
print(f"  R²:   {train_r2_simple:.4f}")

print("\nTest Metrics:")
print(f"  RMSE: {test_rmse_simple:.4f}")
print(f"  MAE:  {test_mae_simple:.4f}")
print(f"  R²:   {test_r2_simple:.4f}")

# Coefficients ranked by magnitude (features are standardized, so the
# magnitudes are on a common scale)
coefficient_table = pd.DataFrame({
    'Feature': X_train_scaled.columns,
    'Coefficient': simple_model.coef_,
})
feature_importance_simple = coefficient_table.sort_values('Coefficient', key=abs, ascending=False)

print("\nTop 10 Most Important Features (by absolute coefficient):")
print(feature_importance_simple.head(10))


## 7. Model Building - Complex Model


In [None]:
# Complex Model 1: Random Forest with Hyperparameter Tuning
banner = "=" * 60
print(banner)
print("COMPLEX MODEL 1: Random Forest Regressor")
print(banner)

# Candidate values explored exhaustively by the grid search
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search scored on negative MSE, using all CPU cores
rf_model = RandomForestRegressor(random_state=42)
rf_grid_search = GridSearchCV(
    rf_model,
    rf_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

print("\nPerforming Grid Search for Random Forest...")
rf_grid_search.fit(X_train_scaled, y_train)

print(f"\nBest parameters: {rf_grid_search.best_params_}")
print(f"Best CV score (neg MSE): {rf_grid_search.best_score_:.4f}")

# Estimator for the winning parameter combination
best_rf_model = rf_grid_search.best_estimator_

# Predictions for both splits
y_train_pred_rf = best_rf_model.predict(X_train_scaled)
y_test_pred_rf = best_rf_model.predict(X_test_scaled)

# Training-split metrics
train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
train_mae_rf = mean_absolute_error(y_train, y_train_pred_rf)
train_r2_rf = r2_score(y_train, y_train_pred_rf)

# Test-split metrics
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
test_mae_rf = mean_absolute_error(y_test, y_test_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse_rf:.4f}")
print(f"  MAE:  {train_mae_rf:.4f}")
print(f"  R²:   {train_r2_rf:.4f}")

print("\nTest Metrics:")
print(f"  RMSE: {test_rmse_rf:.4f}")
print(f"  MAE:  {test_mae_rf:.4f}")
print(f"  R²:   {test_r2_rf:.4f}")


In [None]:
# Complex Model 2: Gradient Boosting with Hyperparameter Tuning
banner = "=" * 60
print(banner)
print("COMPLEX MODEL 2: Gradient Boosting Regressor")
print(banner)

# Candidate values explored exhaustively by the grid search
gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

# 5-fold cross-validated search scored on negative MSE, using all CPU cores
gb_model = GradientBoostingRegressor(random_state=42)
gb_grid_search = GridSearchCV(
    gb_model,
    gb_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

print("\nPerforming Grid Search for Gradient Boosting...")
gb_grid_search.fit(X_train_scaled, y_train)

print(f"\nBest parameters: {gb_grid_search.best_params_}")
print(f"Best CV score (neg MSE): {gb_grid_search.best_score_:.4f}")

# Estimator for the winning parameter combination
best_gb_model = gb_grid_search.best_estimator_

# Predictions for both splits
y_train_pred_gb = best_gb_model.predict(X_train_scaled)
y_test_pred_gb = best_gb_model.predict(X_test_scaled)

# Training-split metrics
train_rmse_gb = np.sqrt(mean_squared_error(y_train, y_train_pred_gb))
train_mae_gb = mean_absolute_error(y_train, y_train_pred_gb)
train_r2_gb = r2_score(y_train, y_train_pred_gb)

# Test-split metrics
test_rmse_gb = np.sqrt(mean_squared_error(y_test, y_test_pred_gb))
test_mae_gb = mean_absolute_error(y_test, y_test_pred_gb)
test_r2_gb = r2_score(y_test, y_test_pred_gb)

print("\nTraining Metrics:")
print(f"  RMSE: {train_rmse_gb:.4f}")
print(f"  MAE:  {train_mae_gb:.4f}")
print(f"  R²:   {train_r2_gb:.4f}")

print("\nTest Metrics:")
print(f"  RMSE: {test_rmse_gb:.4f}")
print(f"  MAE:  {test_mae_gb:.4f}")
print(f"  R²:   {test_r2_gb:.4f}")


## 8. Model Comparison and Selection


In [None]:
# Compare all models: one row per model, train/test value for each metric
comparison_columns = [
    'Model',
    'Train RMSE', 'Test RMSE',
    'Train MAE', 'Test MAE',
    'Train R²', 'Test R²',
]
comparison_rows = [
    ('Linear Regression (Simple)',
     train_rmse_simple, test_rmse_simple,
     train_mae_simple, test_mae_simple,
     train_r2_simple, test_r2_simple),
    ('Random Forest (Complex)',
     train_rmse_rf, test_rmse_rf,
     train_mae_rf, test_mae_rf,
     train_r2_rf, test_r2_rf),
    ('Gradient Boosting (Complex)',
     train_rmse_gb, test_rmse_gb,
     train_mae_gb, test_mae_gb,
     train_r2_gb, test_r2_gb),
]
models_comparison = pd.DataFrame(comparison_rows, columns=comparison_columns)

wide_banner = "=" * 80
print(wide_banner)
print("MODEL COMPARISON")
print(wide_banner)
print(models_comparison.to_string(index=False))

# The winner is the row with the highest test R².
# NOTE(review): picking the model on the test split makes the reported test
# metrics optimistic - consider selecting on a cross-validation score instead.
best_model_idx = models_comparison['Test R²'].idxmax()
best_row = models_comparison.loc[best_model_idx]
best_model_name = best_row['Model']

print("\n" + wide_banner)
print(f"BEST MODEL: {best_model_name}")
print(wide_banner)
print(f"Test R² Score: {best_row['Test R²']:.4f}")
print(f"Test RMSE: {best_row['Test RMSE']:.4f}")
print(f"Test MAE: {best_row['Test MAE']:.4f}")


In [None]:
# Visualize model comparison: one bar chart per test metric
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

bar_colors = ['skyblue', 'lightgreen', 'coral']
# (column in models_comparison, panel title, y-axis label)
metric_panels = [
    ('Test RMSE', 'Test RMSE Comparison', 'RMSE'),
    ('Test MAE', 'Test MAE Comparison', 'MAE'),
    ('Test R²', 'Test R² Comparison', 'R² Score'),
]

for panel_ax, (metric_col, panel_title, y_label) in zip(axes, metric_panels):
    panel_ax.bar(models_comparison['Model'], models_comparison[metric_col], color=bar_colors)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


## 9. Model Results Visualization


In [None]:
# Select best model predictions for visualization.
# Any name other than the two listed here falls through to Gradient Boosting.
outputs_by_model = {
    'Linear Regression (Simple)': (y_test_pred_simple, simple_model),
    'Random Forest (Complex)': (y_test_pred_rf, best_rf_model),
}
best_predictions, best_model_obj = outputs_by_model.get(
    best_model_name, (y_test_pred_gb, best_gb_model)
)

# 2x2 grid of diagnostic plots on the test split
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_scatter, ax_resid), (ax_hist, ax_series) = axes
label_font = {'fontsize': 12}
title_font = {'fontsize': 14, 'fontweight': 'bold'}

# Plot 1: actual vs predicted, with the identity line as reference
score_range = [y_test.min(), y_test.max()]
ax_scatter.scatter(y_test, best_predictions, alpha=0.6, color='steelblue')
ax_scatter.plot(score_range, score_range, 'r--', lw=2, label='Perfect Prediction')
ax_scatter.set_xlabel('Actual Spending Score', **label_font)
ax_scatter.set_ylabel('Predicted Spending Score', **label_font)
ax_scatter.set_title(f'Actual vs Predicted - {best_model_name}', **title_font)
ax_scatter.legend()
ax_scatter.grid(True, alpha=0.3)

# Plot 2: residuals (actual minus predicted) against the prediction
residuals = y_test - best_predictions
ax_resid.scatter(best_predictions, residuals, alpha=0.6, color='coral')
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted Spending Score', **label_font)
ax_resid.set_ylabel('Residuals', **label_font)
ax_resid.set_title('Residual Plot', **title_font)
ax_resid.grid(True, alpha=0.3)

# Plot 3: histogram of the residuals
ax_hist.hist(residuals, bins=20, edgecolor='black', color='lightgreen', alpha=0.7)
ax_hist.axvline(x=0, color='r', linestyle='--', lw=2)
ax_hist.set_xlabel('Residuals', **label_font)
ax_hist.set_ylabel('Frequency', **label_font)
ax_hist.set_title('Distribution of Residuals', **title_font)
ax_hist.grid(True, alpha=0.3)

# Plot 4: actual and predicted values by position in the test split
test_indices = range(len(y_test))
ax_series.plot(test_indices, y_test.values, 'o-', label='Actual', alpha=0.7, color='steelblue')
ax_series.plot(test_indices, best_predictions, 's-', label='Predicted', alpha=0.7, color='coral')
ax_series.set_xlabel('Test Sample Index', **label_font)
ax_series.set_ylabel('Spending Score', **label_font)
ax_series.set_title('Actual vs Predicted Over Test Samples', **title_font)
ax_series.legend()
ax_series.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Feature importance chart: impurity-based importances for tree ensembles,
# signed coefficients (ranked by magnitude) for the linear model
is_tree_model = hasattr(best_model_obj, 'feature_importances_')

if is_tree_model:
    value_col = 'Importance'
    feature_importance = pd.DataFrame({
        'Feature': X_train_scaled.columns,
        'Importance': best_model_obj.feature_importances_
    }).sort_values('Importance', ascending=False)
    bar_palette = 'viridis'
    chart_title = f'Top 15 Feature Importance - {best_model_name}'
    x_axis_label = 'Importance Score'
    table_heading = "\nTop 15 Most Important Features:"
else:
    value_col = 'Coefficient'
    feature_importance = pd.DataFrame({
        'Feature': X_train_scaled.columns,
        'Coefficient': best_model_obj.coef_
    }).sort_values('Coefficient', key=abs, ascending=False)
    bar_palette = 'coolwarm'
    chart_title = f'Top 15 Feature Coefficients - {best_model_name}'
    x_axis_label = 'Coefficient Value'
    table_heading = "\nTop 15 Most Important Features (by absolute coefficient):"

top_features = feature_importance.head(15)

plt.figure(figsize=(12, 8))
sns.barplot(data=top_features, x=value_col, y='Feature', palette=bar_palette)
plt.title(chart_title, fontsize=16, fontweight='bold', pad=20)
plt.xlabel(x_axis_label, fontsize=12)
if not is_tree_model:
    # Coefficients are signed, so mark zero
    plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.tight_layout()
plt.show()

print(table_heading)
print(top_features.to_string(index=False))


## 10. Interpretation and Reporting


In [None]:
# Headline metrics of the selected model plus a coarse verbal rating
wide_banner = "=" * 80
print(wide_banner)
print("INTERPRETATION AND INSIGHTS")
print(wide_banner)

best_row = models_comparison.loc[best_model_idx]
r2_score_val = models_comparison.loc[best_model_idx, 'Test R²']

print("\n1. MODEL PERFORMANCE SUMMARY:")
print(f"   Best Model: {best_model_name}")
print(f"   Test R² Score: {r2_score_val:.4f}")
print(f"   Test RMSE: {best_row['Test RMSE']:.4f}")
print(f"   Test MAE: {best_row['Test MAE']:.4f}")

# First threshold that the test R² strictly exceeds decides the rating
performance_level = "Poor"
for r2_threshold, rating in ((0.7, "Excellent"), (0.5, "Good"), (0.3, "Moderate")):
    if r2_score_val > r2_threshold:
        performance_level = rating
        break

print(f"   Performance Level: {performance_level}")
print(f"   The model explains {r2_score_val*100:.2f}% of the variance in spending scores.")


In [None]:
print("\n2. CUSTOMER ATTRIBUTES INFLUENCING ANNUAL SPENDING:")

# Get feature importance for interpretation: native importances for tree
# ensembles, absolute coefficients for the linear model
if hasattr(best_model_obj, 'feature_importances_'):
    importance_values = best_model_obj.feature_importances_
else:
    importance_values = np.abs(best_model_obj.coef_)

importance_df = pd.DataFrame({
    'Feature': X_train_scaled.columns,
    'Importance': importance_values
}).sort_values('Importance', ascending=False)

print("\n   Top 10 Most Influential Attributes:")
# Number the rows by their position in the sorted table. The DataFrame index
# still holds each feature's original column position, so it must not be used
# as the rank.
top_attributes = importance_df.head(10)
for rank, (feature, importance) in enumerate(
        zip(top_attributes['Feature'], top_attributes['Importance']), start=1):
    print(f"   {rank}. {feature}: {importance:.4f}")

# Analyze original features
original_features = ['Age', 'Annual Income (k$)', 'Gender']
print("\n   Key Insights on Original Features:")

# Age impact
if 'Age' in importance_df['Feature'].values:
    age_importance = importance_df[importance_df['Feature'] == 'Age']['Importance'].values[0]
    print(f"   - Age has importance: {age_importance:.4f}")

# Income impact
if 'Annual Income (k$)' in importance_df['Feature'].values:
    income_importance = importance_df[importance_df['Feature'] == 'Annual Income (k$)']['Importance'].values[0]
    print(f"   - Annual Income has importance: {income_importance:.4f}")

# Gender impact
if 'Gender' in importance_df['Feature'].values:
    gender_importance = importance_df[importance_df['Feature'] == 'Gender']['Importance'].values[0]
    print(f"   - Gender has importance: {gender_importance:.4f}")


In [None]:
print("\n3. INSIGHTS FOR IMPROVING TARGETED MARKETING:")

# Score every customer with the selected model.
# The scaled matrix is wrapped back into a DataFrame so the model receives the
# same feature names it was fitted with (a bare array triggers a feature-name
# mismatch warning in scikit-learn).
X_all_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
df_processed['Predicted_Spending'] = best_model_obj.predict(X_all_scaled)

# Bucket the predictions. The outer edges are open-ended because a regressor's
# output is not guaranteed to stay inside (0, 100]; with closed edges such
# customers would get a NaN category and vanish from every segment below.
# Categories for predictions inside (0, 100] are unchanged.
df_processed['Spending_Category'] = pd.cut(df_processed['Predicted_Spending'],
                                           bins=[-np.inf, 30, 50, 70, np.inf],
                                           labels=['Low', 'Medium', 'High', 'Very High'])

print("\n   Customer Segments by Predicted Spending:")
# observed=False keeps a row for every category, including empty ones, and
# makes that choice explicit instead of relying on the pandas default.
segment_analysis = df_processed.groupby('Spending_Category', observed=False).agg({
    'Age': 'mean',
    'Annual Income (k$)': 'mean',
    'Gender': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'N/A'
}).round(2)

print(segment_analysis)

print("\n   Marketing Recommendations:")
print("   1. High Spending Customers:")
high_spenders = df_processed[df_processed['Spending_Category'].isin(['High', 'Very High'])]
if not high_spenders.empty:
    print(f"      - Average Age: {high_spenders['Age'].mean():.1f} years")
    print(f"      - Average Income: ${high_spenders['Annual Income (k$)'].mean():.1f}k")
    print("      - Focus: Premium products, loyalty programs, exclusive offers")

print("\n   2. Low Spending Customers:")
low_spenders = df_processed[df_processed['Spending_Category'] == 'Low']
if not low_spenders.empty:
    print(f"      - Average Age: {low_spenders['Age'].mean():.1f} years")
    print(f"      - Average Income: ${low_spenders['Annual Income (k$)'].mean():.1f}k")
    print("      - Focus: Discount campaigns, budget-friendly options, value propositions")

print("\n   3. Demographic Targeting:")
gender_spending = df_processed.groupby('Gender')['Predicted_Spending'].mean()
print("      - Average predicted spending by gender:")
for gender, spending in gender_spending.items():
    print(f"        {gender}: {spending:.2f}")


In [None]:
print("\n4. MODEL PERFORMANCE EXPLANATION:")
print(f"   The {best_model_name} achieved the following performance:")
print(f"   - R² Score of {models_comparison.loc[best_model_idx, 'Test R²']:.4f} indicates that the model")
print(f"     explains {models_comparison.loc[best_model_idx, 'Test R²']*100:.2f}% of the variance in spending scores.")
print(f"   - RMSE of {models_comparison.loc[best_model_idx, 'Test RMSE']:.4f} means on average, predictions")
print(f"     deviate from actual values by approximately {models_comparison.loc[best_model_idx, 'Test RMSE']:.2f} points.")
print(f"   - MAE of {models_comparison.loc[best_model_idx, 'Test MAE']:.4f} represents the average absolute")
print(f"     error in spending score predictions.")

print("\n5. SUGGESTED IMPROVEMENTS:")
print("   1. Collect More Data: Increase dataset size for better generalization")
print("   2. Additional Features: Include purchase history, browsing patterns, product preferences")
print("   3. Feature Engineering: Create more domain-specific features (e.g., customer lifetime value)")
print("   4. Ensemble Methods: Combine multiple models for better predictions")
print("   5. Regular Updates: Retrain model periodically with new data")
print("   6. Cross-Validation: Use k-fold cross-validation for more robust evaluation")
print("   7. Hyperparameter Optimization: Use more sophisticated methods (Bayesian Optimization)")
print("   8. Feature Selection: Remove less important features to reduce overfitting")

print("\n6. REAL-WORLD APPLICATIONS:")
print("   1. Personalized Marketing: Target customers with high predicted spending scores")
print("   2. Budget Allocation: Allocate marketing budget based on customer segments")
print("   3. Product Recommendations: Suggest products to customers based on spending patterns")
print("   4. Customer Retention: Identify high-value customers for retention programs")
print("   5. Pricing Strategy: Adjust pricing for different customer segments")
print("   6. Inventory Management: Stock products preferred by high-spending customers")
print("   7. Campaign Optimization: Design campaigns targeting specific spending score ranges")
print("   8. Customer Acquisition: Identify characteristics of high-spending customers for targeting")

print("\n" + "="*80)


## 11. Additional Analysis - Customer Segmentation


In [None]:
# Create customer segments based on predicted spending.
# The outer bin edges are open-ended so a prediction at or below 0, or above
# 100, still lands in the lowest / highest segment instead of becoming NaN.
# Segments for predictions inside (0, 100] are unchanged.
df_processed['Customer_Segment'] = pd.cut(df_processed['Predicted_Spending'],
                                           bins=[-np.inf, 30, 50, 70, np.inf],
                                           labels=['Low Spender', 'Medium Spender', 'High Spender', 'VIP'])

# Visualize segments
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Segment distribution. value_counts() on a categorical also lists segments
# with zero customers; those are left out of the pie only, since empty wedges
# just produce overlapping labels. The printed summary keeps them.
segment_counts = df_processed['Customer_Segment'].value_counts()
populated_segments = segment_counts[segment_counts > 0]
axes[0, 0].pie(populated_segments.values, labels=populated_segments.index, autopct='%1.1f%%', startangle=90)
axes[0, 0].set_title('Customer Segment Distribution', fontsize=14, fontweight='bold')

# Age distribution by segment
sns.boxplot(data=df_processed, x='Customer_Segment', y='Age', ax=axes[0, 1])
axes[0, 1].set_title('Age Distribution by Customer Segment', fontsize=14, fontweight='bold')
axes[0, 1].tick_params(axis='x', rotation=45)

# Income distribution by segment
sns.boxplot(data=df_processed, x='Customer_Segment', y='Annual Income (k$)', ax=axes[1, 0])
axes[1, 0].set_title('Income Distribution by Customer Segment', fontsize=14, fontweight='bold')
axes[1, 0].tick_params(axis='x', rotation=45)

# Gender distribution by segment
segment_gender = pd.crosstab(df_processed['Customer_Segment'], df_processed['Gender'])
segment_gender.plot(kind='bar', ax=axes[1, 1], color=['skyblue', 'lightcoral'])
axes[1, 1].set_title('Gender Distribution by Customer Segment', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Customer Segment')
axes[1, 1].set_ylabel('Count')
axes[1, 1].legend(title='Gender')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("Customer Segment Summary:")
print(segment_counts)


## 12. Final Summary and Export


In [None]:
# Assemble the per-customer export: identifiers and raw attributes, actual and
# predicted score, assigned segment, and the absolute gap between the scores.
# NOTE(review): predictions were produced for every row, so the error column
# mixes rows the model was trained on with held-out rows.
actual_score = df_processed['Spending Score (1-100)']
predicted_score = df_processed['Predicted_Spending']

final_summary = pd.DataFrame({
    'CustomerID': df_processed['CustomerID'],
    'Age': df_processed['Age'],
    'Gender': df_processed['Gender'],
    'Annual_Income_k': df_processed['Annual Income (k$)'],
    'Actual_Spending_Score': actual_score,
    'Predicted_Spending_Score': predicted_score,
    'Customer_Segment': df_processed['Customer_Segment'],
    'Prediction_Error': (actual_score - predicted_score).abs(),
})

print("Final Summary Dataset:")
print(final_summary.head(10))

# Write the table next to the notebook, without the row index
output_csv = 'customer_spending_predictions.csv'
final_summary.to_csv(output_csv, index=False)
print("\nPredictions saved to 'customer_spending_predictions.csv'")


In [None]:
# Closing banner with the headline results
wide_banner = "=" * 80
best_test_r2 = models_comparison.loc[best_model_idx, 'Test R²']

takeaways = [
    f"1. Best Model: {best_model_name}",
    f"2. Model Performance: R² = {best_test_r2:.4f}",
    "3. The model can predict customer spending scores with reasonable accuracy",
    "4. Customer attributes like Age, Income, and derived features significantly influence spending",
    "5. The model can be used for targeted marketing campaigns and customer segmentation",
]

print("\n" + wide_banner)
print("ANALYSIS COMPLETE")
print(wide_banner)
print("\nKey Takeaways:")
for takeaway in takeaways:
    print(takeaway)
print("\n" + wide_banner)
