In [None]:
import pandas as pd

# Location of the customer dataset, relative to the notebook's working directory.
file_path = 'car_purchasing.csv'
# 'latin-1' maps every byte to a character, so the read cannot fail on decoding;
# if accented characters come out wrong, 'cp1252' is the usual alternative to try.
df = pd.read_csv(file_path, encoding='latin-1')
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def adjusted_r2(r2, n, p):
    """Return R² corrected for the number of predictors.

    Args:
        r2: Plain coefficient of determination.
        n: Number of samples the score was computed on.
        p: Number of predictors used by the model.

    Returns:
        ``1 - (1 - r2) * (n - 1) / (n - p - 1)`` when ``n - p - 1`` is positive.
        Otherwise a warning is printed and ``r2`` is returned unchanged.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        # Not enough samples for this many predictors: the correction is undefined.
        print(f"Warning: Adjusted R² calculation invalid (n={n}, p={p}). Returning R².")
        return r2
    unexplained = (1 - r2) * (n - 1) / residual_dof
    return 1 - unexplained

def preprocess_data(data):
    """Clean the raw customer table so it can be fed to the regressors.

    Steps, in order:
      1. Drop columns that should not be used as predictors: the customer name,
         the e-mail address, and the ``Unnamed: 0`` row counter that shows up
         when the CSV was written together with its index.
      2. One-hot encode ``country`` and ``gender`` (first level dropped).
      3. Fill missing values in numeric columns with the column mean.
      4. Remove rows whose ``car purchase amount`` falls outside the
         1.5 * IQR fences.

    Args:
        data: DataFrame containing ``country``, ``gender`` and
            ``car purchase amount`` columns. The caller's frame is not modified.

    Returns:
        A new, independent DataFrame with the steps above applied.
    """
    target = 'car purchase amount'

    # 'Unnamed: 0' is just the saved row index; left in, it would become a feature.
    irrelevant = [col for col in ['Unnamed: 0', 'customer name', 'customer e-mail'] if col in data.columns]
    if irrelevant:
        data = data.drop(columns=irrelevant)

    data = pd.get_dummies(data, columns=['country', 'gender'], drop_first=True)

    numeric_columns = data.select_dtypes(include=[np.number]).columns
    data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

    Q1 = data[target].quantile(0.25)
    Q3 = data[target].quantile(0.75)
    IQR = Q3 - Q1
    is_outlier = (data[target] < (Q1 - 1.5 * IQR)) | (data[target] > (Q3 + 1.5 * IQR))

    # Return an independent copy so that adding columns to the result later does
    # not trigger pandas' SettingWithCopyWarning on a filtered slice.
    return data[~is_outlier].copy()

def feature_engineering(data):
    """Add a ``wealth_index`` column derived from the financial features.

    The index is ``(annual Salary + net worth - credit card debt) / 3``.

    Args:
        data: DataFrame with ``annual Salary``, ``net worth`` and
            ``credit card debt`` columns. It is left untouched.

    Returns:
        A new DataFrame equal to ``data`` plus the ``wealth_index`` column.
    """
    wealth_index = (data['annual Salary'] + data['net worth'] - data['credit card debt']) / 3
    # `assign` builds a new frame, so neither the caller's object nor a filtered
    # slice is written to (which avoids SettingWithCopyWarning).
    return data.assign(wealth_index=wealth_index)

def train_models(X, y):
    """Fit Linear Regression, Random Forest and XGBoost on an 80/20 split.

    The data is split first and the ``StandardScaler`` is fitted on the training
    rows only; the held-out rows are transformed with those training statistics.
    Fitting the scaler on the full matrix before splitting would let test-set
    means and variances leak into training.

    Args:
        X: Feature matrix (DataFrame or 2-D array).
        y: Target values aligned with the rows of ``X``.

    Returns:
        Tuple ``(lr, rf, xgb, X_test, y_test, y_pred_lr, y_pred_rf, y_pred_xgb)``:
        the three fitted models, the scaled held-out features, the held-out
        targets, and each model's predictions on the held-out features.
    """
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Learn the scaling parameters from the training portion only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)

    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)

    xgb = XGBRegressor(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)

    return lr, rf, xgb, X_test, y_test, y_pred_lr, y_pred_rf, y_pred_xgb

def evaluate_models(lr, rf, xgb, X, y, X_test, y_test, y_pred_lr, y_pred_rf, y_pred_xgb):
    """Print hold-out metrics for each model and 5-fold CV R² for the tree models.

    For Linear Regression, Random Forest and XGBoost this prints R², MSE, MAE and
    adjusted R² computed from the supplied hold-out predictions. It then
    cross-validates ``rf`` and ``xgb`` on a standardized copy of the full ``X``
    and prints the mean R² with a +/- two-standard-deviation band.

    Args:
        lr, rf, xgb: Fitted estimators (``lr`` is accepted but not used here).
        X, y: Full feature matrix and target, used for cross-validation.
        X_test, y_test: Held-out features (only the shape is read) and targets.
        y_pred_lr, y_pred_rf, y_pred_xgb: Hold-out predictions per model.
    """
    n_samples, n_features = X_test.shape
    print(f"Test set: {n_samples} samples, {n_features} features")

    reports = (
        ("\nLinear Regression:", y_pred_lr),
        ("\nRandom Forest:", y_pred_rf),
        ("\nXGBoost:", y_pred_xgb),
    )
    for heading, predictions in reports:
        print(heading)
        r2 = r2_score(y_test, predictions)
        print("R2:", r2)
        print("MSE:", mean_squared_error(y_test, predictions))
        print("MAE:", mean_absolute_error(y_test, predictions))
        # adjusted_r2 may print its own warning before this line is emitted.
        print("Adjusted R2:", adjusted_r2(r2, n_samples, n_features))

    # Both cross-validations run on the same standardized matrix.
    X_standardized = StandardScaler().fit_transform(X)
    rf_cv_scores = cross_val_score(rf, X_standardized, y, cv=5, scoring='r2')
    xgb_cv_scores = cross_val_score(xgb, X_standardized, y, cv=5, scoring='r2')
    print("\nRandom Forest Cross-Validated R2:", rf_cv_scores.mean(), "+/-", rf_cv_scores.std() * 2)
    print("XGBoost Cross-Validated R2:", xgb_cv_scores.mean(), "+/-", xgb_cv_scores.std() * 2)

def save_visualizations(data, rf, X):
    """Write a correlation heatmap and a top-10 feature-importance chart to ``images/``.

    Args:
        data: DataFrame whose pairwise column correlations are plotted.
        rf: Fitted model exposing ``feature_importances_``.
        X: Feature DataFrame; its column names label the importances.
    """
    os.makedirs('images', exist_ok=True)

    # NOTE(review): every column of `data` is annotated; with many one-hot
    # columns the heatmap may be unreadable - consider restricting the columns.
    heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=heat_ax)
    heat_ax.set_title('Correlation Heatmap')
    heat_fig.savefig('images/correlation_heatmap.png')
    plt.close(heat_fig)

    ranking = (
        pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
        .sort_values('Importance', ascending=False)
        .head(10)
    )
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=ranking, ax=bar_ax)
    bar_ax.set_title('Top 10 Important Features')
    bar_fig.savefig('images/feature_importance.png')
    plt.close(bar_fig)

    print("Visualizations saved to 'images/' folder.")

def save_results(lr, rf, xgb, X_test, y_test, y_pred_lr, y_pred_rf, y_pred_xgb, rf_cv_scores, xgb_cv_scores):
    """Write the evaluation report to ``model_results.txt`` (overwriting it).

    The file holds one section per model with R², MSE, MAE and adjusted R² on
    the hold-out set, followed by the cross-validated R² summaries.

    Args:
        lr, rf, xgb: Accepted for signature symmetry; not read here.
        X_test: Held-out features; only the shape is used.
        y_test: Held-out targets.
        y_pred_lr, y_pred_rf, y_pred_xgb: Hold-out predictions per model.
        rf_cv_scores, xgb_cv_scores: Arrays of cross-validation R² scores.
    """
    n_samples, n_features = X_test.shape
    sections = (
        ("Linear Regression:\n", y_pred_lr),
        ("\nRandom Forest:\n", y_pred_rf),
        ("\nXGBoost:\n", y_pred_xgb),
    )
    with open('model_results.txt', 'w') as report:
        for heading, predictions in sections:
            report.write(heading)
            r2 = r2_score(y_test, predictions)
            report.write(f"R2: {r2}\n")
            report.write(f"MSE: {mean_squared_error(y_test, predictions)}\n")
            report.write(f"MAE: {mean_absolute_error(y_test, predictions)}\n")
            adjusted = adjusted_r2(r2, n_samples, n_features)
            report.write(f"Adjusted R2: {adjusted}\n")
        report.write(f"\nRandom Forest Cross-Validated R2: {rf_cv_scores.mean()} +/- {rf_cv_scores.std() * 2}\n")
        report.write(f"XGBoost Cross-Validated R2: {xgb_cv_scores.mean()} +/- {xgb_cv_scores.std() * 2}\n")

def provide_marketing_insights(X, high_wealth_threshold=380000, high_age_threshold=48):
    """Print marketing recommendations for the high-wealth, older customer segment.

    Args:
        X: DataFrame with ``wealth_index`` and ``age`` columns.
        high_wealth_threshold: A customer is in the segment only if their
            ``wealth_index`` is strictly above this value.
        high_age_threshold: A customer is in the segment only if their ``age``
            is strictly above this value.

    The share of rows meeting both conditions is reported as a percentage; an
    empty ``X`` is reported as 0.0% instead of raising ``ZeroDivisionError``.
    """
    total_customers = len(X)
    in_segment = (X['wealth_index'] > high_wealth_threshold) & (X['age'] > high_age_threshold)
    # Guard the division so an empty frame still produces a report.
    target_segment = in_segment.sum() / total_customers * 100 if total_customers else 0.0

    print("\nBusiness Insights for Marketing Strategy Optimization:")
    print(f"- Segmentation Strategy: Target customers with 'wealth_index' > ${high_wealth_threshold:,} and 'age' > {high_age_threshold} years.")
    print(f"  - Approximately {target_segment:.1f}% of customers fall into this high-value segment.")
    print("- Potential Impact: A 10% increase in targeting this segment could proportionally boost sales, based on feature importance.")
    print("- Marketing Recommendation: Allocate 50% of the marketing budget to this segment with luxury car campaigns.")
    print("- Limitation: The dataset lacks advertising spend and promotions data, preventing direct optimization of marketing budgets or campaigns.")
    print("- Future Enhancement: Incorporate marketing data (e.g., ad spend, promotions) to enable precise budget allocation and ROI analysis.")

def main():
    """Run the whole pipeline: load, clean, engineer features, train, evaluate, report."""
    # Load the CSV and run it through cleaning and feature engineering.
    raw = pd.read_csv('car_purchasing.csv', encoding='latin1')
    data = feature_engineering(preprocess_data(raw))

    # Target, plus features without the columns already folded into wealth_index.
    y = data['car purchase amount']
    X = data.drop(
        columns=['car purchase amount', 'annual Salary', 'net worth', 'credit card debt']
    )  # Avoid multicollinearity

    # Fit the models, then print their hold-out and cross-validation metrics.
    fitted = train_models(X, y)
    lr, rf, xgb = fitted[:3]
    holdout = fitted[3:]  # X_test, y_test, y_pred_lr, y_pred_rf, y_pred_xgb
    evaluate_models(lr, rf, xgb, X, y, *holdout)

    # Save the plots under images/.
    save_visualizations(data, rf, X)

    # NOTE: this repeats the cross-validation evaluate_models already ran,
    # because that function prints the scores without returning them.
    X_standardized = StandardScaler().fit_transform(X)
    rf_cv_scores = cross_val_score(rf, X_standardized, y, cv=5, scoring='r2')
    xgb_cv_scores = cross_val_score(xgb, X_standardized, y, cv=5, scoring='r2')
    save_results(lr, rf, xgb, *holdout, rf_cv_scores, xgb_cv_scores)

    # Print the marketing recommendations.
    provide_marketing_insights(X)

    print("Script completed successfully.")

if __name__ == "__main__":
    main()

Test set: 99 samples, 213 features

Linear Regression:
R2: 0.325430422559425
MSE: 79524717.44174218
MAE: 7057.373739816686
Adjusted R2: 0.325430422559425

Random Forest:
R2: 0.6432427142286976
MSE: 42057963.02568209
MAE: 5217.414417709087
Adjusted R2: 0.6432427142286976

XGBoost:
R2: 0.603617153199183
MSE: 46729403.377763875
MAE: 5330.035142455808
Adjusted R2: 0.603617153199183

Random Forest Cross-Validated R2: 0.6146460329894577 +/- 0.12183875425466552
XGBoost Cross-Validated R2: 0.5297661761983244 +/- 0.14566343690372466
Visualizations saved to 'images/' folder.

Business Insights for Marketing Strategy Optimization:
- Segmentation Strategy: Target customers with 'wealth_index' > $380,000 and 'age' > 48 years.
  - Approximately 0.0% of customers fall into this high-value segment.
- Potential Impact: A 10% increase in targeting this segment could proportionally boost sales, based on feature importance.
- Marketing Recommendation: Allocate 50% of the marketing budget to this segment w