In [30]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import itertools
import shap
import matplotlib

## Preprocessing
- Not including the date column, because it isn't mentioned in the project brief

In [23]:
# Load the data
blender_data = pd.read_csv('MMM_Blender_data_2.csv')

# CREATE MONTH, YEAR, SALES LAGGED
blender_data['Date'] = pd.to_datetime(blender_data['Date'], format='%b-%Y')
# Extract time-related features (e.g., month, day, or time as numeric)
blender_data['month'] = blender_data['Date'].dt.month
blender_data['year'] = blender_data['Date'].dt.year
# Lagging the target variable (for time series modeling)
blender_data['monthly_sales_lagged'] = blender_data['Sales ($)'].shift(1)
# Drop rows with NaN values after lagging (to avoid issues with missing values)
blender_data.dropna(inplace=True)



# Extract time-related features (e.g., month, day, or time as numeric)
blender_data['month'] = blender_data['Date'].dt.month
blender_data['year'] = blender_data['Date'].dt.year

# Lagging the target variable (for time series modeling)
blender_data['monthly_sales_lagged'] = blender_data['Sales ($)'].shift(1)  # Lagged version of Y by 1 day

# Drop rows with NaN values after lagging (to avoid issues with missing values)
blender_data.dropna(inplace=True)

# # Define the independent variables (including the time-related features)
X = blender_data[blender_data.columns.difference(['Date','Sales ($)'])]
y = blender_data['Sales ($)']  # Dependent variable


# Interaction Terms

# Generate all pairwise combinations of independent variables
combinations = list(itertools.combinations(X.columns, 2))

# Create interaction terms
for var1, var2 in combinations:
    interaction_term = f"{var1}x{var2}"  # Column name for interaction
    X[interaction_term] = X[var1]*X[var2]

print(blender_data.info())
print (X.head())
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 2 to 149
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  148 non-null    datetime64[ns]
 1   Sales ($)             148 non-null    int64         
 2   TV_ads ($)            148 non-null    int64         
 3   online_ads ($)        148 non-null    int64         
 4   Print_ads ($)         148 non-null    int64         
 5   Price ($)             148 non-null    float64       
 6   month                 148 non-null    int32         
 7   year                  148 non-null    int32         
 8   monthly_sales_lagged  148 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int32(2), int64(4)
memory usage: 10.4 KB
None
   Price ($)  Print_ads ($)  TV_ads ($)  month  monthly_sales_lagged  \
2      135.0          16016      222756      3             6115500.0   
3      135.0          29690      1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[interaction_term] = X[var1]*X[var2]


## Model Training & Hyperparameter Tuning
- Some basic GridSearch
- Optimizing for `neg_mean_squared_error`

In [24]:
# Perform hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Initialize base model
base_model = XGBRegressor(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get best parameters and model
print("Best parameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)

# Use the best model for predictions
model = grid_search.best_estimator_
y_pred = model.predict(X_test)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Best score: 549696913409.50183


## Model Measurement

In [33]:
# Calculate regression metrics
mse = int(mean_squared_error(y_test, y_pred))
rmse = int(np.sqrt(mse))
mae = int(mean_absolute_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Calculate adjusted R-squared
n = X_test.shape[0]  # number of observations
p = X_test.shape[1]  # number of predictors
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

metrics_df = pd.DataFrame({
    'METRIC': ['R-squared', 'Mean Squared Error', 'Root Mean Squared Error', 'Mean Absolute Error'],
    'VALUE': [f'{r2:.3f}', mse, rmse, mae]
})
print("\nREGRESSION METRICS:\n===========================================")
print(metrics_df.to_string(index=False))

# Feature importance

# Get feature names from model
feature_names = X_train.columns.tolist()

# Calculate standard feature importance
feature_importance = pd.DataFrame({
    'FEATURE': feature_names,
    'IMPORTANCE': model.feature_importances_
})
feature_importance = feature_importance.sort_values('IMPORTANCE', ascending=False)
print("\nFEATURE IMPORTANCE:\n===========================================")
print(feature_importance.to_string(index=False))

# Export feature importance to CSV
feature_importance.to_csv('feature_importance.csv', index=False)

# Calculate SHAP values

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary plot
print("\nSHAP FEATURE IMPORTANCE:\n===========================================")
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# Calculate and display mean absolute SHAP values
shap_importance = pd.DataFrame({
    'FEATURE': feature_names,
    'SHAP_IMPORTANCE': np.abs(shap_values).mean(axis=0)
})
shap_importance = shap_importance.sort_values('SHAP_IMPORTANCE', ascending=False)
print("\nSHAP FEATURE IMPORTANCE VALUES:\n===========================================")
print(shap_importance.to_string(index=False))

# Export SHAP importance to CSV
shap_importance.to_csv('shap_importance.csv', index=False)


REGRESSION METRICS:
                 METRIC        VALUE
              R-squared        0.946
     Mean Squared Error 479165169778
Root Mean Squared Error       692217
    Mean Absolute Error       538438

FEATURE IMPORTANCE:
                            FEATURE  IMPORTANCE
          TV_ads ($)xonline_ads ($)    0.652128
       Print_ads ($)xonline_ads ($)    0.136607
monthly_sales_laggedxonline_ads ($)    0.106906
                          Price ($)    0.029964
                     online_ads ($)    0.019610
                         TV_ads ($)    0.010340
    TV_ads ($)xmonthly_sales_lagged    0.008213
 Print_ads ($)xmonthly_sales_lagged    0.007736
                    TV_ads ($)xyear    0.004835
           Print_ads ($)xTV_ads ($)    0.003948
                               year    0.003439
         monthxmonthly_sales_lagged    0.002181
                Print_ads ($)xmonth    0.001874
                      Print_ads ($)    0.001702
                         monthxyear    0.001601
     

ImportError: matplotlib is not installed so plotting is not available! Run `pip install matplotlib` to fix this.