In [2]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from joblib import Parallel, delayed


In [3]:
# Directory that holds the three parquet splits. Kept in a single constant so
# the notebook can be pointed at another location by editing one line instead
# of three. The resulting paths are identical to the previously hard-coded ones.
DATA_DIR = 'C:/Users/parth/Downloads'

# Load the pre-built training / test / validation splits.
training_data = pd.read_parquet(f'{DATA_DIR}/training_set_v2.parquet')
test_data = pd.read_parquet(f'{DATA_DIR}/test_set_v2.parquet')
validation_data = pd.read_parquet(f'{DATA_DIR}/validation_set_v2.parquet')

In [4]:
# Step 2: Standardize the data
# Columns that are not model inputs: the target ('events') plus two columns
# that are presumably an identifier and a date (TODO confirm against the schema).
columns_to_drop = ['ItemKey', 'RWB_EFFECTIVE_DATE']
non_feature_columns = ['events'] + columns_to_drop

# `columns=` already targets the column axis, so no separate `axis` is passed.
X_train, X_test, X_val = (
    split.drop(columns=non_feature_columns)
    for split in (training_data, test_data, validation_data)
)

# Fit the scaler on the training features only, then apply the same fitted
# transformation to the test and validation features.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
X_val_std = scaler.transform(X_val)

In [5]:
# Step 3: Instantiate dummy regressors
# Baselines that always predict a single constant taken from the training
# target; they give the error level the real models are compared against.
dummy_regressor_mean = DummyRegressor(
    strategy='mean',
)
dummy_regressor_median = DummyRegressor(
    strategy='median',
)
# Constant prediction at the 0.25 quantile of the training target.
dummy_regressor_quantile = DummyRegressor(
    quantile=0.25,
    strategy='quantile',
)

In [6]:
# Step 4: Evaluate model architectures
# Candidate regressors keyed by the display name used in the results table.
# All estimators use library defaults except for `random_state`: the
# tree-based models are stochastic, so they are seeded to make the comparison
# reproducible across kernel restarts. 42 matches the seed already used for
# the train/validation split and the hyperparameter searches in this notebook.
models = {
    # Constant-prediction baselines.
    'Dummy Mean': dummy_regressor_mean,
    'Dummy Median': dummy_regressor_median,
    'Dummy Quantile': dummy_regressor_quantile,
    # Linear models (default regularization strengths).
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'Elastic Net Regression': ElasticNet(),
    # Tree-based models, seeded for reproducibility.
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=42)
}

In [7]:
# Step 5: Evaluate the performance of each model
# Every candidate is fitted on the standardized training features and scored
# on the standardized test features. One [name, MSE, RMSE, MAE] row is
# collected per model.
y_train_true = training_data['events']
y_test_true = test_data['events']

test_results = []
for model_name, model in models.items():
    # scikit-learn's fit() returns the fitted estimator, so predict() is chained.
    y_test_pred = model.fit(X_train_std, y_train_true).predict(X_test_std)

    test_mse = mean_squared_error(y_test_true, y_test_pred)
    test_row = [
        model_name,
        test_mse,
        np.sqrt(test_mse),  # RMSE
        mean_absolute_error(y_test_true, y_test_pred),
    ]
    test_results.append(test_row)

    # if model_name in ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression', 'Decision Tree Regression', 'Random Forest Regression', 'Gradient Boosting Regression']:
    #     if hasattr(model, 'coef_'):
    #         feature_importances = model.coef_
    #         sorted_indices = np.argsort(np.abs(feature_importances))[::-1][:5]  
    #     elif hasattr(model, 'feature_importances_'):  
    #         feature_importances = model.feature_importances_
    #         sorted_indices = np.argsort(feature_importances)[::-1][:5]  
    #     else:
    #         sorted_indices = None

    #     if sorted_indices is not None:
    #         print(f'Feature importances for {model_name}:')
    #         for idx in sorted_indices:
    #             feature_name = X_train.columns[idx]
    #             importance = feature_importances[idx]
    #             print(f'{feature_name}: {importance}')



In [8]:
# Step 6: Aggregate test performance results into a data frame
# Column order mirrors the order in which each row of test_results was built.
metric_columns = ['Model', 'Test MSE', 'Test RMSE', 'Test MAE']
test_metrics_df = pd.DataFrame(data=test_results, columns=metric_columns)
test_metrics_df

Unnamed: 0,Model,Test MSE,Test RMSE,Test MAE
0,Dummy Mean,3.676429,1.917402,1.806277
1,Dummy Median,2.384159,1.544072,1.403132
2,Dummy Quantile,1.097428,1.047582,0.7787389
3,Linear Regression,5.717068e+18,2391039000.0,15813900.0
4,Lasso Regression,3.676429,1.917402,1.806277
5,Ridge Regression,2.80536,1.674921,1.438591
6,Elastic Net Regression,3.65743,1.912441,1.801901
7,Decision Tree Regression,6.591247,2.567342,1.151459
8,Random Forest Regression,2.328374,1.525901,1.117487
9,Gradient Boosting Regression,2.41568,1.554246,1.342009


In [9]:
# # Include 5 fold cross val
# test_results2 = []
# for model_name, model in models.items():
#     cv_scores = cross_val_score(model, X_train_std, training_data['events'], scoring='neg_mean_squared_error', cv=5)
#     mse_cv = -np.mean(cv_scores)
#     rmse_cv = np.sqrt(mse_cv)
#     mae_cv = -np.mean(cross_val_score(model, X_train_std, training_data['events'], scoring='neg_mean_absolute_error', cv=5))

#     model.fit(X_train_std, training_data['events'])
#     predictions_test = model.predict(X_test_std)
#     mse_test = mean_squared_error(test_data['events'], predictions_test)
#     rmse_test = np.sqrt(mse_test)
#     mae_test = mean_absolute_error(test_data['events'], predictions_test)

#     # if model_name in ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression', 'Decision Tree Regression', 'Random Forest Regression', 'Gradient Boosting Regression']:
#     #     if hasattr(model, 'coef_'):  
#     #         feature_importances = model.coef_
#     #         sorted_indices = np.argsort(np.abs(feature_importances))[::-1][:5]  
#     #     elif hasattr(model, 'feature_importances_'):  
#     #         feature_importances = model.feature_importances_
#     #         sorted_indices = np.argsort(feature_importances)[::-1][:5]
#     #     else:
#     #         sorted_indices = None

#     #     if sorted_indices is not None:
#     #         print(f'Feature importances for {model_name}:')
#     #         for idx in sorted_indices:
#     #             feature_name = X_train.columns[idx]
#     #             importance = feature_importances[idx]
#     #             print(f'{feature_name}: {importance}')

#     test_results2.append([model_name, mse_test, rmse_test, mae_test, mse_cv, rmse_cv, mae_cv])

# test_metrics2_df = pd.DataFrame(test_results2, columns=['Model', 'Test MSE', 'Test RMSE', 'Test MAE', 'CV MSE', 'CV RMSE', 'CV MAE'])
# test_metrics2_df


In [10]:
# Define hyperparameter grid for grid search
# Three candidate values for each of four random-forest settings
# (3 * 3 * 3 * 3 = 81 combinations in total).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [11]:
# Define hyperparameter distributions for random search
# Each setting is given as a finite list of candidate values rather than a
# continuous distribution; the candidates are the same as in the grid search.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [12]:
# Split data into training and validation sets
# Holds out 20% of the standardized training rows for comparing the tuned
# models; the split is seeded so it is repeatable.
# NOTE(review): this rebinds X_train and X_val, which the standardization cell
# defined as the feature frames of the training and validation parquet files.
# After this cell they are pieces of X_train_std instead, and the separately
# loaded validation set is not used. Confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_std,
    training_data['events'],
    test_size=0.2,
    random_state=42,
)

# Initialize models
# Base estimator that both hyperparameter searches clone and tune.
rf_model = RandomForestRegressor(random_state=42)

In [13]:
# Grid Search
# Exhaustively evaluates every combination in param_grid with 5-fold
# cross-validation, ranking candidates by negated MSE (higher is better).
# n_jobs=-1 runs the independent candidate/fold fits on all available CPU
# cores instead of one after another; because rf_model is seeded, the selected
# parameters are the same as with a serial search.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameter combination and the estimator refitted with it on X_train.
best_params_grid = grid_search.best_params_
best_rf_grid = grid_search.best_estimator_


In [None]:
# Random Search
# Samples 10 parameter combinations from param_dist (seeded, so the same 10
# are drawn on every run) and scores each with 5-fold cross-validation on
# negated MSE (higher is better).
# n_jobs=-1 runs the independent candidate/fold fits on all available CPU
# cores; the sampled candidates and their scores are unaffected.
random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Best sampled combination and the estimator refitted with it on X_train.
best_params_random = random_search.best_params_
best_rf_random = random_search.best_estimator_

In [None]:
# Evaluate best models on validation data
# Both tuned forests are scored on the same held-out split (X_val, y_val) so
# their validation MSE values are directly comparable.
y_val_pred_grid, y_val_pred_random = (
    best_rf_grid.predict(X_val),
    best_rf_random.predict(X_val),
)
mse_val_grid, mse_val_random = (
    mean_squared_error(y_val, y_val_pred_grid),
    mean_squared_error(y_val, y_val_pred_random),
)

# Report the winning parameters of each search next to its validation error.
print(f'Grid Search - Best Hyperparameters: {best_params_grid}, Validation MSE: {mse_val_grid}')
print(f'Random Search - Best Hyperparameters: {best_params_random}, Validation MSE: {mse_val_random}')