# Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def dataframe_info(df):
    report = pd.DataFrame(columns=['Column', 'Data Type', 'Unique Count', 'Unique Sample', 'Missing Values', 'Missing Percentage'])
    for column in df.columns:
        data_type = df[column].dtype
        unique_count = df[column].nunique()
        unique_sample = df[column].unique()[:5]
        missing_values = df[column].isnull().sum()
        missing_percentage = (missing_values / len(df)) * 100
        report = pd.concat([report, pd.DataFrame({'Column': [column],
                                                      'Data Type': [data_type],
                                                      'Unique Count': [unique_count],
                                                      'Unique Sample': [unique_sample],
                                                      'Missing Values': [missing_values],
                                                      'Missing Percentage': [missing_percentage.round(4)]})],
                             ignore_index=True)
    return report

# Loading the Data

In [7]:
train_data = pd.read_csv('D:\\CDS590\\data\\processed\\train_data.csv').drop('Speed_Category', axis=1)
test_data = pd.read_csv('D:\\CDS590\\data\\processed\\test_data.csv').drop('Speed_Category', axis=1)

In [8]:
dataframe_info(train_data)

  report = pd.concat([report, pd.DataFrame({'Column': [column],


Unnamed: 0,Column,Data Type,Unique Count,Unique Sample,Missing Values,Missing Percentage
0,route_id,int64,2,"[10, 11]",0,0.0
1,distance,float64,15,"[-0.5619826777423147, -0.0814840550832646, 0.3...",0,0.0
2,month,int64,4,"[3, 2, 5, 4]",0,0.0
3,day_of_week_sin,float64,5,"[-0.433883739117558, 0.9749279121818236, 0.0, ...",0,0.0
4,minutes_cos,float64,110,"[0.4848096202463371, 0.8746197071393957, 0.642...",0,0.0
5,avg_temp_C,float64,4,"[-0.57228042328685, 0.433766690262008, -1.5783...",0,0.0
6,temperature,float64,13,"[2.1127378756103314, -0.534445419731945, 0.347...",0,0.0
7,precip_mm,float64,26,"[0.0745795452334384, -0.3089724016813879, 5.70...",0,0.0
8,humidity,float64,30,"[-2.476851137319068, 0.6320770499266944, 0.773...",0,0.0
9,visibility,float64,6,"[-4.070229021424351, 0.2705098113156833, -0.59...",0,0.0


In [9]:
X_train = train_data.drop('duration', axis=1)
y_train = train_data['duration']
X_test = test_data.drop('duration', axis=1)
y_test = test_data['duration']
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((441, 15), (441,), (111, 15), (111,))

In [10]:
dataframe_info(train_data)

  report = pd.concat([report, pd.DataFrame({'Column': [column],


Unnamed: 0,Column,Data Type,Unique Count,Unique Sample,Missing Values,Missing Percentage
0,route_id,int64,2,"[10, 11]",0,0.0
1,distance,float64,15,"[-0.5619826777423147, -0.0814840550832646, 0.3...",0,0.0
2,month,int64,4,"[3, 2, 5, 4]",0,0.0
3,day_of_week_sin,float64,5,"[-0.433883739117558, 0.9749279121818236, 0.0, ...",0,0.0
4,minutes_cos,float64,110,"[0.4848096202463371, 0.8746197071393957, 0.642...",0,0.0
5,avg_temp_C,float64,4,"[-0.57228042328685, 0.433766690262008, -1.5783...",0,0.0
6,temperature,float64,13,"[2.1127378756103314, -0.534445419731945, 0.347...",0,0.0
7,precip_mm,float64,26,"[0.0745795452334384, -0.3089724016813879, 5.70...",0,0.0
8,humidity,float64,30,"[-2.476851137319068, 0.6320770499266944, 0.773...",0,0.0
9,visibility,float64,6,"[-4.070229021424351, 0.2705098113156833, -0.59...",0,0.0


# Feature Selection

In [11]:
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_regression, f_regression
from sklearn.linear_model import LassoLarsCV, LassoLarsIC

# Total number of features in the original dataset
all_cols = X_train.columns.to_list()
print(f"All features: {all_cols}\n")

# LassoLarsIC for feature selection
print("LassoLarsIC Feature Selection")
llarsic_selection = SelectFromModel(LassoLarsIC(criterion='aic', max_iter=1000, eps=0.001), max_features=10)
llarsic_selection.fit(X_train, y_train)
llarsic_cols = X_train.columns[llarsic_selection.get_support()].tolist()
print(f"Number of selected features: {len(llarsic_cols)}")
print(f"Selected features: {llarsic_cols}\n")

# SelectKBest with mutual_info_regression for feature selection
print("SelectKBest with mutual_info_regression Feature Selection")
mic_selection = SelectKBest(mutual_info_regression, k=10)
mic_selection.fit(X_train, y_train)
mic_cols = X_train.columns[mic_selection.get_support()].tolist()
print(f"Number of selected features: {len(mic_cols)}")
print(f"Selected features: {mic_cols}\n")

print("SelectKBest with f_regression Feature Selection")
f_regression_selection = SelectKBest(score_func=f_regression, k=10)
f_regression_selection.fit_transform(X_train, y_train) 
f_regression_cols = X_train.columns[f_regression_selection.get_support()].to_list()
print(f"Number of selected features: {len(f_regression_cols)}")
print(f"Selected features: {f_regression_cols}\n")


All features: ['route_id', 'distance', 'month', 'day_of_week_sin', 'minutes_cos', 'avg_temp_C', 'temperature', 'precip_mm', 'humidity', 'visibility', 'pressure', 'cloud_cover', 'wind_speed', 'start_stop_id_encoded', 'end_stop_id_encoded']

LassoLarsIC Feature Selection
Number of selected features: 10
Selected features: ['route_id', 'distance', 'month', 'day_of_week_sin', 'temperature', 'precip_mm', 'humidity', 'visibility', 'pressure', 'start_stop_id_encoded']

SelectKBest with mutual_info_regression Feature Selection
Number of selected features: 10
Selected features: ['route_id', 'distance', 'minutes_cos', 'avg_temp_C', 'temperature', 'precip_mm', 'humidity', 'pressure', 'start_stop_id_encoded', 'end_stop_id_encoded']

SelectKBest with f_regression Feature Selection
Number of selected features: 10
Selected features: ['route_id', 'distance', 'day_of_week_sin', 'minutes_cos', 'temperature', 'precip_mm', 'pressure', 'cloud_cover', 'start_stop_id_encoded', 'end_stop_id_encoded']



# Evaluation 1

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import matplotlib.pyplot as plt

def evaluate_regression_mdl(fitted_model, X_train, X_test, y_train, y_test):
    y_train_pred = fitted_model.predict(X_train)
    y_test_pred = fitted_model.predict(X_test)
    
    return {
        'rmse_train': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'rmse_test': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'mae_train': mean_absolute_error(y_train, y_train_pred),
        'mae_test': mean_absolute_error(y_test, y_test_pred),
        'r2_train': r2_score(y_train, y_train_pred),
        'r2_test': r2_score(y_test, y_test_pred)
    }

models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Polynomial Regression': make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=1.0)),
    'Decision Tree': DecisionTreeRegressor(max_depth=5, min_samples_leaf=5, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42),
    'XGBoost': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=5),
    'CatBoost': CatBoostRegressor(iterations=100, depth=3, learning_rate=0.1, loss_function='RMSE', verbose=False)
}

feature_selection_methods = {
    'none': None,
    'f_regression': f_regression_cols,
    'mutual_info': mic_cols,
    'llarsic' : llarsic_cols
}

results = []

for model_name, model in models.items():
    for fs_method, selected_features in feature_selection_methods.items():
        print(f"Training {model_name} with {fs_method} feature selection...")
        
        if selected_features is None:
            X_train_fs = X_train
            X_test_fs = X_test
        else:
            X_train_fs = X_train[selected_features]
            X_test_fs = X_test[selected_features]
        
        # Fit model
        model.fit(X_train_fs, y_train)
        
        # Evaluate model
        eval_metrics = evaluate_regression_mdl(model, X_train_fs, X_test_fs, y_train, y_test)
        
        # Perform cross-validation
        cv_scores = cross_val_score(model, X_train_fs, y_train, cv=5, scoring='r2')
        
        # Calculate percentage differences
        rmse_diff_pct = ((eval_metrics['rmse_test'] - eval_metrics['rmse_train']) / eval_metrics['rmse_train']) * 100
        mae_diff_pct = ((eval_metrics['mae_test'] - eval_metrics['mae_train']) / eval_metrics['mae_train']) * 100
        r2_diff_pct = ((eval_metrics['r2_train'] - eval_metrics['r2_test']) / eval_metrics['r2_train']) * 100
        
        results.append({
            'model': f"{model_name}_{fs_method}",
            'fs_method': fs_method,
            'rmse_train': eval_metrics['rmse_train'],
            'rmse_test': eval_metrics['rmse_test'],
            'rmse_diff_pct': rmse_diff_pct,
            'mae_train': eval_metrics['mae_train'],
            'mae_test': eval_metrics['mae_test'],
            'mae_diff_pct': mae_diff_pct,
            'r2_train': eval_metrics['r2_train'],
            'r2_test': eval_metrics['r2_test'],
            'r2_diff_pct': r2_diff_pct,
            'num_feat': X_train_fs.shape[1],
            'cv_r2_mean': cv_scores.mean()
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Sort results by Test R2 score
results_df = results_df.sort_values('cv_r2_mean', ascending=False).reset_index(drop=True)

# Display results
print("\nModel Comparison Results:")
print(results_df)

# Save results to CSV
# results_df.to_csv('model_basicfs_comparison.csv', index=False)

Training Linear Regression with none feature selection...
Training Linear Regression with f_regression feature selection...
Training Linear Regression with mutual_info feature selection...
Training Linear Regression with llarsic feature selection...
Training Ridge with none feature selection...
Training Ridge with f_regression feature selection...
Training Ridge with mutual_info feature selection...
Training Ridge with llarsic feature selection...
Training Polynomial Regression with none feature selection...
Training Polynomial Regression with f_regression feature selection...
Training Polynomial Regression with mutual_info feature selection...
Training Polynomial Regression with llarsic feature selection...
Training Decision Tree with none feature selection...
Training Decision Tree with f_regression feature selection...
Training Decision Tree with mutual_info feature selection...
Training Decision Tree with llarsic feature selection...
Training Random Forest with none feature selecti

In [13]:
results_df.set_index('model').style.\
background_gradient(cmap='bone_r', subset=['mae_train', 'mae_test']).\
background_gradient(cmap='viridis', subset=['r2_train', 'r2_test']).\
background_gradient(cmap='magma', subset=['cv_r2_mean']).\
background_gradient(cmap='gist_heat',  low=0.5, high=3, subset=['mae_diff_pct', 'r2_diff_pct'])

Unnamed: 0_level_0,fs_method,rmse_train,rmse_test,rmse_diff_pct,mae_train,mae_test,mae_diff_pct,r2_train,r2_test,r2_diff_pct,num_feat,cv_r2_mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CatBoost_mutual_info,mutual_info,1.505465,1.533855,1.885804,0.987218,0.972865,-1.453875,0.892907,0.861726,3.492107,10,0.798815
CatBoost_none,none,1.495275,1.62003,8.343278,0.991259,1.054401,6.369904,0.894352,0.845753,5.434055,15,0.791406
CatBoost_f_regression,f_regression,1.541984,1.60757,4.253322,0.985679,1.013621,2.834848,0.887649,0.848116,4.453612,10,0.786229
CatBoost_llarsic,llarsic,1.556045,1.488163,-4.362426,1.025677,0.950183,-7.360393,0.88559,0.869841,1.778364,10,0.785766
Random Forest_mutual_info,mutual_info,1.6836,1.695572,0.711114,0.981032,1.047337,6.758677,0.866064,0.831032,4.044999,10,0.784956
Random Forest_f_regression,f_regression,1.671609,1.742606,4.247217,0.976982,1.066986,9.212504,0.867965,0.821528,5.350141,10,0.784181
XGBoost_none,none,1.010309,1.580322,56.419739,0.660441,0.903349,36.779624,0.951769,0.853221,10.354156,15,0.778983
Random Forest_none,none,1.65839,1.732967,4.496966,0.974973,1.065931,9.32924,0.870045,0.823497,5.350124,15,0.778181
XGBoost_mutual_info,mutual_info,1.072501,1.550802,44.596744,0.68763,0.933029,35.68764,0.945648,0.858654,9.19945,10,0.776141
Random Forest_llarsic,llarsic,1.766665,1.584366,-10.318855,1.039898,1.018965,-2.012948,0.852522,0.852469,0.006202,10,0.766468


# Hyperparameter Tuning

## Weak model

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [10, 50, 100],               # Fewer estimators for smaller datasets
    'max_depth': [None, 5, 10, 20],              # Limit the depth of the trees
    'min_samples_split': [5, 7, 10, 12],             # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4],               # Minimum samples to be at a leaf node
    'max_features': ['sqrt', 'log2']     # Number of features to consider at each split
}

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')

# Fit GridSearchCV to the data
grid_search.fit(X_train[f_regression_cols], y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Evaluate the best model on the test set
y_train_pred = best_rf.predict(X_train[f_regression_cols])
y_test_pred = best_rf.predict(X_test[f_regression_cols])

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Train RMSE: ", train_rmse)
print("Test RMSE: ", test_rmse)
print("Train R2: ", r2_score(y_train, y_train_pred))
print("Test R2: ", r2_score(y_test, y_test_pred))

# Extract feature importances
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train[f_regression_cols].columns).sort_values(ascending=False)

# Get the list of selected features based on importance
rf_selected_features = feature_importances.index.tolist()

print("Selected Features: ", rf_selected_features)


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters found:  {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}
Train RMSE:  1.56789403482656
Test RMSE:  1.7592124172354733
Train R2:  0.8838413346438463
Test R2:  0.8181102870206418
Selected Features:  ['distance', 'minutes_cos', 'start_stop_id_encoded', 'end_stop_id_encoded', 'route_id', 'precip_mm', 'pressure', 'cloud_cover', 'temperature', 'day_of_week_sin']


  _data = np.array(data, dtype=dtype, copy=copy,


In [15]:
rf_selected_features

['distance',
 'minutes_cos',
 'start_stop_id_encoded',
 'end_stop_id_encoded',
 'route_id',
 'precip_mm',
 'pressure',
 'cloud_cover',
 'temperature',
 'day_of_week_sin']

## GA

In [16]:
from sklearn_genetic import GAFeatureSelectionCV

ga_rf = GAFeatureSelectionCV(estimator=RandomForestRegressor(random_state=42, max_depth=None, min_samples_split=12, min_samples_leaf=1, n_estimators=10, max_features='sqrt'),
                            cv=3, scoring='neg_root_mean_squared_error', population_size=50,
                            crossover_probability=0.8, mutation_probability=0.1, n_jobs=-1,
                            generations=20, verbose=True, keep_top_k=3,\
                            tournament_size=20, elitism=True, refit=True)
ga_rf.fit(X_train.values, y_train)
ga_rf_cols = np.array(X_train.columns.to_list())[ga_rf.best_features_].tolist()

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	-2.68192	0.536461   	-2.08037   	-4.3727    
1  	96    	-2.14638	0.0441416  	-2.08037   	-2.27893   
2  	85    	-2.07665	0.0186722  	-2.03904   	-2.11903   
3  	94    	-2.0398 	0.00527733 	-2.03904   	-2.07674   
4  	92    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
5  	93    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
6  	88    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
7  	95    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
8  	92    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
9  	91    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
10 	91    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
11 	91    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
12 	85    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
13 	90    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
14 	91    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
15 	89    	-2.03904	8.88178e-16	-2.03904   	-2.03904   
16 	92    	-2.03904	8.88178e-16	-2.03904   	-2.0

In [17]:
ga_rf_cols

['route_id',
 'distance',
 'month',
 'minutes_cos',
 'avg_temp_C',
 'precip_mm',
 'end_stop_id_encoded']

# Eval 2

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from catboost import CatBoostRegressor
from sklearn.pipeline import make_pipeline


def evaluate_regression_mdl(fitted_model, X_train, X_test, y_train, y_test):
    y_train_pred = fitted_model.predict(X_train)
    y_test_pred = fitted_model.predict(X_test)
    
    return {
        'rmse_train': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'rmse_test': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'mae_train': mean_absolute_error(y_train, y_train_pred),
        'mae_test': mean_absolute_error(y_test, y_test_pred),
        'r2_train': r2_score(y_train, y_train_pred),
        'r2_test': r2_score(y_test, y_test_pred)
    }

models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Polynomial Regression': make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=1.0)),
    'Decision Tree': DecisionTreeRegressor(max_depth=5, min_samples_leaf=5, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42),
    'XGBoost': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=5),
    'CatBoost': CatBoostRegressor(iterations=100, depth=3, learning_rate=0.1, loss_function='RMSE', verbose=False)
}

feature_selection_methods = {
    'none': None,
    'f_regression': f_regression_cols,
    'mutual_info': mic_cols,
    'llarsic' : llarsic_cols,
    'ga': ga_rf_cols
}

results = []

for model_name, model in models.items():
    for fs_method, selected_features in feature_selection_methods.items():
        print(f"Training {model_name} with {fs_method} feature selection...")
        
        if selected_features is None:
            X_train_fs = X_train
            X_test_fs = X_test
        else:
            X_train_fs = X_train[selected_features]
            X_test_fs = X_test[selected_features]
        
        # Fit model
        model.fit(X_train_fs, y_train)
        
        # Evaluate model
        eval_metrics = evaluate_regression_mdl(model, X_train_fs, X_test_fs, y_train, y_test)
        
        # Perform cross-validation
        cv_scores = cross_val_score(model, X_train_fs, y_train, cv=5, scoring='r2')
        
        # Calculate percentage differences
        rmse_diff_pct = ((eval_metrics['rmse_test'] - eval_metrics['rmse_train']) / eval_metrics['rmse_train']) * 100
        mae_diff_pct = ((eval_metrics['mae_test'] - eval_metrics['mae_train']) / eval_metrics['mae_train']) * 100
        r2_diff_pct = ((eval_metrics['r2_train'] - eval_metrics['r2_test']) / eval_metrics['r2_train']) * 100
        
        results.append({
            'model': f"{model_name}_{fs_method}",
            'fs_method': fs_method,
            'rmse_train': eval_metrics['rmse_train'],
            'rmse_test': eval_metrics['rmse_test'],
            'rmse_diff_pct': rmse_diff_pct,
            'mae_train': eval_metrics['mae_train'],
            'mae_test': eval_metrics['mae_test'],
            'mae_diff_pct': mae_diff_pct,
            'r2_train': eval_metrics['r2_train'],
            'r2_test': eval_metrics['r2_test'],
            'r2_diff_pct': r2_diff_pct,
            'num_feat': X_train_fs.shape[1],
            'cv_r2_mean': cv_scores.mean()
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Sort results by Test R2 score
results_df = results_df.sort_values('cv_r2_mean', ascending=False).reset_index(drop=True)

# Display results
print("\nModel Comparison Results:")
print(results_df)

# Save results to CSV
results_df.to_csv('model_basicfs_comparison.csv', index=False)

Training Linear Regression with none feature selection...
Training Linear Regression with f_regression feature selection...
Training Linear Regression with mutual_info feature selection...
Training Linear Regression with llarsic feature selection...
Training Linear Regression with ga feature selection...
Training Ridge with none feature selection...
Training Ridge with f_regression feature selection...
Training Ridge with mutual_info feature selection...
Training Ridge with llarsic feature selection...
Training Ridge with ga feature selection...
Training Polynomial Regression with none feature selection...
Training Polynomial Regression with f_regression feature selection...
Training Polynomial Regression with mutual_info feature selection...
Training Polynomial Regression with llarsic feature selection...
Training Polynomial Regression with ga feature selection...
Training Decision Tree with none feature selection...
Training Decision Tree with f_regression feature selection...
Traini

In [19]:
results_df.set_index('model').style.\
background_gradient(cmap='bone_r', subset=['mae_train', 'mae_test']).\
background_gradient(cmap='viridis', subset=['r2_train', 'r2_test']).\
background_gradient(cmap='magma', subset=['cv_r2_mean']).\
background_gradient(cmap='gist_heat',  low=0.5, high=3, subset=['mae_diff_pct', 'r2_diff_pct'])

Unnamed: 0_level_0,fs_method,rmse_train,rmse_test,rmse_diff_pct,mae_train,mae_test,mae_diff_pct,r2_train,r2_test,r2_diff_pct,num_feat,cv_r2_mean
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
CatBoost_mutual_info,mutual_info,1.505465,1.533855,1.885804,0.987218,0.972865,-1.453875,0.892907,0.861726,3.492107,10,0.798815
CatBoost_none,none,1.495275,1.62003,8.343278,0.991259,1.054401,6.369904,0.894352,0.845753,5.434055,15,0.791406
CatBoost_f_regression,f_regression,1.541984,1.60757,4.253322,0.985679,1.013621,2.834848,0.887649,0.848116,4.453612,10,0.786229
Random Forest_ga,ga,1.698574,1.744723,2.716899,0.990383,1.049458,5.964949,0.863671,0.821094,4.929784,7,0.785784
CatBoost_llarsic,llarsic,1.556045,1.488163,-4.362426,1.025677,0.950183,-7.360393,0.88559,0.869841,1.778364,10,0.785766
CatBoost_ga,ga,1.535563,1.509457,-1.700102,0.978652,0.954843,-2.432812,0.888582,0.86609,2.531276,7,0.785534
Random Forest_mutual_info,mutual_info,1.6836,1.695572,0.711114,0.981032,1.047337,6.758677,0.866064,0.831032,4.044999,10,0.784956
Random Forest_f_regression,f_regression,1.671609,1.742606,4.247217,0.976982,1.066986,9.212504,0.867965,0.821528,5.350141,10,0.784181
XGBoost_none,none,1.010309,1.580322,56.419739,0.660441,0.903349,36.779624,0.951769,0.853221,10.354156,15,0.778983
Random Forest_none,none,1.65839,1.732967,4.496966,0.974973,1.065931,9.32924,0.870045,0.823497,5.350124,15,0.778181


In [21]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for CatBoost
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7],  # L2 regularization
    'border_count': [32, 128],
    'bagging_temperature': [0, 1],  # Higher values reduce overfitting
    'random_strength': [1, 10],  # Higher values reduce overfitting
    'od_type': ['IncToDec'],  # Use overfitting detector
    'od_wait': [20, 50]  # Number of iterations to wait before stopping
}

# Initialize the CatBoostRegressor
cb = CatBoostRegressor(random_seed=42, verbose=False)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=cb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')

# Fit GridSearchCV to the data
grid_search.fit(X_train[f_regression_cols], y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_cb = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Evaluate the best model on the train and test sets
y_train_pred = best_cb.predict(X_train[f_regression_cols])
y_test_pred = best_cb.predict(X_test[f_regression_cols])

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Train RMSE: ", train_rmse)
print("Test RMSE: ", test_rmse)
print("Train R2: ", r2_score(y_train, y_train_pred))
print("Test R2: ", r2_score(y_test, y_test_pred))

# Extract feature importances
feature_importances = pd.Series(best_cb.feature_importances_, index=X_train[f_regression_cols].columns).sort_values(ascending=False)

# Get the list of selected features based on importance
cb_selected_features = feature_importances.index.tolist()

print("Selected Features: ", cb_selected_features)

# Plot learning curves
from catboost import Pool
from matplotlib import pyplot as plt

train_pool = Pool(X_train[f_regression_cols], y_train)
test_pool = Pool(X_test[f_regression_cols], y_test)

best_cb.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=True)

plt.show()

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Best parameters found:  {'bagging_temperature': 0, 'border_count': 128, 'depth': 4, 'iterations': 1000, 'l2_leaf_reg': 5, 'learning_rate': 0.01, 'od_type': 'IncToDec', 'od_wait': 20, 'random_strength': 1}
Train RMSE:  1.4973734732176835
Test RMSE:  1.5987130657855284
Train R2:  0.8940554872389873
Test R2:  0.8497852349936051
Selected Features:  ['distance', 'minutes_cos', 'route_id', 'start_stop_id_encoded', 'end_stop_id_encoded', 'precip_mm', 'cloud_cover', 'temperature', 'pressure', 'day_of_week_sin']


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [26]:
# Initialize the CatBoostRegressor
cb = CatBoostRegressor(random_seed=42, verbose=False)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=cb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')

# Fit GridSearchCV to the data
grid_search.fit(X_train[mic_cols], y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_cb = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Evaluate the best model on the train and test sets
y_train_pred = best_cb.predict(X_train[mic_cols])
y_test_pred = best_cb.predict(X_test[mic_cols])

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Train RMSE: ", train_rmse)
print("Test RMSE: ", test_rmse)
print("Train R2: ", r2_score(y_train, y_train_pred))
print("Test R2: ", r2_score(y_test, y_test_pred))

# Extract feature importances
feature_importances = pd.Series(best_cb.feature_importances_, index=X_train[mic_cols].columns).sort_values(ascending=False)

# Get the list of selected features based on importance
cb_selected_features = feature_importances.index.tolist()

print("Selected Features: ", cb_selected_features)

# Plot learning curves
from catboost import Pool
from matplotlib import pyplot as plt

train_pool = Pool(X_train[mic_cols], y_train)
test_pool = Pool(X_test[mic_cols], y_test)

best_cb.fit(train_pool, eval_set=test_pool, use_best_model=True, plot=True)

plt.show()

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'bagging_temperature': 0, 'border_count': 128, 'depth': 4, 'iterations': 1000, 'l2_leaf_reg': 5, 'learning_rate': 0.01, 'od_type': 'IncToDec', 'od_wait': 20, 'random_strength': 1}
Train RMSE:  1.476019723894843
Test RMSE:  1.5349012573403196
Train R2:  0.8970556490689778
Test R2:  0.8615374078465947
Selected Features:  ['distance', 'route_id', 'minutes_cos', 'start_stop_id_encoded', 'end_stop_id_encoded', 'humidity', 'precip_mm', 'temperature', 'pressure', 'avg_temp_C']


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

## CatBoost Model Training Process

The CatBoost model underwent a comprehensive training process, as evidenced by the learning curve graph and performance metrics provided. This section details the key aspects of the training:

### Learning Curve Analysis

The graph illustrates the model's learning progression over 1000 iterations:

1. **Convergence**: Both the training (solid line) and test (dotted line) RMSE curves show a steady decrease, indicating consistent learning and improvement in prediction accuracy.

2. **Overfitting Control**: The close proximity of the training and test curves, particularly in later iterations, suggests effective overfitting prevention. This demonstrates CatBoost's built-in regularization mechanisms working as intended.

3. **Early Iterations**: Rapid improvement is observed in the initial stages (0-200 iterations), with the RMSE dropping sharply from around 4.5 to approximately 2.0.

4. **Later Iterations**: The learning rate slows down after about 400 iterations, with both curves showing more gradual improvement. This is typical in gradient boosting models as they fine-tune predictions.

### Performance Metrics

The final model performance metrics are as follows:

- **Training RMSE**: 1.5083426687 (cur)
- **Test RMSE**: 1.63454978740 (test)
- **Best Test RMSE**: 1.50834268700 (best)

These metrics indicate:

1. **Generalization**: The small difference between training and test RMSE (1.5083 vs 1.6345) suggests good generalization capabilities.

2. **Model Selection**: The 'best' test RMSE (1.5083) is lower than the final test RMSE, indicating that the model selection process likely employed early stopping or best model selection to prevent overfitting.

### Training Duration

The training process took 229ms, which is relatively quick and suggests efficient use of computational resources.

### Conclusion

The CatBoost model demonstrates a robust training process with effective learning and generalization. The close alignment of training and test errors, coupled with the gradual convergence of the learning curves, indicates a well-balanced model that avoids overfitting while capturing the underlying patterns in the data. The final RMSE values suggest strong predictive performance, making this model suitable for deployment in production environments.

In [33]:
from sklearn import inspection
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

cb_mdl = CatBoostRegressor(**best_params, random_state=42, verbose=False)
cb_mdl = cb_mdl.fit(X_train[mic_cols], y_train)
cb_feat_imp = cb_mdl.feature_importances_

# Calculate metrics for CatBoost model
cb_y_train_pred = cb_mdl.predict(X_train[mic_cols])
cb_y_test_pred = cb_mdl.predict(X_test[mic_cols])
cb_mae_train = mean_absolute_error(y_train, cb_y_train_pred)
cb_mae_test = mean_absolute_error(y_test, cb_y_test_pred)
cb_r2_train = r2_score(y_train, cb_y_train_pred)
cb_r2_test = r2_score(y_test, cb_y_test_pred)
cb_cv_r2_mean = cross_val_score(cb_mdl, X_train[mic_cols], y_train, cv=5, scoring='r2').mean()

rf_mdl = RandomForestRegressor(random_state=42, max_depth=5, min_samples_split=5, min_samples_leaf=1, n_estimators=10, max_features='sqrt')
rf_mdl = rf_mdl.fit(X_train[mic_cols].to_numpy(), y_train.to_numpy())
rf_feat_imp = rf_mdl.feature_importances_

# Calculate metrics for RandomForest model
rf_y_train_pred = rf_mdl.predict(X_train[mic_cols].to_numpy())
rf_y_test_pred = rf_mdl.predict(X_test[mic_cols].to_numpy())
rf_mae_train = mean_absolute_error(y_train, rf_y_train_pred)
rf_mae_test = mean_absolute_error(y_test, rf_y_test_pred)
rf_r2_train = r2_score(y_train, rf_y_train_pred)
rf_r2_test = r2_score(y_test, rf_y_test_pred)
rf_cv_r2_mean = cross_val_score(rf_mdl, X_train[mic_cols].to_numpy(), y_train.to_numpy(), cv=5, scoring='r2').mean()

# Print metrics
print(f"CatBoost MAE Train: {cb_mae_train}, MAE Test: {cb_mae_test}, R2 Train: {cb_r2_train}, R2 Test: {cb_r2_test}, CV R2 Mean: {cb_cv_r2_mean}")
print(f"RandomForest MAE Train: {rf_mae_train}, MAE Test: {rf_mae_test}, R2 Train: {rf_r2_train}, R2 Test: {rf_r2_test}, CV R2 Mean: {rf_cv_r2_mean}")

# Store cross-validation results
cv_r2_mean = {
    'CatBoost': cb_cv_r2_mean,
    'RandomForest': rf_cv_r2_mean
}

print(f"Cross-Validation R2 Mean: {cv_r2_mean}")


CatBoost MAE Train: 0.9752455595310882, MAE Test: 0.9964368614070229, R2 Train: 0.8970556490689778, R2 Test: 0.8615374078465947, CV R2 Mean: 0.7998348186343076
RandomForest MAE Train: 1.0293067136501968, MAE Test: 1.091148793662605, R2 Train: 0.8767838643764931, R2 Test: 0.8311840688283251, CV R2 Mean: 0.793513198205092
Cross-Validation R2 Mean: {'CatBoost': 0.7998348186343076, 'RandomForest': 0.793513198205092}


In [30]:
best_params

{'bagging_temperature': 0,
 'border_count': 128,
 'depth': 4,
 'iterations': 1000,
 'l2_leaf_reg': 5,
 'learning_rate': 0.01,
 'od_type': 'IncToDec',
 'od_wait': 20,
 'random_strength': 1}

In [34]:
mic_cols

['route_id',
 'distance',
 'minutes_cos',
 'avg_temp_C',
 'temperature',
 'precip_mm',
 'humidity',
 'pressure',
 'start_stop_id_encoded',
 'end_stop_id_encoded']

In [29]:
model_path = 'D:\\CDS590\\src\\models\\best_catboost_model_wospeed.cbm' 
best_cb.save_model(model_path)