In [2]:
import hopsworks
from hsfs.client.exceptions import RestAPIError
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# To display all the columns in output
pd.set_option('display.max_columns', None)


In [4]:
# Establish connection to Hopsworks using the API key
project = hopsworks.login(
    api_key_value="dYTVrTVvbj6Qw82i.YGKHdS9snQYFgOADJIvLdvZ2n2S5BxIAOtvPUEmAyd56bvaG6xhhGyNM3nYbexaP"
)

# Access the Feature Store
fs = project.get_feature_store()

# Specify the feature group and its version
feature_group_name = "final_df_feature_group"
feature_group_version = 1

try:
    # Retrieve the feature group
    final_df_fg = fs.get_feature_group(feature_group_name, version=feature_group_version)
    # Read the feature group as a Pandas DataFrame
    final_df = final_df_fg.read()
    print(f"Downloaded feature group: {feature_group_name} (version {feature_group_version})")
except RestAPIError as e:
    print(f"Error downloading feature group: {feature_group_name} (version {feature_group_version})")
    raise e

# Prepare the data
unique_counts = final_df.nunique()
cts_cols_df = final_df[['unique_id', 'route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip',
                         'route_avg_humidity', 'route_avg_visibility', 'route_avg_pressure', 'distance',
                         'average_hours', 'temp_origin', 'wind_speed_origin', 'precip_origin',
                         'humidity_origin', 'visibility_origin', 'pressure_origin',
                         'temp_destination', 'wind_speed_destination', 'precip_destination',
                         'humidity_destination', 'visibility_destination', 'pressure_destination',
                         'avg_no_of_vehicles', 'truck_age', 'load_capacity_pounds', 'mileage_mpg',
                         'age', 'experience', 'average_speed_mph']].copy()

cat_cols_df = final_df[['route_description', 'description_origin', 'description_destination',
                         'accident', 'fuel_type', 'gender', 'driving_style', 
                         'ratings', 'is_midnight']]

date_cols_df = final_df[['unique_id', 'departure_date', 'estimated_arrival', 
                          'estimated_arrival_nearest_hour', 'departure_date_nearest_hour']].copy()

target_df = final_df[['delay']]



Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1044630
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.80s) 
Downloaded feature group: final_df_feature_group (version 1)


In [5]:
# One-hot encoding for categorical columns
cat_cols_encoded_df = pd.get_dummies(cat_cols_df, drop_first=True)

# Concatenate the continuous columns, encoded categorical columns, and target column
final_encoded_df = pd.concat([cts_cols_df, cat_cols_encoded_df, target_df], axis=1)

# Scale the continuous columns
unique_id = cts_cols_df['unique_id']
cts_cols_without_id = cts_cols_df.drop('unique_id', axis=1)

scaler = StandardScaler()
cts_cols_scaled = scaler.fit_transform(cts_cols_without_id)
cts_cols_scaled_df = pd.DataFrame(cts_cols_scaled, columns=cts_cols_without_id.columns)
cts_cols_scaled_df = pd.concat([unique_id.reset_index(drop=True), cts_cols_scaled_df], axis=1)

final_scaled_df = pd.concat([cts_cols_scaled_df, cat_cols_encoded_df, target_df.reset_index(drop=True)], axis=1)


In [6]:
# Class Imbalance with Oversampling using SMOTE
final_scaled_df_with_target = final_scaled_df.copy()
X = final_scaled_df_with_target.drop(columns=['delay'])
y = final_scaled_df_with_target['delay']

# Apply SMOTE to create synthetic samples for the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert the resampled arrays back to a DataFrame
final_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
final_resampled_df['delay'] = y_resampled

# Verify the counts of the target variable after oversampling
print(final_resampled_df['delay'].value_counts())



delay
0    6704
1    6704
Name: count, dtype: int64


In [7]:
# PCA
unique_id = final_resampled_df['unique_id']
target = final_resampled_df['delay']
features_for_pca = final_resampled_df.drop(columns=['unique_id', 'delay'])

pca = PCA(n_components=0.95)
pca_transformed = pca.fit_transform(features_for_pca)
pca_columns = [f'PCA_{i+1}' for i in range(pca_transformed.shape[1])]
pca_df = pd.DataFrame(pca_transformed, columns=pca_columns)

final_pca_df = pd.concat([unique_id.reset_index(drop=True), pca_df, target.reset_index(drop=True)], axis=1)



In [8]:
# Train, test, and validation split
final_pca_df = final_pca_df.merge(date_cols_df[['unique_id', 'estimated_arrival']], on='unique_id', how='left')

if 'estimated_arrival' in final_pca_df.columns:
    final_pca_df['estimated_arrival'] = final_pca_df['estimated_arrival'].dt.tz_localize(None)

    train_df = final_pca_df[final_pca_df['estimated_arrival'] <= pd.to_datetime('2019-01-30')]
    validation_df = final_pca_df[(final_pca_df['estimated_arrival'] > pd.to_datetime('2019-01-30')) & 
                                  (final_pca_df['estimated_arrival'] <= pd.to_datetime('2019-02-07'))]
    test_df = final_pca_df[final_pca_df['estimated_arrival'] > pd.to_datetime('2019-02-07')]

    X_train = train_df.drop(columns=['delay', 'unique_id', 'estimated_arrival'])
    y_train = train_df['delay']

    X_valid = validation_df.drop(columns=['delay', 'unique_id', 'estimated_arrival'])
    y_valid = validation_df['delay']

    X_test = test_df.drop(columns=['delay', 'unique_id', 'estimated_arrival'])
    y_test = test_df['delay']

    # Print out the shapes of the resulting datasets to verify the splits
    print(f"Training set shape: {X_train.shape}, Validation set shape: {X_valid.shape}, Test set shape: {X_test.shape}")
else:
    print("Error: 'estimated_arrival' column not found in final_pca_df.")



Training set shape: (8850, 30), Validation set shape: (2572, 30), Test set shape: (1986, 30)


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42, max_depth=None, max_features='sqrt', 
                                  min_samples_leaf=1, min_samples_split=2, n_estimators=300)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred_rf = rf_model.predict(X_valid)

# Evaluate the model performance
print("Random Forest Classification Report (after hyperparameter tuning):")
print(classification_report(y_valid, y_valid_pred_rf))
print("Random Forest Accuracy (after hyperparameter tuning):", accuracy_score(y_valid, y_valid_pred_rf))

# Save the model if necessary, and proceed with further evaluations


Random Forest Classification Report (after hyperparameter tuning):
              precision    recall  f1-score   support

           0       0.68      0.81      0.74      1244
           1       0.78      0.64      0.70      1328

    accuracy                           0.72      2572
   macro avg       0.73      0.72      0.72      2572
weighted avg       0.73      0.72      0.72      2572

Random Forest Accuracy (after hyperparameter tuning): 0.7220062208398134


In [10]:
from sklearn.naive_bayes import GaussianNB

# Train Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred_nb = nb_model.predict(X_valid)

# Evaluate the Naive Bayes model
print("Naive Bayes Classification Report:")
print(classification_report(y_valid, y_valid_pred_nb))
print("Naive Bayes Accuracy:", accuracy_score(y_valid, y_valid_pred_nb))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.72      0.68      1244
           1       0.70      0.62      0.66      1328

    accuracy                           0.67      2572
   macro avg       0.67      0.67      0.67      2572
weighted avg       0.67      0.67      0.67      2572

Naive Bayes Accuracy: 0.6699066874027994


In [11]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
log_reg_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred_log_reg = log_reg_model.predict(X_valid)

# Evaluate the Logistic Regression model
print("Logistic Regression Classification Report:")
print(classification_report(y_valid, y_valid_pred_log_reg))
print("Logistic Regression Accuracy:", accuracy_score(y_valid, y_valid_pred_log_reg))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.75      0.68      1244
           1       0.71      0.58      0.64      1328

    accuracy                           0.66      2572
   macro avg       0.67      0.67      0.66      2572
weighted avg       0.67      0.66      0.66      2572

Logistic Regression Accuracy: 0.6632970451010887


In [12]:
import xgboost as xgb

# Train XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred_xgb = xgb_model.predict(X_valid)

# Evaluate the XGBoost model
print("XGBoost Classification Report:")
print(classification_report(y_valid, y_valid_pred_xgb))
print("XGBoost Accuracy:", accuracy_score(y_valid, y_valid_pred_xgb))


Parameters: { "use_label_encoder" } are not used.


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.76      0.72      1244
           1       0.75      0.68      0.71      1328

    accuracy                           0.72      2572
   macro avg       0.72      0.72      0.72      2572
weighted avg       0.72      0.72      0.72      2572

XGBoost Accuracy: 0.7192846034214619


## Hyperparameter Tuning

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the base model
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid
rf_param_grid = {
    'n_estimators': [300, 500, 700],   # More trees in the forest
    'max_depth': [None, 20, 25, 30],   # Vary the depth of the trees
    'min_samples_split': [2, 5, 10],   # Vary the minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],     # Vary the minimum samples at a leaf node
    'max_features': ['sqrt', 'log2'],  # Vary the number of features to consider when splitting
    'bootstrap': [True, False],        # Whether to use bootstrapping
    'class_weight': [None, 'balanced'] # Addressing class imbalance
}

# Set up GridSearchCV
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid,
                               cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the model
rf_grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest Score:", rf_grid_search.best_score_)

# Make predictions on the validation set
y_valid_pred_rf = rf_grid_search.predict(X_valid)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_valid, y_valid_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_valid, y_valid_pred_rf))


Fitting 5 folds for each of 864 candidates, totalling 4320 fits
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   9.1s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   9.1s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   9.2s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   9.3s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   9.5s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  

KeyboardInterrupt: 

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42, max_depth=None, max_features='sqrt', 
                                  min_samples_leaf=1, min_samples_split=2, n_estimators=300)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred_rf = rf_model.predict(X_valid)

# Evaluate the model performance
print("Random Forest Classification Report (after hyperparameter tuning):")
print(classification_report(y_valid, y_valid_pred_rf))
print("Random Forest Accuracy (after hyperparameter tuning):", accuracy_score(y_valid, y_valid_pred_rf))

# Save the model if necessary, and proceed with further evaluations


Random Forest Classification Report (after hyperparameter tuning):
              precision    recall  f1-score   support

           0       0.68      0.81      0.74      1244
           1       0.78      0.64      0.70      1328

    accuracy                           0.72      2572
   macro avg       0.73      0.72      0.72      2572
weighted avg       0.73      0.72      0.72      2572

Random Forest Accuracy (after hyperparameter tuning): 0.7220062208398134
