In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import time
from sklearn.metrics import classification_report

In [2]:
# Read the training and validation tables; both paths are relative to the
# notebook's working directory.
train, validation = (
    pd.read_csv(csv_name) for csv_name in ('train_final.csv', 'val_final.csv')
)

#### Apply standard scaling to numerical columns

In [9]:
def _standardize_split(frame, scaler, fit):
    """Scale the numeric columns of ``frame`` and re-attach the remaining columns.

    Columns 'Temperature' through 'Distance' are passed through ``scaler``;
    columns 'Blocked' through 'Severity' are carried over unchanged.

    Parameters
    ----------
    frame : pd.DataFrame
        Data split to process.
        NOTE(review): assumes the two column slices above cover every column
        of ``frame`` in order; otherwise the column relabelling raises.
    scaler : StandardScaler
        Scaler used for the numeric columns.
    fit : bool
        If True, learn the scaling statistics from ``frame`` before
        transforming it (training split). If False, reuse the statistics the
        scaler already holds (validation / test splits).

    Returns
    -------
    tuple
        ``(scaled_numeric_array, passthrough_array, scaled_frame)`` where
        ``scaled_frame`` has the same column labels as ``frame``.
    """
    numeric = frame.loc[:, 'Temperature':'Distance']
    numeric_scaled = scaler.fit_transform(numeric) if fit else scaler.transform(numeric)
    passthrough = frame.loc[:, 'Blocked':'Severity'].values
    scaled_frame = pd.DataFrame(
        np.hstack([numeric_scaled, passthrough]), columns=frame.columns
    )
    return numeric_scaled, passthrough, scaled_frame


# Standardize Training Set Features
# The scaling statistics (mean / std) are learned from the training split only.
scaler_train = StandardScaler()
numer_cols_train, cat_cols_train, scaled_train = _standardize_split(
    train, scaler_train, fit=True
)
X_train = scaled_train.loc[:, 'Temperature':'MapQuest-Bing'].values.astype(float)
y_train = scaled_train.loc[:, 'Severity'].values.astype(int)

# Standardize Validation Set Features
# Reuse the scaler fitted on the training split. Fitting a second scaler on the
# validation data would put the two splits on different scales and leak
# validation statistics into preprocessing, so the model would be evaluated on
# features that do not match what it was trained on.
scaler_val = scaler_train  # kept as an alias so existing references to `scaler_val` still work
numer_cols_val, cat_cols_val, scaled_validation = _standardize_split(
    validation, scaler_train, fit=False
)
X_val = scaled_validation.loc[:, 'Temperature':'MapQuest-Bing'].values.astype(float)
y_val = scaled_validation.loc[:, 'Severity'].values.astype(int)

#### Method 1: Baseline Logistic Regression (effectively no penalty: C = 10^6)

In [13]:
# Baseline classifier: logistic regression fitted with the SAGA solver and a
# very large C. Only construction + fitting are timed; prediction is not.
start = time.time()
log_reg = LogisticRegression(
    C=10**6,
    solver='saga',
    tol=10**-3,
    max_iter=1000,
    random_state=0,
).fit(X_train, y_train)  # fit() returns the estimator itself
end = time.time()

# Predict on both splits with the fitted model.
y_train_pred, y_val_pred = (
    log_reg.predict(features) for features in (X_train, X_val)
)

# Report the training time followed by per-class metrics for each split.
print(f'The Logistic Regression classifier took {(end - start):.2f} seconds to train.')
print('Metrics on the Training Set: \n', classification_report(y_train, y_train_pred), sep='\n')
print('Metrics on the Validation Set: \n', classification_report(y_val, y_val_pred), sep='\n')

The Logistic Regression classifier took 1096.09 seconds to train.
Metrics on the Training Set: 

              precision    recall  f1-score   support

           0       0.79      0.91      0.85   1697093
           1       0.63      0.40      0.49    667944

    accuracy                           0.77   2365037
   macro avg       0.71      0.66      0.67   2365037
weighted avg       0.75      0.77      0.75   2365037

Metrics on the Validation Set: 

              precision    recall  f1-score   support

           0       0.79      0.91      0.85    423784
           1       0.64      0.40      0.49    167476

    accuracy                           0.77    591260
   macro avg       0.71      0.66      0.67    591260
weighted avg       0.75      0.77      0.75    591260

