In [None]:
# Colab-only: mount Google Drive so files under /content/drive can be read
# by later cells. This cell cannot run outside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, f1_score, accuracy_score, recall_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

import copy

import time
from tqdm.notebook import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [None]:
class MLPv1(nn.Module):
    """Fully connected classifier with equal-width LeakyReLU hidden layers.

    Layout: ``input_dim -> hidden_dim``, then ``num_hidden_layers`` blocks of
    ``hidden_dim -> hidden_dim``, each followed by a LeakyReLU, and a final
    linear projection to ``output_dim``. ``forward`` returns the raw scores of
    that last layer (no softmax is applied).
    """

    def __init__(self, input_dim, hidden_dim, num_hidden_layers, output_dim):
        super().__init__()

        # Widths seen by successive Linear layers, output layer excluded.
        widths = [input_dim] + [hidden_dim] * (1 + max(num_hidden_layers, 0))

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.LeakyReLU()]
        stack += [nn.Linear(hidden_dim, output_dim)]

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the class scores."""
        return self.model(x)

In [None]:
class MLPv2(nn.Module):
    """Sigmoid MLP with per-layer widths and dropout on the hidden activations.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden layer, in order. Must be non-empty;
            an empty list raises ``IndexError``.
        output_dim: Number of output scores (classes).
        dropout_prob: Probability of zeroing a hidden activation in training.

    ``forward`` returns the raw scores of the output layer (no softmax).
    """

    def __init__(self, input_dim: int, hidden_dims: list, output_dim: int, dropout_prob: float):
        super(MLPv2, self).__init__()

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])

        self.hidden_layers = nn.ModuleList(
            [
                nn.Linear(hidden_dims[i], hidden_dims[i+1]) for i in range(len(hidden_dims) - 1)
            ]
        )
        self.output_layer = nn.Linear(hidden_dims[-1], output_dim)

        # Element-wise dropout. nn.Dropout1d is meant for (N, C, L) inputs and
        # would interpret a (batch, features) tensor as an unbatched (C, L)
        # signal, zeroing whole samples instead of individual units.
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        # Without a non-linearity here the input layer and the next Linear
        # layer would collapse into a single affine map.
        x = torch.sigmoid(self.input_layer(x))

        for layer in self.hidden_layers:
            # Activation first, dropout second: dropped units are then really
            # zero rather than sigmoid(0) = 0.5.
            x = self.dropout(torch.sigmoid(layer(x)))

        return self.output_layer(x)

In [None]:
def train(model,
          X_train: torch.Tensor,
          y_train: torch.Tensor,
          X_val: torch.Tensor,
          y_val: torch.Tensor,
          class_weights=None,
          learning_rate=0.1,
          batch_size=32,
          epochs=25) -> None:
    """Train ``model`` with SGD and restore the best-validation checkpoint.

    Each epoch runs one shuffled pass over the training tensors in
    mini-batches, then scores the full validation set in a single forward
    pass. The weights of the epoch with the highest validation accuracy are
    loaded back into ``model`` before returning.

    Args:
        model: Module mapping a feature batch to per-class scores.
        X_train, y_train: Training features and integer class labels.
        X_val, y_val: Validation features and integer class labels.
        class_weights: Optional per-class weight tensor passed to
            ``nn.CrossEntropyLoss``.
        learning_rate: SGD learning rate.
        batch_size: Mini-batch size for the training pass.
        epochs: Number of epochs; with 0 the model is left untouched.

    Returns:
        None. ``model`` is modified in place; progress and a summary are
        printed.
    """
    start_time = time.time()
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # TensorDataset yields the same (features, label) pairs as a hand-built
    # list of tuples, but indexes the tensors lazily instead of creating one
    # Python tuple per training row up front.
    train_loader = DataLoader(torch.utils.data.TensorDataset(X_train, y_train),
                              batch_size=batch_size,
                              shuffle=True)

    best_val_accuracy = -1
    best_checkpoint = None
    best_epoch = -1

    # Training loop
    for epoch in tqdm(range(epochs), dynamic_ncols=True):
        train_correct = 0
        model.train()

        for x_batch, y_batch in tqdm(train_loader,
                                     desc=f"Epoch {epoch+1}/{epochs}",
                                     leave=False):
            optimizer.zero_grad()
            scores = model(x_batch)
            loss = criterion(scores, y_batch)
            loss.backward()
            optimizer.step()

            preds = torch.argmax(scores, dim=1)
            train_correct += torch.sum(preds == y_batch).item()

        train_accuracy = train_correct / len(y_train)

        model.eval()
        with torch.no_grad():
            val_preds = torch.argmax(model(X_val), dim=1)

        val_accuracy = torch.mean((val_preds == y_val).float()).item()
        # Hand scikit-learn plain CPU arrays rather than tensors.
        val_f1score = f1_score(y_val.cpu().numpy(),
                               val_preds.cpu().numpy(),
                               average='weighted')

        tqdm.write(f" Epoch {epoch+1}/{epochs}  Train: {train_accuracy:.3f}  Val Acc: {val_accuracy:.3f}  Val F1: {val_f1score:.3f}")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Deep copy: state_dict() returns references to the live weights.
            best_checkpoint = copy.deepcopy(model.state_dict())
            best_epoch = epoch

    end_time = time.time()
    print(f'Training took {end_time - start_time:.2f} seconds')

    if best_checkpoint is not None:
        model.load_state_dict(best_checkpoint)
        # Reported 1-based so it matches the "Epoch i/N" lines printed above.
        print(f'\nBest epoch was {best_epoch + 1}, val_acc={best_val_accuracy:.3f}')
    else:
        print('\nNo epochs were run; model weights are unchanged')

    print(f"Class_weights: {class_weights}\nLearning_rate: {learning_rate}\nBatch_size: {batch_size}\nEpochs: {epochs}")

In [None]:
def evaluate(model: object, X: torch.Tensor, y: torch.Tensor,
             num_classes=None, label_name=None) -> str:
    """Return scikit-learn's text classification report for ``model`` on ``X``.

    Args:
        model: Module mapping a feature batch to per-class scores.
        X: Feature tensor, scored in a single forward pass.
        y: Integer class labels in ``0 .. num_classes - 1``.
        num_classes: Number of classes to report. Defaults to the notebook
            constant ``NUM_CLASSES``.
        label_name: Prefix for the row names (``"<label_name> 1"``, ...).
            Defaults to the notebook constant ``LABEL_NAME``.

    Returns:
        The report as a formatted string.
    """
    # Fall back to the notebook-level constants so existing three-argument
    # calls keep working.
    if num_classes is None:
        num_classes = NUM_CLASSES
    if label_name is None:
        label_name = LABEL_NAME

    model.eval()
    with torch.no_grad():
        y_preds = torch.argmax(model(X), dim=1)

    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    class_report = classification_report(y.cpu().numpy(),
                                         y_preds.cpu().numpy(),
                                         labels=list(range(num_classes)),
                                         target_names=[f'{label_name} {i+1}' for i in range(num_classes)],
                                         output_dict=False)

    return class_report

In [None]:
# Load the cleaned dataset from the mounted Drive. The path is absolute and
# specific to this Drive layout; adjust it when running elsewhere.
df = pd.read_csv("/content/drive/MyDrive/DSCI-550-Group-Project/Data/cleaned_data.csv")

In [None]:
# Remove the listed location columns. Reassigning (instead of inplace=True)
# together with errors="ignore" keeps this cell safe to re-run after the
# columns are already gone.
df = df.drop(columns=['Street', 'City', 'County','State','Zipcode','Airport_Code'], errors="ignore")

## Feature Engineering

In [None]:
# Columns whose dtype is object, bool or category; kept as a pandas Index
# because a later cell calls .drop("Side") on it.
categorical_cols = df.select_dtypes(include=['object','bool','category']).columns

# Show the value distribution of every such column.
for col in categorical_cols:
  print(col + ":" + str(df[col].value_counts()))


Side:R    2219525
L     447034
Name: Side, dtype: int64
Wind_Direction:Calm        548625
NW          385220
SW          373614
SE          315264
NE          249759
S           196802
W           194803
N           148441
E           135554
Variable    118477
Name: Wind_Direction, dtype: int64
Weather_Condition:Clear           1267159
Cloudy          1027611
Rain             176195
Fog               73587
Snow              54296
Windy             30719
Thunderstorm      29839
Smoke              6796
Sand                202
Hail                147
Tornado               8
Name: Weather_Condition, dtype: int64


In [None]:
# One-hot encode Side into the indicator columns Side_L and Side_R.
# NOTE: not re-runnable as is - after the first run the Side column no
# longer exists in df.
df = pd.get_dummies(df, columns = ['Side'])
df.head(5)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),...,Sunrise_Sunset_Night,Civil_Twilight_Night,Nautical_Twilight_Night,Astronomical_Twilight_Night,year,month,Weekday,Hour,Side_L,Side_R
0,3,40.10891,-83.09286,42.1,36.1,58.0,29.76,10.0,SW,10.4,...,1,1,1,1,2016,2,0,0,0,1
1,2,39.86542,-84.0628,36.9,63.0,91.0,29.68,10.0,Calm,7.0,...,1,1,1,1,2016,2,0,5,0,1
2,2,39.10266,-84.52468,36.0,63.0,97.0,29.7,10.0,Calm,7.0,...,1,1,1,0,2016,2,0,6,0,1
3,2,41.06213,-81.53784,39.0,63.0,55.0,29.65,10.0,Calm,7.0,...,1,1,0,0,2016,2,0,6,0,1
4,3,39.172393,-84.492792,37.0,29.8,93.0,29.69,10.0,SW,10.4,...,0,0,0,0,2016,2,0,7,0,1


In [None]:
def class_breakdown(df: pd.DataFrame, label="Severity"):
    """Summarise how the rows of ``df`` are distributed over ``label``.

    Args:
        df: Frame containing the column ``label``.
        label: Name of the column to summarise.

    Returns:
        A DataFrame indexed by the distinct values of ``label`` (most frequent
        first) with two columns: ``count`` (number of rows) and ``percentage``
        (share of all non-null rows, 0-100).
    """
    label_counts = df[label].value_counts()

    # Built directly from the counts instead of merging two value_counts()
    # results: on pandas < 2 both Series are named after the label, so a merge
    # "on=label" matches counts against percentages and comes back empty.
    breakdown = pd.DataFrame({
        "count": label_counts,
        "percentage": label_counts / label_counts.sum() * 100,
    })

    return breakdown.rename_axis(label)


In [None]:
def stratified_sampling(df, label, sample_size, random_state=42):
    """Draw the same fraction of rows from every class of ``label``.

    Args:
        df: Frame to sample from.
        label: Column whose distinct values define the strata.
        sample_size: Fraction (0-1) of each stratum to keep; despite the
            name it is passed to ``DataFrame.sample`` as ``frac``.
        random_state: Seed used for every stratum. The default of 42 keeps
            the sample identical to earlier runs.

    Returns:
        A DataFrame holding the sampled rows of all strata, grouped by
        stratum in sorted label order, with the original index values. An
        empty frame with the columns of ``df`` is returned when there is
        nothing to sample.
    """
    strata = [
        group.sample(frac=sample_size, replace=False, random_state=random_state)
        for _, group in df.groupby(label)
    ]

    # pd.concat raises on an empty list, which happens when df has no rows.
    if not strata:
        return df.iloc[:0].copy()

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return pd.concat(strata)

In [None]:
# df2 = df.sample(frac=0.5, random_state=42)
# df2.dropna(inplace=True)

In [None]:
# Keep half of every Severity class, then show the absolute and relative
# class sizes of the sample.
df2 = stratified_sampling(df, "Severity", 0.5)
severity_counts = df2["Severity"].value_counts()
print(severity_counts,                          # rows per severity
      "\n",
      severity_counts / severity_counts.sum())  # share per severity

2    1179603
3      76546
4      64606
1      12524
Name: Severity, dtype: int64 
 2    0.884738
3    0.057412
4    0.048456
1    0.009393
Name: Severity, dtype: float64


## Train / Validation / Test Split

In [None]:
# sev_2_samples = df2[df2['Severity'] == 2].sample(n=df2["Severity"].value_counts().nlargest(2).iloc[-1],
#                                                  random_state=42)
# df2 = df2[df2['Severity'] != 2]
# df2 = pd.concat([df2, sev_2_samples])
# print(df2["Severity"].value_counts(), "\n", df2["Severity"].value_counts()/df2["Severity"].value_counts().sum())

In [None]:
# 70 % training / 30 % hold-out, keeping the Severity proportions.
train_data, test_val_data = train_test_split(df2, stratify=df2["Severity"], test_size=0.30, random_state=42)

# Halve the hold-out into test and validation. Stratified like the first
# split, so the rare severity classes are shared evenly between the two
# halves instead of being left to chance.
test_data, val_data = train_test_split(test_val_data, stratify=test_val_data["Severity"], test_size=0.50, random_state=42)

In [None]:
# Sanity check: (rows, columns) of the combined hold-out set.
test_val_data.shape

(399984, 34)

In [None]:
# Targets: Severity shifted down by one so that class ids start at 0.
y_val = val_data['Severity'].sub(1)
y_test = test_data['Severity'].sub(1)

# Features: every remaining column, not yet encoded or scaled.
X_val_unnorm = val_data.drop(columns='Severity')
X_test_unnorm = test_data.drop(columns='Severity')

In [None]:
# sev_2_samples = train_data[train_data['Severity'] == 2].sample(n=train_data["Severity"].value_counts().nlargest(2).iloc[-1],
#                                                  random_state=42)
# train_data = train_data[train_data['Severity'] != 2]
# train_data = pd.concat([train_data, sev_2_samples])
# print(train_data["Severity"].value_counts(), "\n", train_data["Severity"].value_counts()/train_data["Severity"].value_counts().sum())

In [None]:
# Same feature/target separation as for the hold-out sets: labels shifted to
# start at 0, features left unencoded and unscaled.
y_train = train_data['Severity'].sub(1)
X_train_unnorm = train_data.drop(columns='Severity')

In [None]:
# Inspect which feature columns are present at this point.
X_train_unnorm.columns

Index(['Start_Lat', 'Start_Lng', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Sunrise_Sunset_Night', 'Civil_Twilight_Night',
       'Nautical_Twilight_Night', 'Astronomical_Twilight_Night', 'year',
       'month', 'Weekday', 'Hour', 'Side_L', 'Side_R'],
      dtype='object')

In [None]:
label_encoder = LabelEncoder()

# "Side" was one-hot encoded earlier; the remaining categorical columns are
# mapped to integer codes. The encoder is re-fitted per column on the training
# split only, and the same mapping is then applied to all three splits.
for col in categorical_cols.drop("Side"):
    label_encoder.fit(X_train_unnorm[col])
    for frame in (X_train_unnorm, X_test_unnorm, X_val_unnorm):
        frame[col] = label_encoder.transform(frame[col])

In [None]:
# Side_R is the complement of Side_L (Side only takes the values L and R).
# "year" is dropped as well - NOTE(review): the reason is not stated in the
# notebook. Reassigning with errors="ignore" keeps the cell re-runnable.
DROPPED_FEATURES = ["Side_R", "year"]

X_train_unnorm = X_train_unnorm.drop(columns=DROPPED_FEATURES, errors="ignore")
X_test_unnorm = X_test_unnorm.drop(columns=DROPPED_FEATURES, errors="ignore")
X_val_unnorm = X_val_unnorm.drop(columns=DROPPED_FEATURES, errors="ignore")

In [None]:
# Scaler used to standardise the feature columns; it is fitted in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to the test and validation features.
X_train = scaler.fit_transform(X_train_unnorm)
X_test, X_val = (scaler.transform(part) for part in (X_test_unnorm, X_val_unnorm))

In [None]:
# Shapes of the three feature matrices followed by the three label vectors.
print(*(arr.shape for arr in (X_train, X_test, X_val, y_train, y_test, y_val)))


(933295, 32) (199992, 32) (199992, 32) (933295,) (199992,) (199992,)


In [None]:
# Sorted distinct labels plus, for every training row, the position of its
# label within that sorted array (computed in one vectorised call).
unique_classes, y_train_int = np.unique(y_train, return_inverse=True)
y_train_int = y_train_int.astype(int)

# label -> position lookup for the same mapping.
class_indices = dict(zip(unique_classes, range(len(unique_classes))))

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# One weight per class from scikit-learn's "balanced" heuristic; in the
# recorded output the rarest class gets the largest weight.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)

# float32 copy in the form accepted by nn.CrossEntropyLoss(weight=...).
class_weights_tensor = torch.as_tensor(class_weights, dtype=torch.float32)

print(class_weights_tensor)

tensor([26.6139,  0.2826,  4.3545,  5.1593])


# MLP

In [None]:
torch.manual_seed(11)

# Feature matrices become float tensors.
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float) for features in (X_train, X_val, X_test)
)

# Label Series become long (int64) tensors of class indices.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(labels.to_numpy(), dtype=torch.long) for labels in (y_train, y_val, y_test)
)

In [None]:
# Quick check that the conversion produced a torch.Tensor.
type(X_train_tensor)

torch.Tensor

In [None]:
# Dataset
INPUT_DIM = 32 # be sure to adjust
NUM_CLASSES = 4
LABEL_NAME = "Severity"

# Parameters, Hyperparameters
HIDDEN_DIM = 192
NUM_HIDDEN_LAYERS = 5
HIDDEN_DIM_ARR = [HIDDEN_DIM]*8
DROPOUT_P = 0.2
LEARNING_RATE = 0.1
BATCH_SIZE = 32
NUM_EPOCHS = 10

In [None]:
# Build the network to train. MLPv1 is active; the MLPv2 variant (per-layer
# widths plus dropout) is kept below as a commented-out alternative.
model = MLPv1(INPUT_DIM, HIDDEN_DIM, NUM_HIDDEN_LAYERS, NUM_CLASSES)
# model = MLPv2(INPUT_DIM, HIDDEN_DIM_ARR, NUM_CLASSES, DROPOUT_P)

In [None]:
# Train the model in place. The class_weights argument is commented out, so
# this run uses an unweighted cross-entropy loss (the summary printed by
# train() reports "Class_weights: None").
train(model,
      X_train_tensor,
      y_train_tensor,
      X_val_tensor,
      y_val_tensor,
      # class_weights=class_weights_tensor,
      learning_rate=LEARNING_RATE,
      batch_size=BATCH_SIZE,
      epochs=NUM_EPOCHS)

print(f"Hidden_dim_size: {HIDDEN_DIM}")
# if not sev_2_samples.empty:
#   print("Balanced Severity 2 and 3")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 1/10  Train: 0.887  Val Acc: 0.892  Val F1: 0.870


Epoch 2/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 2/10  Train: 0.894  Val Acc: 0.895  Val F1: 0.861


Epoch 3/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 3/10  Train: 0.896  Val Acc: 0.898  Val F1: 0.874


Epoch 4/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 4/10  Train: 0.898  Val Acc: 0.899  Val F1: 0.879


Epoch 5/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 5/10  Train: 0.899  Val Acc: 0.898  Val F1: 0.880


Epoch 6/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 6/10  Train: 0.901  Val Acc: 0.900  Val F1: 0.878


Epoch 7/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 7/10  Train: 0.901  Val Acc: 0.901  Val F1: 0.881


Epoch 8/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 8/10  Train: 0.902  Val Acc: 0.901  Val F1: 0.884


Epoch 9/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 9/10  Train: 0.903  Val Acc: 0.899  Val F1: 0.876


Epoch 10/10:   0%|          | 0/29166 [00:00<?, ?it/s]

 Epoch 10/10  Train: 0.903  Val Acc: 0.901  Val F1: 0.882
Training took 1165.15 seconds

Best epoch was 9, val_acc=0.901
Class_weights: None
Learning_rate: 0.1
Batch_size: 32
Epochs: 10
Hidden_dim_size: 192


In [None]:
# Report per-class precision / recall / F1 on the held-out test set only.
# The train and validation reports are kept below but disabled.
# train_report = evaluate(model, X_train_tensor, y_train_tensor)
# val_report = evaluate(model, X_val_tensor, y_val_tensor)
test_report = evaluate(model, X_test_tensor, y_test_tensor)

# Banner and section titles are centred in a 53-character field.
print("="*53, "\n", f"{'Model Evaluation':^53}", "\n" + "="*53)
# print("\n", f"{'TRAIN':^53}")
# print(train_report)
# print("\n", f"{'VALIDATION':^53}")
# print(val_report)
print("\n", f"{'TEST':^53}")
print(test_report)



                   Model Evaluation                    

                         TEST                         
              precision    recall  f1-score   support

  Severity 1       0.72      0.38      0.50      1935
  Severity 2       0.92      0.99      0.95    176868
  Severity 3       0.59      0.30      0.40     11480
  Severity 4       0.53      0.20      0.29      9709

    accuracy                           0.90    199992
   macro avg       0.69      0.46      0.53    199992
weighted avg       0.88      0.90      0.88    199992

