In [607]:
import pandas as pd
import numpy as np


#### Reading data

In [608]:
CA_data = pd.read_csv('CA_data.csv')

columns = ['Weather Vehicle 1', 'Lighting Vehicle 1', 'Roadway Surface Vehicle 1', 'Movement Preceding Collision Vehicle 1']

### Preprocess Data

In [609]:
# dropping columns with null data
for col in columns:
    print("\n")
    CA_data.drop(CA_data[CA_data[col] == 'Not Available'].index, inplace=True)
    print(CA_data[col].value_counts())





Weather Vehicle 1
Clear             265
Cloudy             29
Raining            10
Fog/Visibility      3
Wind                3
Slippery            1
Dark                1
Name: count, dtype: int64


Lighting Vehicle 1
Daylight                                227
Dark w Street-lights                     76
Dusk/Dawn                                 5
No unusual conditions                     2
Dark w Non-functioning Street-lights      2
Name: count, dtype: int64


Roadway Surface Vehicle 1
Dry                    286
Wet                     14
Slippery                 8
Other                    1
Proceeding Straight      1
Name: count, dtype: int64


Movement Preceding Collision Vehicle 1
Stopped                                130
Proceeding Straight                     91
Slowing/Stopping                        19
Making Right Turn                       15
Making Left Turn                        12
Parking                                 10
Backing                                  8
Ch

### Grouping data

#### Column Weather Vehicle 1
We are grouping Fog/visiblity with Dark because they are visual impariements\
We are grouping slippery, wind with raining because when it rains, it usally reults in slipperiness and sometimes wind\

#### Lighting Vehicle 1
Grouping no unusla condition and daylight b/c they basically the same\
Grouping Dusk/dawn with dark with street lights b/c they are relatively similar conditions\

#### Movement Preceding Collision Vehicle 1
Wet roads = slippery roads\
Dropped other and proceeding straight b/c they dont provide much value\



In [610]:
# grouping less frequent data 
CA_data['Weather Vehicle 1'].replace(['Fog/Visibility', 'Dark'], 'Fog/Visibility/Dark', inplace=True)
CA_data['Weather Vehicle 1'].replace(['Slippery', 'Wind'], 'Raining', inplace=True)

CA_data['Lighting Vehicle 1'].replace(['No unusual condition'], 'Daylight', inplace=True)
CA_data['Lighting Vehicle 1'].replace(['Dusk/Dawn'], 'Dark w Street-lights', inplace=True)


CA_data['Roadway Surface Vehicle 1'].replace(['Slippery'], 'Wet', inplace=True)
CA_data.drop(CA_data[(CA_data['Roadway Surface Vehicle 1'] == 'Other') | (CA_data['Roadway Surface Vehicle 1'] == 'Proceeding Straight')].index, inplace=True)

CA_data['Movement Preceding Collision Vehicle 1'].replace(['Making Right Turn', 'Making Left Turn', 'Making U turn', 'Making Right Turn, Slowing/Stopping'], 'Turning', inplace=True)
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Changing Lanes', 'Entering Traffic', 'Entrering Traffic','Xing into opposing lane', 'Passing Other Vehicle'], 'Highway movement', inplace=True)
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Slowing/Stopping', 'Stopped in Traffic', 'Parked', 'Stopped, Merging'], 'Stopped', inplace=True)
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Parking Manuerver'], 'Parking', inplace=True)
CA_data.drop(CA_data[(CA_data['Movement Preceding Collision Vehicle 1'] == 'Other')].index, inplace=True)


for col in columns:
    print()
    print(CA_data[col].value_counts())


Weather Vehicle 1
Clear                  261
Cloudy                  28
Raining                 11
Fog/Visibility/Dark      3
Name: count, dtype: int64

Lighting Vehicle 1
Daylight                224
Dark w Street-lights     79
Name: count, dtype: int64

Roadway Surface Vehicle 1
Dry    283
Wet     20
Name: count, dtype: int64

Movement Preceding Collision Vehicle 1
Stopped                151
Proceeding Straight     91
Turning                 29
Parking                 11
Highway movement        10
Backing                  8
Parking Manuever         3
Name: count, dtype: int64


In [611]:
# one hot encode our new columns
encoded_CA_data = pd.get_dummies(CA_data, columns=columns, prefix='encoded_')
encoded_columns = encoded_CA_data.columns[encoded_CA_data.columns.str.startswith('encoded_')]



Preprocess outputs

In [612]:
from sklearn.preprocessing import LabelEncoder

encoded_CA_data.drop(encoded_CA_data[(encoded_CA_data['Vehicle Damage'] == 'Not Available')].index, inplace=True)



label_encoder = LabelEncoder()
encoded_CA_data['Vehicle Damage'] = label_encoder.fit_transform(encoded_CA_data['Vehicle Damage'])

print(encoded_CA_data['Vehicle Damage'].value_counts())


Vehicle Damage
1    217
3     53
2     49
0      8
Name: count, dtype: int64


In [613]:
# Split data into training and testing
from sklearn.model_selection import train_test_split

X = encoded_CA_data[encoded_columns]
y = encoded_CA_data['Vehicle Damage']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   



In [614]:
y.value_counts()

Vehicle Damage
1    217
3     53
2     49
0      8
Name: count, dtype: int64

### SVM

In [615]:
from sklearn import svm
from sklearn.model_selection import cross_val_score


clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, X, y, cv=7)

scores




array([0.65957447, 0.63829787, 0.74468085, 0.82978723, 0.80851064,
       0.76086957, 0.67391304])

### Neural network

In [616]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class FNN(nn.Module):
    def __init__(self, input_channels, output_channels):

        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_channels, 6)
        self.fc2 = nn.Linear(6, 4)
        self.fc3 = nn.Linear(4, output_channels)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [617]:
y_train.value_counts()

Vehicle Damage
1    174
3     41
2     40
0      6
Name: count, dtype: int64

In [618]:
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values


X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)


In [619]:
# model
model = FNN(X_train_tensor.shape[1], y.value_counts().shape[0])
print(X_train_tensor.shape[1], y.value_counts().shape[0])
device = 'mps'
model.to(device)

# hyper parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 100


15 4


In [620]:
from sklearn.metrics import accuracy_score


for epoch in range(epochs):
    inputs, targets = X_train_tensor.to(device), y_train_tensor.to(device)
    optimizer.zero_grad()  # Zero the gradients
    outputs = model(inputs)  # Forward pass
    loss = criterion(outputs, targets)  # Compute the loss
    loss.backward()  # Backward pass (compute gradients)
    optimizer.step()  # Update the weights

    
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')



Epoch [1/100], Loss: 1.363842248916626
Epoch [2/100], Loss: 1.3598623275756836
Epoch [3/100], Loss: 1.3557498455047607
Epoch [4/100], Loss: 1.3516643047332764
Epoch [5/100], Loss: 1.3476084470748901
Epoch [6/100], Loss: 1.3435776233673096
Epoch [7/100], Loss: 1.3395769596099854
Epoch [8/100], Loss: 1.3356046676635742
Epoch [9/100], Loss: 1.3316713571548462
Epoch [10/100], Loss: 1.3277736902236938
Epoch [11/100], Loss: 1.3239063024520874
Epoch [12/100], Loss: 1.3200697898864746
Epoch [13/100], Loss: 1.3162603378295898
Epoch [14/100], Loss: 1.312514066696167
Epoch [15/100], Loss: 1.3088107109069824
Epoch [16/100], Loss: 1.305137276649475
Epoch [17/100], Loss: 1.3014858961105347
Epoch [18/100], Loss: 1.2978638410568237
Epoch [19/100], Loss: 1.2942626476287842
Epoch [20/100], Loss: 1.2906603813171387
Epoch [21/100], Loss: 1.287085771560669
Epoch [22/100], Loss: 1.2835372686386108
Epoch [23/100], Loss: 1.2800122499465942
Epoch [24/100], Loss: 1.2765142917633057
Epoch [25/100], Loss: 1.27303

In [621]:
# Evaluation on test data

model.eval()
with torch.no_grad():
    inputs, targets = X_test_tensor.to(device), y_test_tensor.to(device)
    outputs = model(inputs)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(predicted.cpu(), targets.cpu())
    
    print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.6515151515151515
