In [1]:
import pandas as pd
import numpy as np


#### Reading data

In [2]:
CA_data = pd.read_csv('CA_data.csv')

columns = ['Weather Vehicle 1', 'Lighting Vehicle 1', 'Roadway Surface Vehicle 1', 
           'Movement Preceding Collision Vehicle 1','Vehicle 1 was Stopped in Traffic', 
           'Vehicle 1 was Moving','Number of Vehicles involved in Accident (w V1)']



### Preprocess Data

In [3]:
# dropping columns with null data
for col in columns:
    print("\n")
    CA_data.drop(CA_data[CA_data[col] == 'Not Available'].index, inplace=True)
    



















### Grouping data



#### Column Weather Vehicle 1
We are grouping Fog/visiblity with Dark because they are visual impariements\
We are grouping slippery, wind with raining because when it rains, it usally reults in slipperiness and sometimes wind\

In [4]:
# grouping less frequent data 
CA_data['Weather Vehicle 1'].replace(['Fog/Visibility', 'Dark'], 'Fog/Visibility/Dark', inplace=True)
CA_data['Weather Vehicle 1'].replace(['Slippery', 'Wind'], 'Raining', inplace=True)

print(CA_data['Weather Vehicle 1'].value_counts())

Weather Vehicle 1
Clear                  263
Cloudy                  29
Raining                 14
Fog/Visibility/Dark      4
Name: count, dtype: int64


#### Lighting Vehicle 1
Grouping no unusla condition and daylight b/c they basically the same\
Grouping Dusk/dawn with dark with street lights b/c they are relatively similar conditions\

In [5]:
CA_data['Lighting Vehicle 1'].replace(['No unusual condition'], 'Daylight', inplace=True)
CA_data['Lighting Vehicle 1'].replace(['Dusk/Dawn'], 'Dark w Street-lights', inplace=True)


print(CA_data['Lighting Vehicle 1'].value_counts())

Lighting Vehicle 1
Daylight                                225
Dark w Street-lights                     81
No unusual conditions                     2
Dark w Non-functioning Street-lights      2
Name: count, dtype: int64


#### Movement Preceding Collision Vehicle 1
Wet roads = slippery roads\
Dropped other and proceeding straight b/c they dont provide much value\

In [6]:

CA_data['Roadway Surface Vehicle 1'].replace(['Slippery'], 'Wet', inplace=True)
CA_data.drop(CA_data[(CA_data['Roadway Surface Vehicle 1'] == 'Other') | (CA_data['Roadway Surface Vehicle 1'] == 'Proceeding Straight')].index, inplace=True)

print(CA_data['Roadway Surface Vehicle 1'].value_counts())


Roadway Surface Vehicle 1
Dry    286
Wet     22
Name: count, dtype: int64


### Movement Preceding Collusion vehicle 1

#### grouping turns, high movements, parking, and dropping others

In [7]:
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Making Right Turn', 'Making Left Turn', 'Making U turn', 'Making Right Turn, Slowing/Stopping'], 'Turning', inplace=True)
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Changing Lanes', 'Entering Traffic', 'Entrering Traffic','Xing into opposing lane', 'Passing Other Vehicle'], 'Highway movement', inplace=True)
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Slowing/Stopping', 'Stopped in Traffic', 'Parked', 'Stopped, Merging'], 'Stopped', inplace=True)
CA_data['Movement Preceding Collision Vehicle 1'].replace(['Parking Manuerver'], 'Parking', inplace=True)
CA_data.drop(CA_data[(CA_data['Movement Preceding Collision Vehicle 1'] == 'Other')].index, inplace=True)

print(CA_data['Movement Preceding Collision Vehicle 1'].value_counts())

Movement Preceding Collision Vehicle 1
Stopped                151
Proceeding Straight     91
Turning                 29
Parking                 11
Highway movement        10
Backing                  8
Parking Manuever         3
Name: count, dtype: int64


In [8]:
print(columns)

['Weather Vehicle 1', 'Lighting Vehicle 1', 'Roadway Surface Vehicle 1', 'Movement Preceding Collision Vehicle 1', 'Vehicle 1 was Stopped in Traffic', 'Vehicle 1 was Moving', 'Number of Vehicles involved in Accident (w V1)']


Fixing some off the mislabeled data

In [9]:

# print(CA_data['Vehicle 1 was Stopped in Traffic'].value_counts())  
# print(CA_data['Vehicle 1 was Moving'].value_counts())

CA_data['Vehicle 1 was Stopped in Traffic'].replace(['/Off'], 'Yes', inplace=True)
CA_data['Vehicle 1 was Moving'].replace(['/Off'], 'No', inplace=True)

CA_data['Vehicle 1 was Stopped in Traffic'].replace(['yes'], 'Yes', inplace=True)
CA_data['Vehicle 1 was Moving'].replace(['Moving'], 'Yes', inplace=True)

print(CA_data['Vehicle 1 was Stopped in Traffic'].value_counts())  
print(CA_data['Vehicle 1 was Moving'].value_counts())

Vehicle 1 was Stopped in Traffic
No     192
Yes    165
Name: count, dtype: int64
Vehicle 1 was Moving
Yes    183
No     174
Name: count, dtype: int64


In [10]:
print(CA_data['Number of Vehicles involved in Accident (w V1)'].value_counts())

Number of Vehicles involved in Accident (w V1)
2.0    305
1.0     43
3.0      6
Name: count, dtype: int64


In [11]:
# one hot encode our new columns
categorical_columns = ['Weather Vehicle 1', 'Lighting Vehicle 1', 'Roadway Surface Vehicle 1', 
                        'Movement Preceding Collision Vehicle 1']

encoded_CA_data = pd.get_dummies(CA_data, columns=categorical_columns, prefix='encoded_')
encoded_columns = encoded_CA_data.columns[encoded_CA_data.columns.str.startswith('encoded_')]



Preprocess outputs

In [12]:
from sklearn.preprocessing import LabelEncoder

encoded_CA_data.drop(encoded_CA_data[(encoded_CA_data['Vehicle Damage'] == 'Not Available')].index, inplace=True)



label_encoder = LabelEncoder()
encoded_CA_data['Vehicle Damage'] = label_encoder.fit_transform(encoded_CA_data['Vehicle Damage'])
encoded_CA_data['Vehicle 1 was Stopped in Traffic'] = label_encoder.fit_transform(encoded_CA_data['Vehicle 1 was Stopped in Traffic'])

encoded_CA_data['Vehicle 1 was Moving'] = label_encoder.fit_transform(encoded_CA_data['Vehicle 1 was Moving'])


print(encoded_CA_data['Vehicle Damage'].value_counts())
print(encoded_CA_data['Vehicle 1 was Stopped in Traffic'].value_counts())
print(encoded_CA_data['Vehicle 1 was Moving'].value_counts())



Vehicle Damage
1    217
3     53
2     49
0      8
Name: count, dtype: int64
Vehicle 1 was Stopped in Traffic
0    175
1    152
Name: count, dtype: int64
Vehicle 1 was Moving
1    169
0    158
Name: count, dtype: int64


In [13]:
# Split data into training and testing
from sklearn.model_selection import train_test_split

X = encoded_CA_data[encoded_columns]
y = encoded_CA_data['Vehicle Damage']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   



In [14]:
y.value_counts()

Vehicle Damage
1    217
3     53
2     49
0      8
Name: count, dtype: int64

### SVM

In [68]:
from sklearn import svm
from sklearn.model_selection import cross_val_score


clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, X, y, cv=7)

print(scores)
print(np.mean(scores))




[0.65957447 0.63829787 0.74468085 0.82978723 0.80851064 0.76086957
 0.67391304]
0.730804810360777


In [95]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

parameters = {
    'n_estimators': [25,50,75],
    'max_depth': [16,24,32,64,128],
    'min_samples_split': [7,9,10,11,13],
    'min_samples_leaf': [4,6,8]

}
rf = RandomForestClassifier(random_state=42)

clf = RandomizedSearchCV(rf, parameters, n_iter=10, cv=7, random_state=42)

clf.fit(X_train, y_train)
# {'n_estimators': 50, 'min_samples_split': 13, 'min_samples_leaf': 4, 'max_depth': 64}
clf.best_params_





{'n_estimators': 25,
 'min_samples_split': 7,
 'min_samples_leaf': 8,
 'max_depth': 32}

In [105]:
new_rf = RandomForestClassifier(n_estimators=25, min_samples_split=7, min_samples_leaf=8, max_depth=32, random_state=42)
scores = cross_val_score(new_rf, X, y, cv=7)

print(scores)
print(np.mean(scores))

[0.65957447 0.63829787 0.74468085 0.80851064 0.80851064 0.76086957
 0.67391304]
0.7277652966829654


In [89]:
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)
parameters = {
    'n_estimators': [25,50,75],
    'max_depth': [16,24,32,64,128],
    'min_child_weight': [1,3,5,7],
    'learning_rate': [0.1,0.01,0.001]
}

clf = RandomizedSearchCV(xgb, parameters, n_iter=10, cv=6, random_state=42)
clf.fit(X_train,y_train)
clf.best_params_


{'n_estimators': 25,
 'min_child_weight': 5,
 'max_depth': 64,
 'learning_rate': 0.1}

In [92]:
# best_xgb 
best_xgb = XGBClassifier(n_estimators=25, min_child_weight=5, max_depth=64, learning_rate=0.1, random_state=42)
scores = cross_val_score(best_xgb, X, y, cv=7)
print(scores)
print(np.mean(scores))


[0.65957447 0.63829787 0.74468085 0.80851064 0.78723404 0.76086957
 0.67391304]
0.724725783005154


### Neural network

In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class FNN(nn.Module):
    def __init__(self, input_channels, output_channels):

        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_channels, 25)
        self.fc2 = nn.Linear(25, 4)
        self.fc3 = nn.Linear(4, output_channels)
        

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [60]:
y_train.value_counts()

Vehicle Damage
1    174
3     41
2     40
0      6
Name: count, dtype: int64

In [61]:
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values


X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_np, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)


In [62]:
# model
model = FNN(X_train_tensor.shape[1], y.value_counts().shape[0])
print(X_train_tensor.shape[1], y.value_counts().shape[0])
device = 'mps'
print(torch.__version__)
model.to(device)   

# hyper parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 200


15 4
2.3.0


In [63]:
from sklearn.metrics import accuracy_score


for epoch in range(epochs):
    inputs, targets = X_train_tensor.to(device), y_train_tensor.to(device)
    optimizer.zero_grad()  # Zero the gradients
    outputs = model(inputs)  # Forward pass
    loss = criterion(outputs, targets)  # Compute the loss
    loss.backward()  # Backward pass (compute gradients)
    optimizer.step()  # Update the weights

    if epoch % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')



Epoch [1/200], Loss: 1.566071629524231
Epoch [11/200], Loss: 1.5414338111877441
Epoch [21/200], Loss: 1.5207993984222412
Epoch [31/200], Loss: 1.5018703937530518
Epoch [41/200], Loss: 1.4766266345977783
Epoch [51/200], Loss: 1.4450372457504272
Epoch [61/200], Loss: 1.4069111347198486
Epoch [71/200], Loss: 1.3535767793655396
Epoch [81/200], Loss: 1.2886368036270142
Epoch [91/200], Loss: 1.2217596769332886
Epoch [101/200], Loss: 1.1610654592514038
Epoch [111/200], Loss: 1.1113449335098267
Epoch [121/200], Loss: 1.0720146894454956
Epoch [131/200], Loss: 1.0400640964508057
Epoch [141/200], Loss: 1.013671636581421
Epoch [151/200], Loss: 0.9911019802093506
Epoch [161/200], Loss: 0.9715713858604431
Epoch [171/200], Loss: 0.9544041752815247
Epoch [181/200], Loss: 0.938666820526123
Epoch [191/200], Loss: 0.9239813089370728


In [93]:
# Evaluation on test data

model.eval()
with torch.no_grad():
    inputs, targets = X_test_tensor.to(device), y_test_tensor.to(device)
    outputs = model(inputs)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(predicted.cpu(), targets.cpu())
    
    print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.6515151515151515
