<a href="https://colab.research.google.com/github/shane-moxley/accident-fatality-prediction/blob/main/AI4ALL_Car_Accidents_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [None]:
# Data preprocessing and visualization
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_decision_regions

# For KMeans
from sklearn.cluster import KMeans as KMeans

# For the Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

# For the KNN algorithm
from sklearn.neighbors import KNeighborsClassifier

# For the GaussianNB algorithm
from sklearn.naive_bayes import GaussianNB

# For the Logistic Regression algorithm
from sklearn.linear_model import LogisticRegression

%matplotlib inline
df = pd.read_csv('https://query.data.world/s/blqa522e3ejv23n7ifrh22574g2nxo')

Head of uncleaned dataset

In [None]:
# Preview the first rows of the raw frame before any column selection or renaming.
df.head()

Unnamed: 0,A_CRAINJ,A_CT,A_D15_19,A_D15_20,A_D16_19,A_D16_20,A_D16_24,A_D21_24,A_D65PLS,A_DIST,...,A_SPCRA,A_TOD,BIA,COUNTY,FATALS,INDIAN_RES,SPJ_INDIAN,STATE,ST_CASE,YEAR
0,1,1,2,2,2,2,2,2,1,2,...,2,1,,21,2,,,1,10001,1982
1,1,1,2,2,2,2,2,2,2,2,...,2,2,,97,1,,,1,10002,1982
2,1,1,2,2,2,2,2,2,2,2,...,1,1,,103,1,,,1,10003,1982
3,1,1,1,1,1,1,1,2,2,2,...,1,2,,45,2,,,1,10004,1982
4,1,1,2,2,2,2,2,2,2,2,...,2,1,,97,1,,,1,10005,1982


Data preprocessing

In [None]:
# Data preprocessing: keep rows from 2010 onward, select the modelling columns,
# give them readable names, and min-max scale the features.
# TODO: see if you can make fatalities binary, i.e. make 1 fatality == 0 and
# fatalities greater than 1 == 1.
year_select_df = df[df['YEAR'] >= 2010]

selected_columns = ['FATALS', 'A_CRAINJ', 'A_CT', 'A_D15_20', 'A_D21_24', 'A_D65PLS',
                    'A_DIST', 'A_DOW', 'A_DROWSY', 'A_POSBAC', 'A_SPCRA',
                    'A_TOD', 'YEAR']
readable_names = {'FATALS': 'FATALITIES', 'A_CRAINJ': 'INJURY_TYPE', 'A_CT': 'CRASH_TYPE',
                  'A_D15_20': 'BTWN_15_20', 'A_D21_24': 'BTWN_21_24', 'A_D65PLS': '65PLS',
                  'A_DIST': 'DISTRACTED', 'A_DOW': 'DOW', 'A_DROWSY': 'DROWSY',
                  'A_POSBAC': 'POSBAC', 'A_SPCRA': 'SPEEDING', 'A_TOD': 'TOD'}

# Assign the renamed result instead of renaming in place: the selection is
# derived from a slice of `df`, and in-place edits on it raise
# SettingWithCopyWarning and make the cell non-idempotent on re-run.
clean_df = year_select_df[selected_columns].rename(columns=readable_names)

# Declares feature vector and target variable.
# The target column is removed from the features so the models fitted on X
# never see the value they are later compared against.
y = clean_df['FATALITIES']
feature_df = clean_df.drop(columns=['FATALITIES'])

# Feature scaling: MinMaxScaler returns a bare array, so rebuild the frame with
# the original (flat) column names and the original row index. Passing
# `columns=[cols]` would wrap the names in a one-level MultiIndex.
cols = feature_df.columns
ms = MinMaxScaler()
X = pd.DataFrame(ms.fit_transform(feature_df), columns=cols, index=feature_df.index)
X.head()

Preliminary analysis to determine the optimal number of clusters for KMeans

Elbow method to find optimal number of clusters


In [None]:
# Elbow method: fit one KMeans model per candidate cluster count (1 through 49)
# and record the fitted model's `inertia_` for each.
# KMeans is already imported in the notebook's import cell.
cluster_counts = range(1, 50)
cs = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    .fit(X)
    .inertia_
    for k in cluster_counts
]

# Plot inertia against the number of clusters; the bend ("elbow") in the curve
# is the candidate for the number of clusters to use.
plt.plot(cluster_counts, cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()


Silhouette Method to find optimal number of clusters

In [None]:
import sklearn.cluster as cluster
import sklearn.metrics as metrics

# Silhouette method: for each candidate cluster count (2 through 49), cluster X
# with KMeans and score the resulting labelling with the silhouette coefficient.
# The score is estimated on a fixed-seed sample of 1000 rows (`sample_size`).
candidate_counts = range(2, 50)
ss = []
for k in candidate_counts:
    model = cluster.KMeans(n_clusters=k, init="k-means++", max_iter=3000,
                           n_init=100, random_state=0)
    model.fit(X)
    coefficient = metrics.silhouette_score(X, model.labels_, metric="euclidean",
                                           sample_size=1000, random_state=0)
    ss.append(coefficient)

# Plot the silhouette coefficient against the number of clusters.
plt.plot(candidate_counts, ss)
plt.title("The Silhouette Method")
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()


Creation of KMeans models and analysis of their accuracy

In [None]:
# Evaluate KMeans against the known target for 2 through 15 clusters.
#
# KMeans cluster ids are arbitrary integers in [0, n_clusters), so testing
# `y == labels` directly does not measure agreement with the target. Instead,
# each cluster is assigned the most common target value among its members, and
# ACCURACY is the fraction of samples whose cluster's majority value equals
# their own target value.
# NOTE(review): this score tends to rise as the number of clusters grows, so
# compare it across cluster counts with care.
y_true = pd.Series(np.asarray(y))  # positional copy; rows of X and y line up by position
scores = []
for clusters in range(2, 16):
    kmeans = KMeans(n_clusters=clusters, random_state=0)
    labels = kmeans.fit(X).labels_
    # Most frequent target value per cluster (ties resolve to the smallest value).
    majority_by_cluster = y_true.groupby(labels).agg(lambda s: s.mode().iloc[0])
    predicted = pd.Series(labels).map(majority_by_cluster)
    # check how many of the samples were correctly labeled
    correct_labels = int((predicted.to_numpy() == y_true.to_numpy()).sum())
    scores.append([clusters, correct_labels / float(y_true.size)])
KMeans_scores = pd.DataFrame(scores, columns=["CLUSTERS", "ACCURACY"])
display(KMeans_scores)

Training and testing data set selection for the supervised classifiers (KNN, Gaussian Naive Bayes, Logistic Regression, and Random Forest)

In [None]:
# Split the cleaned data into features and target, then into train/test sets.
# The target column is named 'FATALITIES' in `clean_df` (it was renamed from
# 'FATALS' during preprocessing), so selecting 'FATALS' here raises a KeyError.
target_column = 'FATALITIES'
X, y = clean_df.drop(columns=[target_column]), clean_df[target_column]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=123,
                                                    shuffle=True)
print(f'X_train.shape: {X_train.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_test.shape: {y_test.shape}')

KNN, Gaussian Naive Bayes, and Logistic Regression

In [None]:
# Fit three baseline classifiers, report train/test accuracy, and draw each
# model's decision regions over the first two feature columns.
#
# mlxtend's plot_decision_regions requires NumPy arrays (with an integer target)
# and can only draw two features at a time. When the model was trained on more
# than two features, every feature that is not plotted must be pinned to a
# filler value; the training-set median of each column is used here.
X_train_arr, X_test_arr = np.asarray(X_train), np.asarray(X_test)
y_train_arr = np.asarray(y_train).astype(int)
y_test_arr = np.asarray(y_test).astype(int)

plot_features = [0, 1]
column_medians = np.median(X_train_arr, axis=0)
filler_values = {idx: value for idx, value in enumerate(column_medians)
                 if idx not in plot_features}
# Use the column names for axis labels when X_train is a DataFrame.
feature_names = [str(name) for name in getattr(X_train, 'columns', range(X_train_arr.shape[1]))]

fig, axes = plt.subplots(3, figsize=(10, 10))
classifiers = [KNeighborsClassifier(), GaussianNB(), LogisticRegression()]
for model, ax in zip(classifiers, axes):
    # Fit and score on arrays throughout so the plotting call (which predicts on
    # plain arrays) matches the format the model was trained on.
    clf = model.fit(X_train_arr, y_train_arr)
    print("=====================================")
    print("=====================================")
    print(f"For {model}: \n")
    print(f'Training set accuracy: {clf.score(X_train_arr, y_train_arr)*100:.2f}%')
    print(f'Test set accuracy: {clf.score(X_test_arr, y_test_arr)*100:.2f}%')
    plot_decision_regions(X_train_arr, y_train_arr, clf,
                          feature_index=plot_features,
                          filler_feature_values=filler_values or None,
                          ax=ax)
    ax.set_title(str(model))
    ax.set_xlabel(feature_names[plot_features[0]])
    ax.set_ylabel(feature_names[plot_features[1]])
fig.tight_layout()

Random Forest

In [None]:
# Small grid search over random forest size and per-split feature budget,
# scored by accuracy on the held-out test set.
features_per_tree = ["sqrt", "log2", None]     # None means max_features = n_features
scores = []
for n_trees in range(30, 80, 10):
    for n_features in features_per_tree:
        # A fixed random_state makes the forest (bootstrap samples and feature
        # draws) reproducible, so the table below is stable across re-runs.
        model2 = RandomForestClassifier(n_estimators=n_trees,
                                        max_features=n_features,
                                        random_state=123)
        model2.fit(X_train, y_train)
        scores.append([n_trees, n_features, model2.score(X_test, y_test)])
RFscores = pd.DataFrame(scores, columns=["NUM_TREES", "FEATS_PER_TREE", "ACCURACY"])
display(RFscores)