In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
import warnings

init_notebook_mode(connected=True)

warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
 from google.colab import files
# uploaded = files.upload()
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
file_id = "1ymI30vvIKRS2kQFoHQqCuLRxlCP9zIDD"
downloaded = drive.CreateFile({'id':file_id}) 
downloaded.GetContentFile('Accidents_data_file.csv')  
data = pd.read_csv('Accidents_data_file.csv')

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906610 entries, 0 to 2906609
Data columns (total 47 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Severity               int64  
 2   Start_Time             object 
 3   End_Time               object 
 4   Start_Lat              float64
 5   Start_Lng              float64
 6   End_Lat                float64
 7   End_Lng                float64
 8   Distance(mi)           float64
 9   Description            object 
 10  Number                 float64
 11  Street                 object 
 12  Side                   object 
 13  City                   object 
 14  County                 object 
 15  State                  object 
 16  Zipcode                object 
 17  Country                object 
 18  Timezone               object 
 19  Airport_Code           object 
 20  Weather_Timestamp      object 
 21  Temperature(F)         float64
 22  Wind_Chill(F)     

In [7]:
data.isna().mean()

ID                       0.000000
Severity                 0.000000
Start_Time               0.000000
End_Time                 0.000000
Start_Lat                0.000000
Start_Lng                0.000000
End_Lat                  0.097303
End_Lng                  0.097303
Distance(mi)             0.000000
Description              0.000000
Number                   0.650817
Street                   0.000000
Side                     0.000000
City                     0.000037
County                   0.000000
State                    0.000000
Zipcode                  0.000383
Country                  0.000000
Timezone                 0.001180
Airport_Code             0.002273
Weather_Timestamp        0.016141
Temperature(F)           0.023128
Wind_Chill(F)            0.407299
Humidity(%)              0.024520
Pressure(in)             0.019579
Visibility(mi)           0.024798
Wind_Direction           0.021838
Wind_Speed(mph)          0.105677
Precipitation(in)        0.447713
Weather_Condit

In [8]:
null_columns = ['End_Lat', 'End_Lng', 'Number', 'Wind_Chill(F)', 'Precipitation(in)']

data.drop(null_columns, axis=1, inplace = True)

In [9]:
data.isna().sum()

ID                            0
Severity                      0
Start_Time                    0
End_Time                      0
Start_Lat                     0
Start_Lng                     0
Distance(mi)                  0
Description                   0
Street                        0
Side                          0
City                        108
County                        0
State                         0
Zipcode                    1114
Country                       0
Timezone                   3430
Airport_Code               6608
Weather_Timestamp         46917
Temperature(F)            67224
Humidity(%)               71270
Pressure(in)              56908
Visibility(mi)            72078
Wind_Direction            63474
Wind_Speed(mph)          307163
Weather_Condition         71851
Amenity                       0
Bump                          0
Crossing                      0
Give_Way                      0
Junction                      0
No_Exit                       0
Railway 

In [10]:
data.dropna(axis=0, inplace = True)

In [11]:
data.head()

Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,Side,City,County,State,Zipcode,Country,Timezone,Airport_Code,Weather_Timestamp,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,2,2019-05-21 08:29:55,2019-05-21 09:29:40,34.808868,-82.269157,0.0,Accident on Tanner Rd at Pennbrooke Ln.,Tanner Rd,R,Greenville,Greenville,SC,29607-6027,US,US/Eastern,KGMU,2019-05-21 08:53:00,76.0,52.0,28.91,10.0,N,7.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
1,A-2,2,2019-10-07 17:43:09,2019-10-07 19:42:50,35.09008,-80.74556,0.0,Accident on Houston Branch Rd at Providence Br...,Providence Branch Ln,R,Charlotte,Mecklenburg,NC,28270-8560,US,US/Eastern,KEQY,2019-10-07 17:53:00,76.0,62.0,29.3,10.0,VAR,3.0,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day
2,A-3,2,2020-12-13 21:53:00,2020-12-13 22:44:00,37.14573,-121.985052,1.4,Stationary traffic on CA-17 from Summit Rd (CA...,Santa Cruz Hwy,R,Los Gatos,Santa Clara,CA,95033,US,US/Pacific,KSJC,2020-12-13 21:53:00,51.0,80.0,30.17,10.0,W,6.0,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night
3,A-4,2,2018-04-17 16:51:23,2018-04-17 17:50:46,39.11039,-119.773781,0.0,Accident on US-395 Southbound at Topsy Ln.,US Highway 395 S,R,Carson City,Douglas,NV,89705,US,US/Pacific,KCXP,2018-04-17 16:55:00,53.6,16.0,30.16,10.0,SSW,4.6,Clear,False,False,False,False,False,False,False,False,False,False,False,True,False,Day,Day,Day,Day
4,A-5,3,2016-08-31 17:40:49,2016-08-31 18:10:49,26.102942,-80.265091,0.0,Accident on I-595 Westbound at Exit 4 / Pine I...,I-595 W,R,Fort Lauderdale,Broward,FL,33324,US,US/Eastern,KHWO,2016-08-31 17:53:00,84.2,84.0,29.92,10.0,SSE,13.8,Overcast,False,False,False,False,True,False,False,False,False,False,False,True,False,Day,Day,Day,Day


In [12]:
print("Total missing values:", data.isna().sum().sum())

Total missing values: 0


In [13]:
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'Airport_Code': 1975,
 'Astronomical_Twilight': 2,
 'City': 11367,
 'Civil_Twilight': 2,
 'Country': 1,
 'County': 1707,
 'Description': 1331271,
 'End_Time': 2230030,
 'ID': 2571316,
 'Nautical_Twilight': 2,
 'Side': 3,
 'Start_Time': 2114391,
 'State': 49,
 'Street': 164333,
 'Sunrise_Sunset': 2,
 'Timezone': 4,
 'Weather_Condition': 124,
 'Weather_Timestamp': 478143,
 'Wind_Direction': 23,
 'Zipcode': 339727}

In [14]:
unneeded_columns = ['ID', 'Description', 'Street', 'City', 'Zipcode', 'Country']
data.columns.to_list()
data.drop(unneeded_columns, axis=1, inplace = True)

In [15]:
def get_years(df, column):
    return df[column].apply(lambda date: date[0:4])

def get_months(df, column):
    return df[column].apply(lambda date: date[5:7])
data.columns.to_list()

['Severity',
 'Start_Time',
 'End_Time',
 'Start_Lat',
 'Start_Lng',
 'Distance(mi)',
 'Side',
 'County',
 'State',
 'Timezone',
 'Airport_Code',
 'Weather_Timestamp',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Weather_Condition',
 'Amenity',
 'Bump',
 'Crossing',
 'Give_Way',
 'Junction',
 'No_Exit',
 'Railway',
 'Roundabout',
 'Station',
 'Stop',
 'Traffic_Calming',
 'Traffic_Signal',
 'Turning_Loop',
 'Sunrise_Sunset',
 'Civil_Twilight',
 'Nautical_Twilight',
 'Astronomical_Twilight']

In [16]:
data['Start_Time_Month'] = get_months(data, 'Start_Time')
data['Start_Time_Year'] = get_years(data, 'Start_Time')

data['End_Time_Month'] = get_months(data, 'End_Time')
data['End_Time_Year'] = get_years(data, 'End_Time')

data['Weather_Timestamp_Month'] = get_months(data, 'Weather_Timestamp')
data['Weather_Timestamp_Year'] = get_years(data, 'Weather_Timestamp')


data = data.drop(['Start_Time', 'End_Time', 'Weather_Timestamp'], axis=1)

In [17]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [18]:
columns= ['Side', 'County', 'State', 'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition']

In [19]:
for i in columns:
    le.fit(data[i])
    data[i] = le.transform(data[i])

In [20]:
def get_binary_column(df, column):
        return data[column].apply(lambda x: 1 if x == 'Day' else 0)

In [21]:
data['Sunrise_Sunset'] = get_binary_column(data, 'Sunrise_Sunset')
data['Civil_Twilight'] = get_binary_column(data, 'Civil_Twilight')
data['Nautical_Twilight'] = get_binary_column(data, 'Nautical_Twilight')
data['Astronomical_Twilight'] = get_binary_column(data, 'Astronomical_Twilight')

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2571316 entries, 0 to 2906609
Data columns (total 39 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Severity                 int64  
 1   Start_Lat                float64
 2   Start_Lng                float64
 3   Distance(mi)             float64
 4   Side                     int64  
 5   County                   int64  
 6   State                    int64  
 7   Timezone                 int64  
 8   Airport_Code             int64  
 9   Temperature(F)           float64
 10  Humidity(%)              float64
 11  Pressure(in)             float64
 12  Visibility(mi)           float64
 13  Wind_Direction           int64  
 14  Wind_Speed(mph)          float64
 15  Weather_Condition        int64  
 16  Amenity                  bool   
 17  Bump                     bool   
 18  Crossing                 bool   
 19  Give_Way                 bool   
 20  Junction                 bool   
 21  No_Exit 

In [23]:
data_y = data['Severity']
data_X = data.drop('Severity', axis=1)
#data_X = X.astype(np.float)

In [24]:
scaler = StandardScaler()

data_X = scaler.fit_transform(data_X)

In [25]:
import sklearn

X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, train_size=0.7, random_state=100)

DECISION TREE

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
clf = DecisionTreeClassifier(random_state=42)

# Train & Test
clf.fit(X_train, y_train)
y_pred= clf.predict(X_test)

In [27]:
# Print accuracy_entropy
print('Decision Tree accuracy_score: {:.3f}.'.format(accuracy_score(y_test, y_pred)))

Decision Tree accuracy_score: 0.812.


NN

In [28]:
data.shape

(2571316, 39)

In [29]:
y_train = y_train -1
y_test = y_test - 1

In [30]:
from keras.models import Sequential,Input,Model
inputs = tf.keras.Input(shape=(data_X.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(4, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

batch_size = 32
epochs = 20

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
        )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [31]:
model.evaluate(X_test,y_test)



[0.4798257648944855, 0.7918498516082764]

SVM

In [34]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
import warnings

init_notebook_mode(connected=True)

warnings.filterwarnings("ignore")

%matplotlib inline

In [35]:
data = pd.read_csv('Accidents_data_file.csv')

In [36]:
# Due to resources constraint, we had to limit the subset size for SVM modelling. So, 
# the result for SVM might vary here as compared to what was presented in presnetation
data = data[:50000]

In [37]:
null_columns = ['End_Lat', 'End_Lng', 'Number', 'Wind_Chill(F)', 'Precipitation(in)']

data.drop(null_columns, axis=1, inplace = True)

In [38]:
data.dropna(axis=0, inplace = True)

In [39]:
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'Airport_Code': 376,
 'Astronomical_Twilight': 2,
 'City': 516,
 'Civil_Twilight': 2,
 'Country': 1,
 'County': 255,
 'Description': 889,
 'End_Time': 895,
 'ID': 895,
 'Nautical_Twilight': 2,
 'Side': 2,
 'Start_Time': 895,
 'State': 39,
 'Street': 712,
 'Sunrise_Sunset': 2,
 'Timezone': 4,
 'Weather_Condition': 33,
 'Weather_Timestamp': 893,
 'Wind_Direction': 23,
 'Zipcode': 826}

In [40]:
unneeded_columns = ['ID', 'Description', 'Street', 'City', 'Zipcode', 'Country']
data.columns.to_list()
data.drop(unneeded_columns, axis=1, inplace = True)

In [41]:
def get_years(df, column):
    return df[column].apply(lambda date: date[0:4])

def get_months(df, column):
    return df[column].apply(lambda date: date[5:7])
data.columns.to_list()

['Severity',
 'Start_Time',
 'End_Time',
 'Start_Lat',
 'Start_Lng',
 'Distance(mi)',
 'Side',
 'County',
 'State',
 'Timezone',
 'Airport_Code',
 'Weather_Timestamp',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Weather_Condition',
 'Amenity',
 'Bump',
 'Crossing',
 'Give_Way',
 'Junction',
 'No_Exit',
 'Railway',
 'Roundabout',
 'Station',
 'Stop',
 'Traffic_Calming',
 'Traffic_Signal',
 'Turning_Loop',
 'Sunrise_Sunset',
 'Civil_Twilight',
 'Nautical_Twilight',
 'Astronomical_Twilight']

In [42]:
data['Start_Time_Month'] = get_months(data, 'Start_Time')
data['Start_Time_Year'] = get_years(data, 'Start_Time')

data['End_Time_Month'] = get_months(data, 'End_Time')
data['End_Time_Year'] = get_years(data, 'End_Time')

data['Weather_Timestamp_Month'] = get_months(data, 'Weather_Timestamp')
data['Weather_Timestamp_Year'] = get_years(data, 'Weather_Timestamp')

data = data.drop(['Start_Time', 'End_Time', 'Weather_Timestamp'], axis=1)

In [43]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [44]:
columns= ['Side', 'County', 'State', 'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition']

In [45]:
for i in columns:
    le.fit(data[i])
    data[i] = le.transform(data[i])

In [46]:
def get_binary_column(df, column):
        return data[column].apply(lambda x: 1 if x == 'Day' else 0)

In [47]:
data['Sunrise_Sunset'] = get_binary_column(data, 'Sunrise_Sunset')
data['Civil_Twilight'] = get_binary_column(data, 'Civil_Twilight')
data['Nautical_Twilight'] = get_binary_column(data, 'Nautical_Twilight')
data['Astronomical_Twilight'] = get_binary_column(data, 'Astronomical_Twilight')

In [48]:
data_y = data['Severity']
data_X = data.drop('Severity', axis=1)
#data_X = X.astype(np.float)

In [49]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf

In [50]:
scaler = StandardScaler()

data_X = scaler.fit_transform(data_X)

In [51]:
import sklearn

X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, train_size=0.7, random_state=100)

In [52]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [53]:
from sklearn.metrics import accuracy_score
y_pred= clf.predict(X_test)
print('Decision Tree accuracy_score: {:.3f}.'.format(accuracy_score(y_test, y_pred)))

Decision Tree accuracy_score: 0.777.
