In [28]:
# Importing required packages
import pandas as pd
import numpy as np
import scipy.stats as stat
import matplotlib.pyplot as plt
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading data
data = pd.read_csv("Accident_train.csv",header=0)

In [3]:
# Cleaning data
data.Hour_of_Collision = data.Hour_of_Collision.replace('NA',np.nan)
data.Junction_Detail = data.Junction_Detail.replace('NA',np.nan)
data.Junction_Control = data.Junction_Control.replace('NA',np.nan)
data.Ped_Crossing_HC = data.Ped_Crossing_HC.replace('NA',np.nan)
data.Ped_Crossing_PC = data.Ped_Crossing_PC.replace('NA',np.nan)
data.Road_Surface_Conditions=data.Road_Surface_Conditions.replace('NA',np.nan)
data.Special_Conditions_at_Site=data.Special_Conditions_at_Site.replace('NA',np.nan)
data.fillna(data.mode().iloc[0], inplace = True)
data.head()

Unnamed: 0,Collision_Ref_No,Policing_Area,Collision_Severity,Weekday_of_Collision,Day_of_Collision,Month_of_Collision,Hour_of_Collision,Carriageway_Type,Speed_Limit,Junction_Detail,Junction_Control,Ped_Crossing_HC,Ped_Crossing_PC,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site
0,3518,CREA,3,MON,4,8,14.0,13,60,1.0,1.0,1.0,1.0,2,9,9.0,1.0
1,10557,BELC,3,SAT,8,8,17.0,11,50,12.0,7.0,1.0,1.0,4,3,1.0,1.0
2,5002,LISB,3,WED,5,11,17.0,1,60,12.0,7.0,1.0,1.0,2,2,2.0,1.0
3,11714,BELC,3,SUN,18,10,16.0,12,70,6.0,7.0,1.0,1.0,1,3,1.0,1.0
4,12416,MIDU,3,MON,23,11,9.0,13,60,6.0,7.0,1.0,1.0,2,3,1.0,1.0


In [4]:
data_df=data.copy()

In [5]:
# Converting column Policing_Area using one-hot encoder
data_df1=pd.get_dummies(data_df['Policing_Area'])

In [6]:
# Converting column Weekday_of_Collision using one-hot encoder
data_df2=pd.get_dummies(data_df['Weekday_of_Collision'])

In [7]:
# Concatenating one-hot columns with data
train_df=pd.concat([data,data_df1,data_df2],axis=1)

In [8]:
# Remove Columns which have been one-hot encoded and Collision_Ref_No
train_df=train_df.drop(['Policing_Area','Weekday_of_Collision','Collision_Ref_No'],axis=1)

In [9]:
# Clean NA's if any
train_df=train_df.dropna()

In [10]:
# Setting the data for input to train_test_split
X=train_df.drop(['Collision_Severity'],axis=1)
y=train_df['Collision_Severity']

In [11]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = train_test_split(X, y, test_size = 0.1, random_state = 0)

In [12]:
# Fit model no training data
model = XGBClassifier()
model.fit(train_data, train_target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [13]:
# Make predictions for test data
y_pred = model.predict(test_data)
predictions = [round(value) for value in test_target]

In [14]:
# Count of number of Collision Severity
print(np.unique(predictions, return_counts = True))

(array([1, 2, 3]), array([  9,  78, 798]))


In [15]:
# Evaluate predictions
accuracy = accuracy_score(test_target, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 100.00%


In [16]:
# Input test data from file to predict
data_df = pd.read_csv("Accident_test.csv",header=0)

In [17]:
# Data cleaning
data_df.Hour_of_Collision = data_df.Hour_of_Collision.replace('NA',np.nan)
data_df.Junction_Detail = data_df.Junction_Detail.replace('NA',np.nan)
data_df.Junction_Control = data_df.Junction_Control.replace('NA',np.nan)
data_df.Ped_Crossing_HC = data_df.Ped_Crossing_HC.replace('NA',np.nan)
data_df.Ped_Crossing_PC = data_df.Ped_Crossing_PC.replace('NA',np.nan)
data_df.Road_Surface_Conditions=data_df.Road_Surface_Conditions.replace('NA',np.nan)
data_df.Special_Conditions_at_Site=data_df.Special_Conditions_at_Site.replace('NA',np.nan)
data_df.fillna(data_df.mode().iloc[0], inplace = True)
data_df.head()

Unnamed: 0,Collision_Ref_No,Policing_Area,Collision_Severity,Weekday_of_Collision,Day_of_Collision,Month_of_Collision,Hour_of_Collision,Carriageway_Type,Speed_Limit,Junction_Detail,Junction_Control,Ped_Crossing_HC,Ped_Crossing_PC,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site
0,812,DAST,Predict,MON,17,2,21.0,13,60,10.0,7.0,1.0,1.0,5.0,9.0,6.0,1.0
1,7159,ARBC,Predict,TUE,13,1,8.0,13,60,10.0,7.0,1.0,1.0,7.0,1.0,1.0,1.0
2,11833,NEMD,Predict,SAT,24,10,18.0,13,30,6.0,7.0,1.0,1.0,6.0,1.0,1.0,1.0
3,9142,MEAN,Predict,SUN,10,5,13.0,11,70,12.0,7.0,1.0,1.0,1.0,9.0,1.0,1.0
4,378,FOYL,Predict,THU,23,1,10.0,13,60,1.0,1.0,1.0,1.0,2.0,3.0,2.0,1.0


In [18]:
# Converting column Policing_Area using one-hot encoder
data_df3=pd.get_dummies(data_df['Policing_Area'])

In [19]:
# Converting column Weekday_of_Collision using one-hot encoder
data_df4=pd.get_dummies(data_df['Weekday_of_Collision'])

In [20]:
# Concatenating one-hot columns with data
test_df=pd.concat([data_df,data_df3,data_df4],axis=1)

In [21]:
# Remove Columns which have been one-hot encoded and Collision_Ref_No
test_df=test_df.drop(['Policing_Area','Weekday_of_Collision','Collision_Ref_No'],axis=1)

In [22]:
# Clean NA's if any
test_df=test_df.dropna()

In [23]:
# Setting the data for input to model prediction
X_test=test_df.drop(['Collision_Severity'],axis=1)
y_test=test_df['Collision_Severity']

In [24]:
# Predicted Collision_Severity
y_hat_test=model.predict(X_test)

In [25]:
np.unique(y_hat_test, return_counts=True)

(array([2, 3]), array([  32, 1517]))

In [26]:
test_df['Collision_Severity']=y_hat_test

In [27]:
# Passing Collision_Severity data into csv file
test_df['Collision_Severity'].to_csv('submission_final_grid_xgb.csv')