In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',None)
# pd.set_option('display.max_rows',None)

In [2]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

In [3]:
data_train_copy = data_train.copy(deep = True)
data_test_copy = data_test.copy(deep = True)

In [4]:
data_train.Severity.replace({'Minor_Damage_And_Injuries' : 1,
                             'Significant_Damage_And_Serious_Injuries' :2,
                             'Significant_Damage_And_Fatalities' :3,
                             'Highly_Fatal_And_Damaging' : 4 }, inplace=True) # add main

In [5]:
def clean(dataset):
    data = dataset.copy(deep = True)
    data = data.drop(columns= ['Accident_ID'])   #add main

    data['Accident_Type_Code'] = data['Accident_Type_Code'].astype('object')
    data['Violations'] = data['Violations'].astype('object')
    data = pd.get_dummies(data )   ## add main
    
    return data

In [6]:
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier
from xgboost import XGBRFClassifier,XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc,recall_score,classification_report

In [7]:
train_data = clean(data_train)

In [8]:
X = train_data.drop(columns=['Severity'])
y = train_data.Severity

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X,y, random_state = 6, train_size = .8, stratify = y)

In [10]:
model = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',           ## add main
                       max_depth=11, max_features=19, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=0.05,
                       min_samples_leaf=5, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=5935, splitter='best')


model.fit(X_train, y_train)

y_predict = model.predict(X_val)
print(confusion_matrix(y_val,y_predict)) 
print(accuracy_score(y_val,y_predict))
print(classification_report(y_val,y_predict))

[[478  11   6  10]
 [ 24 505   3  14]
 [  4   5 324   6]
 [ 15  11  18 566]]
0.9365
              precision    recall  f1-score   support

           1       0.92      0.95      0.93       505
           2       0.95      0.92      0.94       546
           3       0.92      0.96      0.94       339
           4       0.95      0.93      0.94       610

    accuracy                           0.94      2000
   macro avg       0.93      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000



In [11]:
model =RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,       ##add main
                       criterion='entropy', max_depth=12, max_features=16,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=61,
                       verbose=0, warm_start=False)

model.fit(X_train, y_train)

y_predict = model.predict(X_val)
print(confusion_matrix(y_val,y_predict)) 
print(accuracy_score(y_val,y_predict))
print(classification_report(y_val,y_predict))


[[480   9   3  13]
 [ 19 513   1  13]
 [  5   4 328   2]
 [  3   5  19 583]]
0.952
              precision    recall  f1-score   support

           1       0.95      0.95      0.95       505
           2       0.97      0.94      0.95       546
           3       0.93      0.97      0.95       339
           4       0.95      0.96      0.95       610

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [12]:
model =XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,            ## add main
              colsample_bynode=1, colsample_bytree=0.9, gamma=0.1,
              learning_rate=0.5, max_delta_step=0, max_depth=10,
              min_child_weight=2, missing=None, n_estimators=50, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=6,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7, verbosity=0)

model.fit(X_train, y_train)

y_predict = model.predict(X_val)
print(confusion_matrix(y_val,y_predict)) 
print(accuracy_score(y_val,y_predict))
print(classification_report(y_val,y_predict))



[[488  10   1   6]
 [ 18 518   4   6]
 [  4   2 329   4]
 [  6   3  16 585]]
0.96
              precision    recall  f1-score   support

           1       0.95      0.97      0.96       505
           2       0.97      0.95      0.96       546
           3       0.94      0.97      0.96       339
           4       0.97      0.96      0.97       610

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [13]:
X_test = clean(data_test)

In [14]:
y_test = model.predict(X_test)

In [15]:
data_test_copy['Severity'] = y_test

In [16]:
data_test_copy.Severity.replace({1:'Minor_Damage_And_Injuries' ,
                             2:'Significant_Damage_And_Serious_Injuries',
                             3:'Significant_Damage_And_Fatalities',
                             4:'Highly_Fatal_And_Damaging' },inplace = True)

In [17]:
data_test_copy[['Accident_ID','Severity']].to_csv('rgd_xgb_air_accident.csv')