In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [13]:
import warnings
# Suppress every warning for the rest of the session. This also hides any
# warnings raised by pandas / scikit-learn in the cells below.
warnings.filterwarnings('ignore')

In [35]:
# Read the two input tables from CSV files located relative to the working
# directory. `data` is the frame the models are fitted on below; `test` is the
# frame that predictions are generated for.
data = pd.read_csv('train_mod.csv')
test = pd.read_csv('test_mod.csv')
# Label column, kept as a one-element list so it can be concatenated with
# other column lists and used for DataFrame column selection.
target_col = ['Severity']

In [15]:
# Encode each distinct Severity label as an integer code starting at 1, in
# order of first appearance. `label_dict` (label -> code) is kept so that
# predictions can be decoded back to labels later on.
# NOTE: run this cell once per freshly loaded `data`; running it a second time
# would rebuild `label_dict` from the already-encoded integers.
label_dict = {
    label: code
    for code, label in enumerate(data['Severity'].unique(), start=1)
}

# Series.map with a dict does the lookup directly, no per-row lambda needed.
data['Severity'] = data['Severity'].map(label_dict)

In [19]:
def remove_feature(drop_col, target_col, id_col, dataframe):
    """Return the columns of ``dataframe`` that are left after exclusions.

    Despite its name, this function does not modify ``dataframe``; it only
    builds a list of column labels.

    Parameters
    ----------
    drop_col, target_col, id_col : iterable of column labels
        Columns to leave out of the result.
    dataframe : pandas.DataFrame
        Frame whose ``columns`` are filtered.

    Returns
    -------
    list
        Labels from ``dataframe.columns``, in their original order, that do
        not appear in any of the three exclusion iterables.
    """
    # Build the exclusion collection once instead of re-concatenating the
    # three lists for every column that is checked.
    excluded = (*drop_col, *target_col, *id_col)
    return [col for col in dataframe.columns if col not in excluded]


def rfe_results(model, df, id_col, drop_col, target_col):
    """Cross-validate ``model`` with ``drop_col`` left out and print the score.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    df : pandas.DataFrame
        Frame holding the id, feature and target columns.
    id_col, drop_col, target_col : list of column labels
        Identifier columns, columns to eliminate for this run, and the
        target column(s).

    Returns
    -------
    float
        Mean of the 5-fold cross-validation scores (also printed).
    """
    Y = df[target_col]
    # Selecting with a one-element list yields a one-column DataFrame, which
    # scikit-learn estimators accept only with a column-vector warning.
    # Hand over a 1-D Series in that case.
    if getattr(Y, 'ndim', 1) == 2 and Y.shape[1] == 1:
        Y = Y.iloc[:, 0]

    keep_cols = remove_feature(drop_col, target_col, id_col, df)
    X = df[keep_cols]

    scores = cross_val_score(model, X, Y, cv=5)
    mean_score = scores.sum() / len(scores)
    print('Eliminating column ',drop_col,' Score is: ',mean_score)
    return mean_score

def rfe_report(model, data, id_col=None, target_col=None):
    """Print a leave-one-feature-out cross-validation report for ``model``.

    For every feature column the model is cross-validated with that single
    column removed (via ``rfe_results``); finally it is cross-validated on
    all feature columns as a baseline.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    data : pandas.DataFrame
        Frame containing id, feature and target columns.
    id_col : list of column labels, optional
        Identifier columns excluded from the features.
        Defaults to ``['Accident_ID']``.
    target_col : list of column labels, optional
        Target column(s). Defaults to ``['Severity']``.

    Returns
    -------
    list
        The feature columns of ``data`` (everything except id and target).
    """
    id_col = ['Accident_ID'] if id_col is None else list(id_col)
    target_col = ['Severity'] if target_col is None else list(target_col)

    # Computed once rather than for every column in the comprehension.
    non_features = id_col + target_col
    all_cols = [i for i in data.columns if i not in non_features]

    for column in all_cols:
        drop_cols = [column]
        rfe_results(model, data, id_col, drop_cols, target_col)

    # A one-column DataFrame target triggers scikit-learn's column-vector
    # warning; pass a 1-D Series instead.
    target = data[target_col]
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.iloc[:, 0]

    scores = cross_val_score(model, data[all_cols], target, cv = 5)
    print('Not Eliminating any column, score is: ',scores.sum()/len(scores))
    return all_cols

## AdaBoost

In [20]:
# Independent copies for the AdaBoost section, so `data` and `test` stay
# unmodified.
data_ab = data.copy()
test_ab = test.copy()

In [22]:
# AdaBoost with 200 estimators and a fixed seed; print the cross-validated
# score obtained when each feature is left out in turn.
ab = AdaBoostClassifier(n_estimators= 200, learning_rate=1.0, random_state=123)

all_cols = rfe_report(ab,data_ab)

Eliminating column  ['Safety_Score']  Score is:  0.5708
Eliminating column  ['Days_Since_Inspection']  Score is:  0.5707000000000001
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.5662999999999999
Eliminating column  ['Control_Metric']  Score is:  0.6237
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.5677
Eliminating column  ['Cabin_Temperature']  Score is:  0.5687
Eliminating column  ['Accident_Type_Code']  Score is:  0.562
Eliminating column  ['Max_Elevation']  Score is:  0.57
Eliminating column  ['Violations']  Score is:  0.5699
Eliminating column  ['Adverse_Weather_Metric']  Score is:  0.5650999999999999
Eliminating column  ['ftr1']  Score is:  0.5556
Eliminating column  ['ftr2']  Score is:  0.5698000000000001
Eliminating column  ['ftr3']  Score is:  0.5600999999999999
Eliminating column  ['DSI_Segmented']  Score is:  0.5683999999999999
Not Eliminating any column, score is:  0.5683999999999999


## Gradient Boost

In [30]:
# Independent copies for the Gradient Boost section; columns are dropped from
# `data_gb` in the cells below without touching `data`.
data_gb = data.copy()
test_gb = test.copy()

In [23]:
# Instantiate with default hyperparameters and show them as the cell output.
gb = GradientBoostingClassifier()
gb

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [26]:
# Depth-2 trees for the feature-elimination runs. No random_state is set
# here, so repeated runs are not guaranteed to print identical scores.
gb = GradientBoostingClassifier(max_depth = 2)

all_cols = rfe_report(gb,data_gb)

Eliminating column  ['Safety_Score']  Score is:  0.6954
Eliminating column  ['Days_Since_Inspection']  Score is:  0.7545999999999999
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.7723000000000001
Eliminating column  ['Control_Metric']  Score is:  0.7212
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.7703
Eliminating column  ['Cabin_Temperature']  Score is:  0.7722
Eliminating column  ['Accident_Type_Code']  Score is:  0.7743
Eliminating column  ['Max_Elevation']  Score is:  0.7721
Eliminating column  ['Violations']  Score is:  0.7727
Eliminating column  ['Adverse_Weather_Metric']  Score is:  0.7744
Eliminating column  ['ftr1']  Score is:  0.7499
Eliminating column  ['ftr2']  Score is:  0.7713
Eliminating column  ['ftr3']  Score is:  0.7699
Eliminating column  ['DSI_Segmented']  Score is:  0.7727
Not Eliminating any column, score is:  0.7727


In [31]:
# Leaving out 'Adverse_Weather_Metric' gave the highest score in the recorded
# report above (0.7744 vs. 0.7727 with every column), so remove it and
# re-run the report on the reduced frame.
data_gb = data_gb.drop(columns=['Adverse_Weather_Metric'])
all_cols.remove('Adverse_Weather_Metric')

all_cols = rfe_report(gb, data_gb)

Eliminating column  ['Safety_Score']  Score is:  0.6968
Eliminating column  ['Days_Since_Inspection']  Score is:  0.7552
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.7748999999999999
Eliminating column  ['Control_Metric']  Score is:  0.7207
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.7745
Eliminating column  ['Cabin_Temperature']  Score is:  0.7745999999999998
Eliminating column  ['Accident_Type_Code']  Score is:  0.7870999999999999
Eliminating column  ['Max_Elevation']  Score is:  0.7746
Eliminating column  ['Violations']  Score is:  0.7744
Eliminating column  ['ftr1']  Score is:  0.7484999999999999
Eliminating column  ['ftr2']  Score is:  0.7732
Eliminating column  ['ftr3']  Score is:  0.7680999999999999
Eliminating column  ['DSI_Segmented']  Score is:  0.7744
Not Eliminating any column, score is:  0.7744


In [32]:
# Leaving out 'Accident_Type_Code' gave the highest score in the recorded
# report above (0.7871 vs. 0.7744 baseline), so remove it and re-evaluate.
data_gb = data_gb.drop(columns=['Accident_Type_Code'])
all_cols.remove('Accident_Type_Code')

all_cols = rfe_report(gb, data_gb)

Eliminating column  ['Safety_Score']  Score is:  0.693
Eliminating column  ['Days_Since_Inspection']  Score is:  0.7585
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.7861
Eliminating column  ['Control_Metric']  Score is:  0.7414
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.7884
Eliminating column  ['Cabin_Temperature']  Score is:  0.7889999999999999
Eliminating column  ['Max_Elevation']  Score is:  0.7886000000000001
Eliminating column  ['Violations']  Score is:  0.7870999999999999
Eliminating column  ['ftr1']  Score is:  0.748
Eliminating column  ['ftr2']  Score is:  0.7878000000000001
Eliminating column  ['ftr3']  Score is:  0.8099999999999999
Eliminating column  ['DSI_Segmented']  Score is:  0.7870999999999999
Not Eliminating any column, score is:  0.7870999999999999


In [33]:
# Leaving out 'ftr3' gave the highest score in the recorded report above
# (0.8100 vs. 0.7871 baseline), so remove it from the frame and feature list.
data_gb = data_gb.drop(columns=['ftr3'])
all_cols.remove('ftr3')

In [51]:
# Fresh depth-2 model; re-run the leave-one-out report on the reduced frame.
gb = GradientBoostingClassifier(max_depth = 2)
all_cols= rfe_report(gb, data_gb)

Eliminating column  ['Safety_Score']  Score is:  0.7339
Eliminating column  ['Days_Since_Inspection']  Score is:  0.7896
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.8108000000000001
Eliminating column  ['Control_Metric']  Score is:  0.7589
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.8116000000000001
Eliminating column  ['Cabin_Temperature']  Score is:  0.8099999999999999
Eliminating column  ['Max_Elevation']  Score is:  0.8141999999999999
Eliminating column  ['Violations']  Score is:  0.8089999999999999
Eliminating column  ['ftr1']  Score is:  0.7813000000000001
Eliminating column  ['ftr2']  Score is:  0.8119
Eliminating column  ['DSI_Segmented']  Score is:  0.8099999999999999
Not Eliminating any column, score is:  0.8099000000000001


In [52]:
# Leaving out 'Max_Elevation' gave the highest score in the recorded report
# above (0.8142 vs. 0.8099 baseline), so remove it and re-evaluate.
data_gb = data_gb.drop(columns=['Max_Elevation'])
all_cols.remove('Max_Elevation')

all_cols = rfe_report(gb, data_gb)

Eliminating column  ['Safety_Score']  Score is:  0.7383
Eliminating column  ['Days_Since_Inspection']  Score is:  0.7907000000000001
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.8115
Eliminating column  ['Control_Metric']  Score is:  0.7623
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.8165000000000001
Eliminating column  ['Cabin_Temperature']  Score is:  0.8119
Eliminating column  ['Violations']  Score is:  0.8128
Eliminating column  ['ftr1']  Score is:  0.7888
Eliminating column  ['ftr2']  Score is:  0.8093
Eliminating column  ['DSI_Segmented']  Score is:  0.8143
Not Eliminating any column, score is:  0.8140000000000001


In [68]:
# Leaving out 'Turbulence_In_gforces' gave the highest score in the recorded
# report above (0.8165 vs. 0.8140 baseline); remove it and re-evaluate with a
# fresh depth-2 model.
gb = GradientBoostingClassifier(max_depth=2)
data_gb = data_gb.drop(columns=['Turbulence_In_gforces'])
all_cols.remove('Turbulence_In_gforces')

all_cols = rfe_report(gb, data_gb)

Eliminating column  ['Safety_Score']  Score is:  0.7321
Eliminating column  ['Days_Since_Inspection']  Score is:  0.7908000000000002
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.8160000000000001
Eliminating column  ['Control_Metric']  Score is:  0.7577999999999999
Eliminating column  ['Cabin_Temperature']  Score is:  0.8177999999999999
Eliminating column  ['Violations']  Score is:  0.8162
Eliminating column  ['ftr1']  Score is:  0.7907
Eliminating column  ['ftr2']  Score is:  0.8151999999999999
Eliminating column  ['DSI_Segmented']  Score is:  0.8165000000000001
Not Eliminating any column, score is:  0.8165000000000001


In [None]:
# NOTE(review): this never-executed cell repeats the previous one. With the
# column already gone, the original drop()/remove() calls raise on
# "Restart & Run All". The removal is therefore made idempotent here; consider
# deleting the cell altogether.
gb = GradientBoostingClassifier(max_depth = 2)
# errors='ignore' turns dropping an already-absent column into a no-op.
data_gb = data_gb.drop(columns=['Turbulence_In_gforces'], errors='ignore')
if 'Turbulence_In_gforces' in all_cols:
    all_cols.remove('Turbulence_In_gforces')

all_cols= rfe_report(gb, data_gb)

In [72]:
# Final Gradient Boost model, fitted on the remaining feature columns.
gb = GradientBoostingClassifier(random_state=123, n_estimators=1000, max_depth = 4, max_features= 3)
print(gb)
# Select the target by its label (not by a one-element list) so that y is a
# 1-D Series; a one-column DataFrame makes scikit-learn emit a column-vector
# warning on fit. The rows of the split are unaffected by this change.
train_X, test_X, train_y, test_y = train_test_split(
    data_gb[all_cols], data_gb[target_col[0]], random_state=111
)
gb.fit(train_X, train_y)
# "Test" below is the hold-out part of `data_gb`, not the separate `test` file.
print('Train Accuracy ',gb.score(train_X,train_y)*100,'%')
print('Test Accuracy ',gb.score(test_X, test_y)*100,'%')

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=4,
                           max_features=3, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
Train Accuracy  100.0 %
Test Accuracy  95.72 %


In [70]:
# Predict the integer severity codes for the test frame, using the same
# feature columns the model was fitted on.
test_gb['SeverityCode'] = gb.predict(test_gb[all_cols])

def get_key(code):
    """Translate an integer code back to its label using ``label_dict``.

    Returns the first key of ``label_dict`` whose value equals ``code``, or
    the string ``'False'`` when no value matches.
    """
    matches = (label for label, encoded in label_dict.items() if code == encoded)
    return next(matches, 'False')
# Decode the predicted codes back into the original severity labels.
test_gb['Severity'] = test_gb['SeverityCode'].apply(get_key)

# Keep only id and decoded label, indexed by 'Accident_ID' so the id is
# written as the first CSV column.
test_save = test_gb[['Accident_ID', 'Severity']].set_index('Accident_ID')

test_save.to_csv('final_submit_gb.csv')

## Random Forest

In [27]:
# Independent copies for the Random Forest section.
data_rf = data.copy()
# The Random Forest prediction cells read `test_rf`; this copy was previously
# assigned to `test_gb` (copy-paste slip), which left `test_rf` undefined.
test_rf = test.copy()

In [77]:
# Seeded 200-tree forest using all CPU cores (n_jobs=-1); print the
# cross-validated score obtained when each feature is left out in turn.
rf = RandomForestClassifier(random_state=123, n_estimators=200, n_jobs=-1) 

all_cols= rfe_report(rf, data_rf)

Eliminating column  ['Safety_Score']  Score is:  0.8135
Eliminating column  ['Days_Since_Inspection']  Score is:  0.8852
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.9379
Eliminating column  ['Control_Metric']  Score is:  0.8443999999999999
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.9389
Eliminating column  ['Cabin_Temperature']  Score is:  0.9378
Eliminating column  ['Accident_Type_Code']  Score is:  0.9459
Eliminating column  ['Max_Elevation']  Score is:  0.9385999999999999
Eliminating column  ['Violations']  Score is:  0.9396000000000001
Eliminating column  ['Adverse_Weather_Metric']  Score is:  0.9469999999999998
Eliminating column  ['ftr1']  Score is:  0.9161000000000001
Eliminating column  ['ftr2']  Score is:  0.9404
Eliminating column  ['ftr3']  Score is:  0.9475
Eliminating column  ['DSI_Segmented']  Score is:  0.9277
Not Eliminating any column, score is:  0.9322000000000001


In [79]:
# Leaving out 'Adverse_Weather_Metric' raised the recorded score above
# (0.9470 vs. 0.9322 baseline), so remove it and re-evaluate.
data_rf = data_rf.drop(columns=['Adverse_Weather_Metric'])
all_cols.remove('Adverse_Weather_Metric')

all_cols = rfe_report(rf, data_rf)

Eliminating column  ['Safety_Score']  Score is:  0.861
Eliminating column  ['Days_Since_Inspection']  Score is:  0.9087
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.9483
Eliminating column  ['Control_Metric']  Score is:  0.8614
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.9483
Eliminating column  ['Cabin_Temperature']  Score is:  0.9490999999999999
Eliminating column  ['Accident_Type_Code']  Score is:  0.954
Eliminating column  ['Max_Elevation']  Score is:  0.9488
Eliminating column  ['Violations']  Score is:  0.9481999999999999
Eliminating column  ['ftr1']  Score is:  0.9324999999999999
Eliminating column  ['ftr2']  Score is:  0.9501
Eliminating column  ['ftr3']  Score is:  0.9551999999999999
Eliminating column  ['DSI_Segmented']  Score is:  0.9446
Not Eliminating any column, score is:  0.9469999999999998


In [82]:
# Leaving out 'ftr3' gave the highest score in the recorded report above
# (0.9552 vs. 0.9470 baseline), so remove it and re-evaluate.
data_rf = data_rf.drop(columns=['ftr3'])
all_cols.remove('ftr3')

all_cols = rfe_report(rf, data_rf)

Eliminating column  ['Safety_Score']  Score is:  0.9075
Eliminating column  ['Days_Since_Inspection']  Score is:  0.9350000000000002
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.9565999999999999
Eliminating column  ['Control_Metric']  Score is:  0.8804000000000001
Eliminating column  ['Turbulence_In_gforces']  Score is:  0.9576
Eliminating column  ['Cabin_Temperature']  Score is:  0.9575000000000001
Eliminating column  ['Accident_Type_Code']  Score is:  0.9565999999999999
Eliminating column  ['Max_Elevation']  Score is:  0.9587
Eliminating column  ['Violations']  Score is:  0.9577
Eliminating column  ['ftr1']  Score is:  0.9496
Eliminating column  ['ftr2']  Score is:  0.958
Eliminating column  ['DSI_Segmented']  Score is:  0.9555
Not Eliminating any column, score is:  0.9551999999999999


In [84]:
# Leaving out 'Turbulence_In_gforces' raised the recorded score above
# (0.9576 vs. 0.9552 baseline), so remove it and re-evaluate.
data_rf = data_rf.drop(columns=['Turbulence_In_gforces'])
all_cols.remove('Turbulence_In_gforces')

all_cols = rfe_report(rf, data_rf)

Eliminating column  ['Safety_Score']  Score is:  0.9186
Eliminating column  ['Days_Since_Inspection']  Score is:  0.9414999999999999
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.9577
Eliminating column  ['Control_Metric']  Score is:  0.8831
Eliminating column  ['Cabin_Temperature']  Score is:  0.9596
Eliminating column  ['Accident_Type_Code']  Score is:  0.9559000000000001
Eliminating column  ['Max_Elevation']  Score is:  0.9583
Eliminating column  ['Violations']  Score is:  0.9591000000000001
Eliminating column  ['ftr1']  Score is:  0.9538
Eliminating column  ['ftr2']  Score is:  0.9611000000000001
Eliminating column  ['DSI_Segmented']  Score is:  0.9567
Not Eliminating any column, score is:  0.9576


In [89]:
# Remove 'Max_Elevation' (recorded score 0.9583 when left out vs. 0.9576
# baseline). The drop has to start from `data_rf`: dropping from the full
# `data` frame would bring back every column eliminated in the cells above.
data_rf = data_rf.drop(columns=['Max_Elevation'])
all_cols.remove('Max_Elevation')

all_cols= rfe_report(rf, data_rf)

Eliminating column  ['Safety_Score']  Score is:  0.9209999999999999
Eliminating column  ['Days_Since_Inspection']  Score is:  0.9456
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.9611999999999998
Eliminating column  ['Control_Metric']  Score is:  0.8924
Eliminating column  ['Cabin_Temperature']  Score is:  0.9639999999999999
Eliminating column  ['Accident_Type_Code']  Score is:  0.9574999999999999
Eliminating column  ['Violations']  Score is:  0.961
Eliminating column  ['ftr1']  Score is:  0.9559
Eliminating column  ['ftr2']  Score is:  0.9620999999999998
Eliminating column  ['DSI_Segmented']  Score is:  0.9593999999999999
Not Eliminating any column, score is:  0.9583


In [92]:
# Leaving out 'Cabin_Temperature' gave the highest score in the recorded
# report above (0.9640 vs. 0.9583 baseline), so remove it and re-evaluate.
data_rf = data_rf.drop(columns=['Cabin_Temperature'])
all_cols.remove('Cabin_Temperature')

all_cols = rfe_report(rf, data_rf)

Eliminating column  ['Safety_Score']  Score is:  0.9182
Eliminating column  ['Days_Since_Inspection']  Score is:  0.9373999999999999
Eliminating column  ['Total_Safety_Complaints']  Score is:  0.9585999999999999
Eliminating column  ['Control_Metric']  Score is:  0.8986000000000001
Eliminating column  ['Accident_Type_Code']  Score is:  0.9574999999999999
Eliminating column  ['Violations']  Score is:  0.9602
Eliminating column  ['ftr1']  Score is:  0.9526999999999999
Eliminating column  ['ftr2']  Score is:  0.9607000000000001
Eliminating column  ['DSI_Segmented']  Score is:  0.9568
Not Eliminating any column, score is:  0.9639999999999999


In [7]:
# Final Random Forest model, fitted on the remaining feature columns.
# Needs `data_rf`, `all_cols` and `target_col` from the cells above; the
# recorded NameError below comes from running this cell without them.
rf = RandomForestClassifier(random_state=123, n_estimators=165, n_jobs=-1)
print(rf)
# Select the target by its label (not by a one-element list) so that y is a
# 1-D Series; a one-column DataFrame makes scikit-learn emit a column-vector
# warning on fit. The rows of the split are unaffected by this change.
train_X, test_X, train_y, test_y = train_test_split(
    data_rf[all_cols], data_rf[target_col[0]], random_state=111
)
rf.fit(train_X, train_y)
# "Test" below is the hold-out part of `data_rf`, not the separate `test` file.
print('Train Accuracy ',rf.score(train_X,train_y)*100,'%')
print('Test Accuracy ',rf.score(test_X, test_y)*100,'%')

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=165,
                       n_jobs=-1, oob_score=False, random_state=123, verbose=0,
                       warm_start=False)


NameError: name 'all_cols' is not defined

In [114]:
# Predict the integer severity codes for the test frame with the fitted
# forest. Requires `test_rf` and `all_cols` to exist in the kernel.
test_rf['SeverityCode'] = rf.predict(test_rf[all_cols])

In [115]:
def get_key(code):
    """Translate an integer code back to its label using ``label_dict``.

    Returns the first key of ``label_dict`` whose value equals ``code``, or
    the string ``'False'`` when no value matches. This redefines the helper
    of the same name from the Gradient Boost section with the same behaviour.
    """
    matches = (label for label, encoded in label_dict.items() if code == encoded)
    return next(matches, 'False')
# Decode the predicted codes back into the original severity labels.
test_rf['Severity'] = test_rf['SeverityCode'].apply(get_key)

In [116]:
# Keep only the id and the decoded label for the submission file.
test_save = test_rf[['Accident_ID', 'Severity']]

In [119]:
# Index by 'Accident_ID' so the id is written as the first CSV column.
# Running this cell twice fails, because the column is an index after the
# first run.
test_save= test_save.set_index('Accident_ID')

In [120]:
# Write the Random Forest predictions to a CSV file in the working directory.
test_save.to_csv('final_submit_rf.csv')