In [312]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import learning_curve, GridSearchCV    #Perforing grid search

In [313]:
# Read both competition files from the notebook's working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Split the identifier off the test features in a single step: `pop` removes
# the column from `test` and returns it, so the IDs stay available for the
# submission file written later.
ac_id = test.pop('Accident_ID')

train.columns

Index(['Severity', 'Safety_Score', 'Days_Since_Inspection',
       'Total_Safety_Complaints', 'Control_Metric', 'Turbulence_In_gforces',
       'Cabin_Temperature', 'Accident_Type_Code', 'Max_Elevation',
       'Violations', 'Adverse_Weather_Metric', 'Accident_ID'],
      dtype='object')

In [314]:
# Display the training frame as loaded (pandas elides the middle rows).
train

Unnamed: 0,Severity,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
0,Minor_Damage_And_Injuries,49.223744,14,22,71.285324,0.272118,78.04,2,31335.476824,3,0.424352,7570
1,Minor_Damage_And_Injuries,62.465753,10,27,72.288058,0.423939,84.54,2,26024.711057,2,0.352350,12128
2,Significant_Damage_And_Fatalities,63.059361,13,16,66.362808,0.322604,78.86,7,39269.053927,3,0.003364,2181
3,Significant_Damage_And_Serious_Injuries,48.082192,11,9,74.703737,0.337029,81.79,3,42771.499200,1,0.211728,5946
4,Significant_Damage_And_Fatalities,26.484018,13,25,47.948952,0.541140,77.16,3,35509.228515,2,0.176883,9054
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Significant_Damage_And_Serious_Injuries,56.118721,8,1,63.445761,0.482359,78.64,5,38537.363408,2,0.026150,11493
9996,Highly_Fatal_And_Damaging,40.365297,10,7,62.169553,0.421775,79.77,4,40846.916900,2,0.074898,305
9997,Significant_Damage_And_Serious_Injuries,27.853881,17,1,69.598906,0.318277,80.37,3,33771.996300,3,0.168173,612
9998,Minor_Damage_And_Injuries,56.210046,8,0,39.835916,0.598118,76.92,5,39820.419251,2,0.026381,4963


In [315]:
# Display the test features; 'Accident_ID' was already split off in the load cell.
test

Unnamed: 0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric
0,19.497717,16,6,72.151322,0.388959,78.32,4,37949.724386,2,0.069692
1,58.173516,15,3,64.585232,0.250841,78.60,7,30194.805567,2,0.002777
2,33.287671,15,3,64.721969,0.336669,86.96,6,17572.925484,1,0.004316
3,3.287671,21,5,66.362808,0.421775,80.86,3,40209.186341,2,0.199990
4,10.867580,18,2,56.107566,0.313228,79.22,2,35495.525408,2,0.483696
...,...,...,...,...,...,...,...,...,...,...
2495,68.127854,8,11,49.680948,0.470819,77.80,2,48251.586622,2,0.659387
2496,54.840183,9,9,71.194166,0.417087,79.77,6,20761.984416,1,0.005010
2497,36.712329,14,10,91.203282,0.313950,83.96,6,36676.100601,2,0.008629
2498,38.721461,13,14,73.154057,0.275003,77.38,6,24211.359510,1,0.006247


In [316]:
# Remove the four features that are not used as model inputs, plus the row
# identifier. Rebinding `train` (instead of mutating in place) keeps the
# statement a plain expression of "new frame from old frame".
train = train.drop(
    columns=[
        'Adverse_Weather_Metric',
        'Max_Elevation',
        'Accident_ID',
        'Cabin_Temperature',
        'Turbulence_In_gforces',
    ]
)

In [317]:
# Re-display the training frame to check which columns remain after the drop.
train

Unnamed: 0,Severity,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Accident_Type_Code,Violations
0,Minor_Damage_And_Injuries,49.223744,14,22,71.285324,2,3
1,Minor_Damage_And_Injuries,62.465753,10,27,72.288058,2,2
2,Significant_Damage_And_Fatalities,63.059361,13,16,66.362808,7,3
3,Significant_Damage_And_Serious_Injuries,48.082192,11,9,74.703737,3,1
4,Significant_Damage_And_Fatalities,26.484018,13,25,47.948952,3,2
...,...,...,...,...,...,...,...
9995,Significant_Damage_And_Serious_Injuries,56.118721,8,1,63.445761,5,2
9996,Highly_Fatal_And_Damaging,40.365297,10,7,62.169553,4,2
9997,Significant_Damage_And_Serious_Injuries,27.853881,17,1,69.598906,3,3
9998,Minor_Damage_And_Injuries,56.210046,8,0,39.835916,5,2


In [318]:
# Remove the same four unused features from the test frame so that it lines up
# with the columns kept in `train` ('Accident_ID' is already gone from `test`).
test = test.drop(
    columns=[
        'Adverse_Weather_Metric',
        'Max_Elevation',
        'Cabin_Temperature',
        'Turbulence_In_gforces',
    ]
)

In [319]:
# Cast the label column to a pandas categorical so each severity class gets an
# integer code that can be read back via `.cat.codes` / `.cat.categories`.
train = train.astype({'Severity': 'category'})

In [320]:
# Mapping from integer class code to the original severity label, used to
# decode model predictions back to strings.
# A categorical's code for a value is that value's position in
# `.cat.categories`, so enumerating the categories yields the mapping directly
# (in code order) without zipping codes against labels for every row.
class_dict = dict(enumerate(train['Severity'].cat.categories))

In [321]:
# Replace the categorical labels with their integer codes, which is the form
# handed to XGBoost as the training label.
# Note: this needs 'Severity' to still be categorical, so the cell cannot be
# re-run on its own after it has executed once.
train = train.assign(Severity=train['Severity'].cat.codes)

In [322]:
# Inspect the code -> label mapping that is used to decode predictions.
class_dict

{1: 'Minor_Damage_And_Injuries',
 2: 'Significant_Damage_And_Fatalities',
 3: 'Significant_Damage_And_Serious_Injuries',
 0: 'Highly_Fatal_And_Damaging'}

In [323]:
# Column names referenced by the modelling cells: the label column and the
# row-identifier column (both are excluded from the predictor list).
target, IDcol = 'Severity', 'Accident_ID'

In [333]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and write test-set predictions to a CSV file.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with early stopping
    and ``alg``'s ``n_estimators`` is set to the number of rounds that CV kept.
    The estimator is then fitted on all of ``dtrain`` and used to predict the
    notebook-level ``test`` frame; predictions are decoded with ``class_dict``
    and saved as ``sample_submission.csv``.

    Relies on these notebook globals: ``target`` (label column name), ``test``
    (test features), ``ac_id`` (test accident IDs) and ``class_dict``
    (integer code -> severity label).

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (an ``XGBClassifier`` in this notebook).
    dtrain : DataFrame holding the predictor columns and the ``target`` column.
    predictors : list of column names used as features.
    useTrainCV : if true, tune ``n_estimators`` via cross-validation first.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None. Side effects: mutates ``alg`` and writes ``sample_submission.csv``.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # Labels are 0-based integer codes, so the class count is max + 1;
        # deriving it avoids a hard-coded 4.
        xgb_param['num_class'] = int(dtrain[target].max()) + 1
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='mlogloss', early_stopping_rounds=early_stopping_rounds,verbose_eval=None)
        alg.set_params(n_estimators=cvresult.shape[0])
        # Only print inside this branch: `cvresult` does not exist when CV is
        # skipped, and printing it unconditionally raised NameError.
        print(cvresult)

    # Fit the algorithm on the full training data.
    # NOTE(review): newer xgboost releases expect `eval_metric` to be set on the
    # estimator rather than passed to `fit` — confirm against the installed version.
    alg.fit(dtrain[predictors], dtrain[target],eval_metric='mlogloss')

    # Predict the held-out test set (these are test predictions, not training ones).
    test_predictions = alg.predict(test[predictors])

    #Print model report:
    print ("\nModel Report")

    # Build the submission frame and decode integer codes back to labels.
    # Assigning the result of `replace` (rather than calling it with
    # inplace=True on `frame.Severity`) guarantees the frame is actually updated;
    # the chained in-place form is a no-op under pandas Copy-on-Write.
    submission = pd.DataFrame({'Accident_ID':ac_id,'Severity': test_predictions})
    submission['Severity'] = submission['Severity'].replace(class_dict)
    submission.to_csv('sample_submission.csv', index=False)

In [334]:
# Feature columns: every column of `train` other than the label and the ID.
predictors = [col for col in train.columns if col != target and col != IDcol]

# Settings for the baseline multi-class booster. `n_estimators` is only an
# upper bound here: `modelfit` lowers it to the round count kept by
# cross-validated early stopping before the final fit.
xgb1_params = dict(
    learning_rate=0.15,
    n_estimators=1100,
    max_depth=9,
    min_child_weight=1,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_params)

modelfit(xgb1, train, predictors)

     train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
0               1.238870            0.000582            1.256405   
1               1.119072            0.019584            1.148569   
2               1.017633            0.023995            1.058730   
3               0.922486            0.017821            0.973072   
4               0.847333            0.023930            0.906199   
..                   ...                 ...                 ...   
260             0.005521            0.000067            0.093393   
261             0.005498            0.000066            0.093383   
262             0.005480            0.000067            0.093365   
263             0.005459            0.000064            0.093332   
264             0.005435            0.000063            0.093325   

     test-mlogloss-std  
0             0.002301  
1             0.020372  
2             0.025159  
3             0.017812  
4             0.024921  
..                 ...  
260     

In [336]:
# Scratch data for the set-membership check in the following cells
# (unrelated to the severity model).
nums = [2, 7, 11, 9]

In [338]:
# Build a set from `nums` so membership can be tested by hash lookup.
s = {*nums}

In [339]:
# Membership test against the set `s`; the cell displays the boolean result.
7 in s

True