In [42]:
import numpy as np # linear algebra
import gc
import os
import time
from operator import itemgetter
import lightgbm as lgb

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import matplotlib.pyplot as pltz
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold,GroupKFold,train_test_split
from sklearn.metrics import f1_score,roc_auc_score,classification_report,confusion_matrix

In [43]:
train_df = pd.read_csv(os.path.join(os.getcwd()+'/input/train.csv'))
test_df = pd.read_csv(os.path.join(os.getcwd()+'/input/test.csv'))
sample_sub_df = pd.read_csv(os.path.join(os.getcwd()+'/input/sample_submission.csv'))

In [44]:
train_df.head()

Unnamed: 0,Severity,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
0,Minor_Damage_And_Injuries,49.224,14,22,71.285,0.272,78.04,2,31335.477,3,0.424,7570
1,Minor_Damage_And_Injuries,62.466,10,27,72.288,0.424,84.54,2,26024.711,2,0.352,12128
2,Significant_Damage_And_Fatalities,63.059,13,16,66.363,0.323,78.86,7,39269.054,3,0.003,2181
3,Significant_Damage_And_Serious_Injuries,48.082,11,9,74.704,0.337,81.79,3,42771.499,1,0.212,5946
4,Significant_Damage_And_Fatalities,26.484,13,25,47.949,0.541,77.16,3,35509.229,2,0.177,9054


In [45]:
def flag_features(train_df):
    train_df['Violations_FLAG']=np.where(train_df['Violations']>0,1,0)
    train_df['Total_Safety_Complaints_FLAG']=np.where(train_df['Total_Safety_Complaints']>0,1,0)
    if 'Accident_ID' in train_df.columns:
        return train_df.drop(['Accident_ID'],axis=1)
    else:
        return train_df

#mod features
def power_transform(train_df):
    train_df['Total_Safety_Complaints_sq'] = np.power(2, train_df['Total_Safety_Complaints'])
    train_df['Days_Since_Inspection_sq'] = np.power(2, train_df['Days_Since_Inspection'])
    train_df['Safety_Score'] = np.power(2, train_df['Safety_Score'])
    return train_df
    

train_df=flag_features(train_df)
test_df2=flag_features(test_df)
train_df=power_transform(train_df)
test_df2=power_transform(test_df2)

In [46]:
class_map = {
    'Minor_Damage_And_Injuries': 0,
    'Significant_Damage_And_Fatalities': 1,
    'Significant_Damage_And_Serious_Injuries': 2,
    'Highly_Fatal_And_Damaging': 3
}
inverse_class_map = {
    0: 'Minor_Damage_And_Injuries',
    1: 'Significant_Damage_And_Fatalities',
    2: 'Significant_Damage_And_Serious_Injuries',
    3: 'Highly_Fatal_And_Damaging'
}

In [47]:
train_df2=train_df.iloc[:9000,:]
sample_base=train_df.iloc[9001:,:]

In [48]:
params = {
          "objective" : "multiclass",
          "num_class" : 4,
          "num_leaves" : 60,
          "max_depth": -1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 5,        # subsample_freq
          "bagging_seed" : 2018,
          "verbosity" : -1 }

In [53]:
X=train_df2.drop(['Severity'],axis=1)
y=train_df2['Severity'].map(class_map)

features = [c for c in X.columns]


skf = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros((len(X),4))
predictions = np.zeros((len(test_df2),4))
feature_importance_df = pd.DataFrame()

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(skf.split(X.values, y.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(X.iloc[trn_idx][features], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X.iloc[val_idx][features], label=y.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(X.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df2[features], num_iteration=clf.best_iteration) / 5



fold n°0


TypeError: argument of type 'Dataset' is not iterable

In [50]:
oofargmax=oof.argmax(axis=1)
print("CV score: {:<8.5f}".format(f1_score(y, oofargmax,average='weighted')))

CV score: 0.95611 


In [54]:
predictions

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [52]:
feature_importance_df.head(15)

Unnamed: 0,feature,importance,fold
0,Safety_Score,66256,1
1,Days_Since_Inspection,43066,1
2,Total_Safety_Complaints,14554,1
3,Control_Metric,42485,1
4,Turbulence_In_gforces,23612,1
5,Cabin_Temperature,20703,1
6,Accident_Type_Code,5762,1
7,Max_Elevation,19330,1
8,Violations,4845,1
9,Adverse_Weather_Metric,18967,1


In [26]:
#submit2 CV score: 0.95644 

CV score: 0.95644 


In [61]:
X=train_df2.drop(['Severity'],axis=1)
y=train_df2['Severity'].map(class_map)

features = [c for c in X.columns]

from catboost import Pool, CatBoostClassifier
model = CatBoostClassifier(loss_function="MultiClass")
kf = KFold(n_splits=5, random_state=42, shuffle=True)

y_valid_pred = np.zeros((len(X),4))
y_test_pred = np.zeros((len(test_df2),4))

start = time.time()


for idx, (train_index, valid_index) in enumerate(kf.split(X.values,y.values)):
    print("fold n°{}".format(idx))
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    X_train, X_valid = X[features].iloc[train_index,:], X[features].iloc[valid_index,:]
    _train = Pool(X_train, label=y_train)
    _valid = Pool(X_valid, label=y_valid)
    print( "\nFold ", idx)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=200
                         )
    pred = fit_model.predict_proba(X_valid)
    #print( "  f1 = ", f1_score(y_valid, pred,average='weighted') )
    y_valid_pred[valid_index] = pred
    y_test_pred += fit_model.predict_proba(test_df2[features])
#y_test_pred /= 5



fold n°0

Fold  0
0:	learn: 1.3464205	test: 1.3472251	best: 1.3472251 (0)	total: 10.9ms	remaining: 10.9s
200:	learn: 0.2916360	test: 0.3242386	best: 0.3242386 (200)	total: 1.72s	remaining: 6.85s
400:	learn: 0.2073085	test: 0.2506893	best: 0.2506893 (400)	total: 3s	remaining: 4.48s
600:	learn: 0.1677697	test: 0.2208003	best: 0.2207911 (599)	total: 4.27s	remaining: 2.84s
800:	learn: 0.1444700	test: 0.2047006	best: 0.2047006 (800)	total: 5.56s	remaining: 1.38s
999:	learn: 0.1264690	test: 0.1937062	best: 0.1937062 (999)	total: 6.86s	remaining: 0us

bestTest = 0.1937062184
bestIteration = 999

fold n°1

Fold  1
0:	learn: 1.3446956	test: 1.3448221	best: 1.3448221 (0)	total: 54.3ms	remaining: 54.2s
200:	learn: 0.3022717	test: 0.3020873	best: 0.3020873 (200)	total: 1.37s	remaining: 5.44s
400:	learn: 0.2108642	test: 0.2222064	best: 0.2222064 (400)	total: 2.65s	remaining: 3.96s
600:	learn: 0.1705130	test: 0.1914458	best: 0.1914458 (600)	total: 3.93s	remaining: 2.61s
800:	learn: 0.1453094	test: 0

In [63]:
y_valid_pred2=y_valid_pred.argmax(axis=1)
print("CV score: {:<8.5f}".format(f1_score(y, y_valid_pred2,average='weighted')))

CV score: 0.95299 


In [65]:
model.get_feature_importance(Pool(type="ShapValues")

CatBoostError: Feature importance type EFstrType.ShapValues requires training dataset                         to be passed to this function.

In [27]:
submission_file=pd.DataFrame(test_df['Accident_ID'])
submission_file['Severity']=predictions.argmax(axis=1)
submission_file['Severity']=submission_file['Severity'].map(inverse_class_map)

In [29]:
submission_file.to_csv('final_submit1.csv',index=False)