In [1]:
# For reading data filenames 
import glob


# For calculating Time 
import datetime
import time


# For processing data
import math
import pandas as pd
import numpy as np

## For Features Creation
from feature_engineering import *

## For Model Building
import lightgbm as lgb


## For Model Evaluation
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, accuracy_score

## For Plotting Graphs
import matplotlib
import matplotlib.pyplot  as plt
import seaborn as sns 


## for cross validation
from sklearn.model_selection import KFold, train_test_split

## Hyperparameter Tuning
import hyperopt
from hyperopt import hp, tpe, STATUS_OK, Trials
import pickle


import warnings
# Install a blanket 'ignore' filter: no warning from any library is shown for
# the rest of the session.
warnings.filterwarnings('ignore')

#from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score

#from keras import Sequential
#from keras.layers import Dense, LSTM

# Default figure size for every matplotlib figure that does not pass its own figsize.
matplotlib.rcParams['figure.figsize'] = [10, 10]



Using TensorFlow backend.


In [2]:
## Load the data and sort it at the bookingID * second level

def data_load_fun(train_path):
    """Load every CSV matching ``train_path`` into one DataFrame sorted by trip and time.

    Parameters
    ----------
    train_path : str
        Glob pattern (for example ``'../01_Data/Train/*.csv'``) selecting the
        CSV files to read.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matched files, sorted by ``bookingID``
        and then ``second``. The per-file row index is kept (no re-indexing).

    Raises
    ------
    FileNotFoundError
        If the pattern matches no file. Previously this case surfaced as a
        ``KeyError`` from sorting an empty, column-less frame.
    """
    start = datetime.datetime.now().replace(microsecond=0)

    # glob returns paths in filesystem order; sort so the load order (and the
    # relative order of rows that tie on the sort keys) is reproducible.
    filename = sorted(glob.glob(train_path))
    if not filename:
        raise FileNotFoundError("No files match pattern: {}".format(train_path))

    # Collect the parts and concatenate once: concatenating inside the loop
    # re-copies everything loaded so far on every iteration.
    frames = []
    for file in filename:
        print (file)
        frames.append(pd.read_csv(file))
    features = pd.concat(frames, axis=0)

    features = features.sort_values(by= ['bookingID',"second"])
    end = datetime.datetime.now().replace(microsecond=0)
    print (len(filename), " Files Loaded Successfuly; Time Taken -->" ,end-start)
    return features
    

In [3]:
def data_pre_processing(df, max_second=10000):
    """Keep only the rows whose ``second`` value is strictly below ``max_second``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``second`` column.
    max_second : int or float, default 10000
        Exclusive upper bound on ``second``. The default reproduces the
        previously hard-coded cutoff.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels. ``df`` itself is
        not modified.
    """
    keep = df["second"] < max_second
    return df[keep]

In [4]:
## Features Creation
def features_creation(features):
    """Turn the per-second telemetry frame into one feature row per ``bookingID``.

    Steps, in order:

    1. ``generic_features_creation_01`` is applied to the input frame.
    2. ``window_features_creation1_03`` is called twice with
       ``window_size=10, over_wd=5`` - once on the sensor columns, once on the
       indicator columns with ``metrics={"sum","max"}`` - and the two results
       are left-joined on ``bookingID`` and ``window``.
    3. Eight trip-level feature sets are computed (one from the enriched frame,
       seven from the windowed frame) and left-joined on ``bookingID``, starting
       from a copy of the first set.

    All helpers come from the ``feature_engineering`` module; their outputs are
    presumably keyed by ``bookingID`` (required by the merges) - confirm there.

    Parameters
    ----------
    features : pandas.DataFrame
        Raw telemetry rows.

    Returns
    -------
    pandas.DataFrame
        The merged trip-level feature table.
    """
    t_begin = datetime.datetime.now().replace(microsecond=0)

    features = generic_features_creation_01(features)

    # Windowed aggregates: sensor columns first, then the indicator columns.
    sensor_windows = window_features_creation1_03(
        features,
        window_size=10,
        over_wd=5,
        cols=['Accuracy','Bearing','Speed','gyro','acceleration','acc_cal'],
    )
    indicator_windows = window_features_creation1_03(
        features,
        window_size=10,
        over_wd=5,
        cols=['signal_weak','stop','trip_start','missing_ind','trip_end'],
        metrics={"sum","max"},
    )
    window_feas_final = pd.merge(
        sensor_windows, indicator_windows, on=['bookingID','window'], how="left"
    )
    t_windows = datetime.datetime.now().replace(microsecond=0)
    print (window_feas_final.shape[1], " Features Created at sliding waindow of 10s with overlap of 5s; Time Taken->", t_windows-t_begin)

    # Trip-level feature sets, evaluated in the same order as they are merged.
    trip_feature_sets = [
        generic_trip_features_02(features),
        window_grp_stop_04(window_feas_final),
        window_grp_speed_05(window_feas_final),
        window_grp_bearing_06(window_feas_final, cols = ['Bearing_std',"Bearing_max","Bearing_min"], thres=10, var_nm="turn"),
        trip_ending_fes_07(window_feas_final),
        window_grp_accuracy_08(window_feas_final),
        generic_stats_features_09(window_feas_final, cols= ["Speed","acc_cal","Accuracy","acceleration","gyro"]),
        events_calculation_10(window_feas_final),
    ]

    # Start from a copy of the first set and left-join the rest onto it.
    fset = trip_feature_sets[0].copy()
    for extra_set in trip_feature_sets[1:]:
        fset = fset.merge(extra_set, on = ['bookingID'], how = "left")

    t_done = datetime.datetime.now().replace(microsecond=0)
    print (fset.shape[1], " Features Created from windows data; Time taken ->", t_done- t_windows)
    return fset

In [5]:
# Input locations: a glob over the telemetry CSV parts and the single labels CSV.
label_path = '../01_Data/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv'
train_path = '../01_Data/Train/*.csv'
# Alternative telemetry location: '../01_Data/features/*.csv'

# Load the raw rows, filter them, then build the trip-level feature table.
train_data = data_pre_processing(data_load_fun(train_path))
features_all = features_creation(train_data)


../01_Data/Train/part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
../01_Data/Train/part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
10  Files Loaded Successfuly; Time Taken --> 0:00:55


KeyboardInterrupt: 

In [None]:
## Read the labels; when a bookingID has duplicate rows, keep the maximum
## (i.e. the positive label wins).
labels = (
    pd.read_csv(label_path)
    .groupby(['bookingID'])
    .max()
    .reset_index()
)
labels.shape

In [None]:
# Preview the first rows of the feature table returned by features_creation.
features_all.head()

In [None]:
# Attach the label to every feature row; the left join keeps all rows of the
# feature table even when no label matches.
features_all = features_all.merge(labels, on=['bookingID'], how="left")

# Split identifier + label away from the model inputs. Note that this cell
# overwrites `features_all`, so it can only be run once per fresh feature table.
drop_cols = ['bookingID','label']
target = features_all[drop_cols]
features_all = features_all.drop(columns=drop_cols)

# 80/20 hold-out split with a fixed seed.
train_df, val_df, train_target, val_target = train_test_split(
    features_all,
    target,
    test_size=0.2,
    random_state=1259,
)
train_df.shape, val_df.shape

In [None]:
# Share of each label value in the training split.
train_target.label.value_counts(normalize = True)

In [None]:
# Share of each label value in the validation split.
val_target.label.value_counts(normalize = True)

In [None]:
# Shuffled 5-fold splitter with a fixed seed, shared by the training routine.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)

# Mean of the training labels; used to weight the positive class by the
# negative/positive ratio.
positive_rate = np.mean(train_target.label)

# NOTE(review): the integer entries below equal the raw `best` dict printed at
# the end of this notebook. hyperopt reports hp.choice dimensions as option
# indices, so these may be indices rather than parameter values - confirm with
# hyperopt.space_eval(space, best).
best_params = {
    'num_leaves': 8,
    'colsample_bytree': 0.7414265939570617,
    'max_depth': 6,
    'learning_rate': 0.019922131503094735,
    'subsample': 0.5675422486112608,
    'min_data_in_leaf': 51,
    'min_sum_hessian_in_leaf': 10,
    'bagging_freq': 9,
    'scale_pos_weight': (1 - positive_rate)/positive_rate,
}

In [None]:
def train_model(X=train_df, X_test=val_df, y=train_target[['label']], params=None, folds=folds,
                plot_feature_importance=False):
    """Train one LightGBM classifier per CV fold and average their test predictions.

    Note: the default arguments are bound when this cell is executed, so the
    cell must be re-run after ``train_df``, ``val_df``, ``train_target`` or
    ``folds`` are redefined.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; split by ``folds``.
    X_test : pandas.DataFrame
        Features to predict with every fold model.
    y : pandas.DataFrame
        Training target with a ``label`` column, row-aligned with ``X``.
    params : dict or None
        Extra keyword arguments for ``lgb.LGBMClassifier``. ``None`` means no
        extra arguments (it used to raise ``TypeError`` on ``**params``).
    folds : splitter
        Object whose ``split(X)`` yields ``(train_index, valid_index)`` pairs.
    plot_feature_importance : bool
        When true, draw a bar plot of the 50 features with the highest mean
        importance and also return the importance table.

    Returns
    -------
    tuple
        ``(oof, prediction)``, or ``(oof, prediction, feature_importance)``
        when ``plot_feature_importance`` is true. ``oof`` holds each training
        row's probability from the fold in which it was held out;
        ``prediction`` is the mean test probability over the fold models.
    """
    # Fall back to an empty dict so the declared default (None) is usable.
    params = {} if params is None else params

    print ("Train Data -- >", X.shape[0])
    print ("Test Data -- >", X_test.shape[0])

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []
    # Count the folds actually produced by `folds`, so the averages below stay
    # correct when a splitter with a different number of splits is passed in
    # (the global `n_fold` was used before).
    n_folds_run = 0

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # NOTE(review): `nthread` and `n_jobs` both configure the thread count;
        # which one wins depends on the LightGBM version - confirm and keep one.
        model = lgb.LGBMClassifier(**params, n_estimators = 2000, nthread = 4, n_jobs = -1)
        model.fit(X_train, y_train,
                        eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc',
                        verbose=500, early_stopping_rounds=300)

        y_pred_valid = model.predict_proba(X_valid)[:,1]
        y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:,1]

        # Out-of-fold probabilities and this fold's validation AUC.
        oof[valid_index] = y_pred_valid.reshape(-1,)
        fpr, tpr, thresholds = roc_curve(y_valid.label,y_pred_valid)
        roc_auc_valid = auc(fpr, tpr)
        scores.append(roc_auc_valid)

        prediction += y_pred

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = X.columns
        fold_importance["importance"] = model.feature_importances_
        fold_importance["fold"] = fold_n + 1
        # Collected here and concatenated once after the loop.
        fold_importances.append(fold_importance)
        n_folds_run += 1

    prediction /= n_folds_run

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    feature_importance = pd.concat(fold_importances, axis=0)
    # Each fold's importance is scaled by 1/n_folds, so the per-feature SUM over
    # folds is the fold average.
    # NOTE(review): the ranking and bar plot below aggregate with the mean, which
    # divides by the fold count a second time - ordering is unaffected, but the
    # plotted magnitudes are smaller than a true fold average. Confirm intent.
    feature_importance["importance"] /= n_folds_run
    if plot_feature_importance:
        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LGB Features (avg over folds)')

        return oof, prediction, feature_importance
    return oof, prediction
    
   

In [None]:
# Train with the tuned parameters on the default train/validation frames.
# With plot_feature_importance=True the call returns three values:
# out-of-fold predictions, averaged validation predictions and the importance table.
oof, pred, fp = train_model(params=best_params, plot_feature_importance=True)

In [None]:
# The "train" score uses the out-of-fold predictions; the "val" score uses the
# fold-averaged predictions on the hold-out frame.
train_pred, val_pred = oof, pred

print ("stats--->")
print ("")

# `thresholds` is overwritten by the second call and ends up describing the
# validation curve, as before.
fpr_train, tpr_train, thresholds = roc_curve(train_target.label,train_pred)
fpr_val, tpr_val, thresholds = roc_curve(val_target.label,val_pred)
roc_auc_train = auc(fpr_train, tpr_train)
roc_auc_val = auc(fpr_val, tpr_val)

print ("Train ROC --> ", roc_auc_train)
print ("Val ROC --> ", roc_auc_val)

# Both ROC curves on one figure, with the diagonal as the chance baseline.
plt.figure()
plt.plot(fpr_train, tpr_train,
         color='darkorange', lw=2,
         label='Train ROC curve (area = %0.2f)' % roc_auc_train)
plt.plot(fpr_val, tpr_val,
         color='red', lw=2,
         label='Val ROC curve (area = %0.2f)' % roc_auc_val)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

## Hyperparameter Tuning

In [None]:
# Hyperparameter tuning
# Log filled in by `objective`: maps each evaluation's CV AUC to its parameters.
param_dict = {}

# Integer-valued dimensions: name -> (start, stop) handed to np.arange
# (stop is exclusive) and sampled with hp.choice.
_int_ranges = {
    'max_depth': (-1, 10),
    'min_data_in_leaf': (10, 400),
    'min_sum_hessian_in_leaf': (0, 15),
    'num_leaves': (2, 20),
    'bagging_freq': (1, 20),
}

space = {
    name: hp.choice(name, np.arange(lo, hi, dtype=int))
    for name, (lo, hi) in _int_ranges.items()
}
# Continuous dimensions, sampled uniformly.
space.update({
    'subsample': hp.uniform('subsample', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
})


def objective(space):
    """hyperopt objective: 5-fold LightGBM CV AUC for one sampled parameter set.

    Side effects: the evaluated parameters (plus the number of boosting rounds
    kept by early stopping, stored as ``n_estimators``) are recorded in the
    global ``param_dict`` under their CV AUC, and the whole dict is re-pickled
    to the file ``params`` after every evaluation. Two evaluations with the
    exact same AUC overwrite each other in ``param_dict``.

    NOTE(review): the CV runs on ``features_all`` / ``target``, i.e. on all
    rows, including those that were split off into ``val_df`` - the hold-out
    score of parameters tuned here is therefore optimistic. Confirm whether
    ``train_df`` / ``train_target`` was intended.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space, with the keys read below.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``; the AUC is negated because
        hyperopt minimises the loss.
    """
    params = {
        'min_sum_hessian_in_leaf': space['min_sum_hessian_in_leaf'],
        'min_data_in_leaf':space['min_data_in_leaf'],
        'num_leaves':space['num_leaves'],
        'subsample': space['subsample'],
        'colsample_bytree': space['colsample_bytree'],
        'learning_rate':space['learning_rate'],
        'silent': 1,
        'verbose_eval': True,
        "objective":"binary",
        'device':'cpu',
        "boosting":"gbdt",
        'max_depth':space['max_depth'],
        'bagging_freq':space['bagging_freq'],
        'metric':"auc",
        'boost_from_average':False,
    }
    lgtrain = lgb.Dataset(features_all, label=target.label.values)
    cv = lgb.cv(params,
                lgtrain,
                nfold=5,metrics='auc',
                num_boost_round=20000,
                early_stopping_rounds=600,stratified=True,shuffle=True,verbose_eval=-1)
    # Last entry of the CV history is the score reported for this parameter set;
    # the history length is the number of boosting rounds kept.
    au = (cv['auc-mean'][-1])
    params['n_estimators']=len(cv['auc-mean'])
    param_dict[au]=params

    # Persist progress after every evaluation. The context manager closes the
    # file each time; the previous bare open() leaked one handle per evaluation.
    with open('params','wb') as params_file:
        pickle.dump(param_dict, params_file)

    return{'loss': -au, 'status': STATUS_OK}


# Run the TPE search: up to 1000 evaluations of `objective` over `space`,
# with every evaluation recorded in `trials`.
trials = Trials()
best = hyperopt.fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1000,
            trials=trials
            )

# `best` is kept exactly as fmin returns it. For hp.choice dimensions that is
# the INDEX of the chosen option, not the option itself (e.g. an index into
# np.arange(2, 20) for num_leaves). space_eval maps the assignment back onto
# the search space, so `best_values` holds the real parameter values and is
# the dict to copy into a model configuration.
best_values = hyperopt.space_eval(space, best)



In [56]:
# Show the assignment returned by hyperopt.fmin.
# NOTE(review): for hp.choice dimensions hyperopt is documented to report the
# option index rather than the value - confirm with hyperopt.space_eval(space, best)
# before reusing these numbers as model parameters.
best

{'bagging_freq': 9,
 'colsample_bytree': 0.7414265939570617,
 'learning_rate': 0.019922131503094735,
 'max_depth': 6,
 'min_data_in_leaf': 51,
 'min_sum_hessian_in_leaf': 10,
 'num_leaves': 8,
 'subsample': 0.5675422486112608}