In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

In [None]:
# Read the competition's train and test CSVs and report their (rows, columns).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/janatahack-mobility-analysis/train.csv',
        '../input/janatahack-mobility-analysis/test.csv',
    )
)
print(train.shape, test.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the number of distinct (non-null) values in every training column.
for col, a in train.nunique().items():
    print(f'{col}: {a}')

In [None]:
# Count missing values per training column.
train.isnull().sum()

## Appending Train and test together for easy data manipulation

In [None]:
# Tag every row with its origin so the combined frame can be split back later.
train['flag'] = 'train'
test['flag'] = 'test'

# Stack train on top of test. `DataFrame.append` was removed in pandas 2.0;
# `pd.concat` is the equivalent call. Any column present in only one of the
# frames is filled with NaN for the other frame's rows, so no placeholder
# target needs to be created on `test` beforehand.
all_df = pd.concat([train, test])

# Keep the raw cancellation counts in row order: the column is one-hot encoded
# later via `cat_cols`, and this list is used to re-attach the numeric version.
cancel1_month = list(all_df['Cancellation_Last_1Month'])

all_df.tail()

In [None]:
# Columns treated as categorical (one-hot encoded further down).
cat_cols = [
    'Type_of_Cab',
    'Confidence_Life_Style_Index',
    'Destination_Type',
    'Gender',
    'Cancellation_Last_1Month',
]

# Show the distinct values of each categorical column in the combined frame.
for col in cat_cols:
    print(f'{col}: {all_df[col].unique()}')

## Feature Engineering

In [None]:
# Ordinal code for the confidence letter: A -> 0, B -> 1, C -> 2.
# Any other value (including missing) is passed through unchanged.
confidence_codes = {'A': 0, 'B': 1, 'C': 2}
all_df['Confidence_Life_Style_Index_new'] = [
    confidence_codes.get(level, level)
    for level in all_df['Confidence_Life_Style_Index']
]

# Cab types outside A-E (including missing) are collected into a bucket 'F'.
known_cab_types = ('A', 'B', 'C', 'D', 'E')
all_df['Type_of_Cab_new'] = [
    cab if cab in known_cab_types else 'F'
    for cab in all_df['Type_of_Cab']
]

In [None]:
# Shift the lifestyle index up / down by (confidence code * mean lifestyle index).
mean_LS_index = np.mean(all_df['Life_Style_Index'])
confidence_shift = all_df['Confidence_Life_Style_Index_new'] * mean_LS_index

# `normal(0.0, 1.0)` without a `size` argument draws ONE scalar, so each column
# receives a single constant offset (not per-row noise). Drawing it from a
# seeded generator keeps the engineered features identical across
# Restart & Run All, which the unseeded global `np.random` call did not.
noise_rng = np.random.RandomState(22)
all_df['lifestyle_plus'] = all_df['Life_Style_Index'] + confidence_shift + noise_rng.normal(0.0, 1.0)
all_df['lifestyle_nega'] = all_df['Life_Style_Index'] - confidence_shift + noise_rng.normal(0.0, 1.0)

In [None]:
# Replace the cab-type letters with numeric codes. B/C share a code, D/E share
# a code, and the catch-all bucket 'F' gets 2.2. Values that are not one of
# these letters (e.g. already-numeric values on a re-run) are left as they are.
cab_type_codes = {'A': 1, 'B': 2, 'C': 2, 'D': 3, 'E': 3, 'F': 2.2}
all_df['Type_of_Cab_new'] = [
    cab_type_codes.get(cab, cab) for cab in all_df['Type_of_Cab_new']
]

In [None]:
# Interaction features built from pairwise products, sums, differences and
# ratios of the existing numeric columns. Column creation order is kept.
trip_dist = all_df['Trip_Distance']
cust_rating = all_df['Customer_Rating']
ls_index = all_df['Life_Style_Index']
var2_x_var3 = all_df['Var2'] * all_df['Var3']

# Products of the anonymised Var columns.
all_df['var4'] = var2_x_var3
all_df['var5'] = var2_x_var3 * all_df['Var1']

# Lifestyle index combined with customer rating.
all_df['total_rating'] = ls_index + cust_rating
all_df['diff_rating'] = ls_index - cust_rating

# Trip distance combined with rating / lifestyle index.
all_df['dist_rating'] = trip_dist * cust_rating
all_df['index_dist'] = ls_index * trip_dist
all_df['dist_by_rating'] = trip_dist / cust_rating

# Trip distance combined with the Var columns.
all_df['dist_var2'] = trip_dist * all_df['Var2']
all_df['dist_var3'] = trip_dist * all_df['Var3']
all_df['dist_var4'] = trip_dist * all_df['var4']

In [None]:
# Mean of Var3 over the combined train + test frame (displayed for inspection only).
np.mean(all_df['Var3'])

In [None]:
# List the columns of the combined frame after feature engineering so far.
all_df.columns

### Trip type specific features

In [None]:
# Target-mean encoding: for each key column, attach the mean Surge_Pricing_Type
# of the TRAIN rows sharing that key value as a new `<key>_ratio` column.
# NOTE(review): the means use every training row, including rows that are
# later held out for validation / CV, so validation scores may be optimistic.
# Consider computing these out-of-fold.
for key_col in ('Type_of_Cab_new', 'Destination_Type'):
    ratio_col = f'{key_col}_ratio'

    # Keep the cell re-runnable: without this, a second run would make the
    # merge emit `_x` / `_y` suffixed duplicates of the ratio column.
    if ratio_col in all_df.columns:
        all_df = all_df.drop(columns=ratio_col)

    train_rows = all_df[all_df['flag'] == 'train']
    group_means = train_rows.groupby(key_col)['Surge_Pricing_Type'].mean().reset_index()
    group_means.columns = [key_col, ratio_col]

    # Left merge keeps every row of all_df in its original order; keys that
    # never occur in the train rows get NaN.
    all_df = pd.merge(all_df, group_means, on=key_col, how='left')

In [None]:
# One-hot encode the categorical columns, printing the frame shape at each step.
print(all_df.shape)
all_df = pd.get_dummies(data=all_df, columns=cat_cols)
print(all_df.shape)

# get_dummies replaced Cancellation_Last_1Month with indicator columns;
# re-attach the saved raw values under a new name so both forms are available.
all_df = all_df.assign(Cancel_Last_1Month=cancel1_month)
print(all_df.shape)

In [None]:
# Preview the combined frame after encoding.
all_df.head()

### Logarithmic operation for non-normal features

In [None]:
# Natural-log versions of the two features named in the section heading.
for log_col, source_col in (('log_dist', 'Trip_Distance'), ('log_var3', 'Var3')):
    all_df[log_col] = np.log(all_df[source_col])

In [None]:
# Split the engineered frame back into its train and test parts using the
# origin tag; copies are taken so later edits do not touch all_df.
is_train_row = all_df['flag'] == 'train'
is_test_row = all_df['flag'] == 'test'
train = all_df.loc[is_train_row].copy()
test = all_df.loc[is_test_row].copy()
print(test.shape, train.shape)

In [None]:
# Summarise the dtypes and non-null counts of the engineered training frame.
train.info()

In [None]:
# Feature matrix: everything except the target, the origin tag and the trip id.
non_feature_cols = ['Surge_Pricing_Type', 'flag', 'Trip_ID']
X = train.drop(columns=non_feature_cols)

# Shift the target down by one so the class labels start at 0.
y = train['Surge_Pricing_Type'] - 1.0

(X.shape, y.shape)

In [None]:
# Count missing values per feature column.
X.isnull().sum()

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import RepeatedKFold, cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

## Training

#### 1. Check whether new features improve the CV score. Doing this before any hyperparameter tuning tells us whether the new features are useful at all.

In [None]:
# Baseline: LightGBM classifier scored by 10-fold stratified CV accuracy.
model = LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=22)
n_scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=cv)
print(n_scores.mean())
# Scores noted from earlier runs (presumably as features were added):
# 70.47 -> 70.52 -> 70.53 -> 70.58

In [None]:
# Remove the non-feature columns from the test frame so its columns line up
# with X. `errors='ignore'` keeps the cell re-runnable: once the columns are
# gone, running it again is a no-op instead of raising KeyError.
test = test.drop(['Surge_Pricing_Type', 'flag', 'Trip_ID'], axis=1, errors='ignore')

### 2. Since the competition's evaluation metric is accuracy, we define a custom accuracy metric to use while training our LGBM model.

In [None]:
def accuracy(preds, train_data):
    """Custom evaluation function for LightGBM: multiclass accuracy.

    Parameters
    ----------
    preds : array-like
        Per-class scores for every row. Accepted either as a flat array laid
        out class by class (all rows of class 0, then all rows of class 1, ...)
        or as a 2-D array of shape (n_rows, n_classes).
    train_data : object with a ``get_label()`` method
        Dataset being evaluated; ``get_label()`` must return the true class
        indices, one per row.

    Returns
    -------
    tuple
        ``('accuracy', score, True)`` - metric name, fraction of rows whose
        highest-scoring class equals the label, and a flag telling the caller
        that higher values are better.
    """
    labels = np.asarray(train_data.get_label())
    # Column-major reshape turns the class-by-class flat layout into one row
    # per sample; a 2-D (n_rows, n_classes) input passes through unchanged.
    # Using -1 infers the number of classes instead of hard-coding 3.
    class_scores = np.asarray(preds).reshape((len(labels), -1), order='F')
    predicted = class_scores.argmax(axis=1)
    return 'accuracy', float(np.mean(predicted == labels)), True

### Defining a runLGB() function that trains the model with a fixed set of hyperparameters.

In [None]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None):
    """Train a multiclass LightGBM model and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y
        Training features and 0-based class labels.
    test_X
        Features to predict on (the validation fold when ``test_y`` is given).
    test_y : optional
        Labels for ``test_X``. When provided, ``test_X`` is used as a
        validation set with early stopping (patience 50 rounds) and the
        accuracy on it is returned.
    test_X2 : optional
        A second feature set to predict on (e.g. the competition test set).

    Returns
    -------
    tuple
        ``(pred_test_y, acc, pred_test_y2, model)`` where ``pred_test_y`` holds
        the per-class scores for ``test_X``, ``acc`` is the accuracy on
        ``test_X`` (0 when ``test_y`` is None), ``pred_test_y2`` holds the
        per-class scores for ``test_X2`` (None when ``test_X2`` is None) and
        ``model`` is the trained booster.
    """
    params = {
        "objective": "multiclass",
        # NOTE(review): this is the Python value None, not the string 'None';
        # confirm whether LightGBM's built-in metric was meant to be disabled.
        "metric": None,
        "max_depth": 9,
        "boosting_type": "gbdt",
        "min_data_in_leaf": 50,
        "learning_rate": 0.02,
        "bagging_fraction": 0.9,
        "feature_fraction": 0.7,
        "bagging_freq": 3,
        "bagging_seed": 50,
        "verbosity": -1,
        "num_class": 3,
        "nthread": 4,
    }
    num_rounds = 1000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # Early stopping / periodic logging are passed as callbacks: the
        # `early_stopping_rounds` and `verbose_eval` keyword arguments were
        # removed from `lgb.train` in LightGBM 4. Older releases name the
        # logging callback `print_evaluation`.
        log_evaluation = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation")
        model = lgb.train(
            params,
            lgtrain,
            num_rounds,
            valid_sets=[lgtrain, lgtest],
            feval=accuracy,
            callbacks=[lgb.early_stopping(stopping_rounds=50), log_evaluation(100)],
        )
    else:
        # No labels to validate against: train for the full number of rounds.
        model = lgb.train(params, lgtrain, num_rounds, feval=accuracy)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)

    # test_X2 defaults to None, so only predict on it when it was supplied.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    acc = 0
    if test_y is not None:
        # Highest-scoring class per row versus the true labels.
        acc = accuracy_score(test_y, np.argmax(pred_test_y, axis=1))
    return pred_test_y, acc, pred_test_y2, model

#### Model 2: Using 80-20 split to train with the new non-default parameters

In [None]:
%%time

# Hold out 20% of the training rows for validation, train on the remaining
# 80% with runLGB, and predict on the competition test frame.
from sklearn.model_selection import train_test_split
dev_X, val_X, dev_y, val_y = train_test_split(X, y, test_size = 0.2, random_state = 22)
print(dev_X.shape, val_X.shape)
# runLGB returns (validation scores, validation accuracy, test scores, model).
pred_val, acc, pred_test, model = runLGB(dev_X, dev_y, val_X, val_y, test)
# Author's experiment log from earlier runs:
# 70.24 with depth 7
# 70.50 with depth 8
# 70.80 with depth 8 and new feature var3 and cancel1_month #9min 25s for 720 rounds lr = 0.015
# 70.84 
# 70.73 increased LR
# 70.89 increased LR and decreased children in leaves 6min 36s 500 rounds lr = 0.025
# 71.03 increased LR = 0.025 leaves = 80 8min early = 75 513 rounds only
# 71.02 9min 6s 591 rounds leaves = 50

In [None]:
# Pick the highest-scoring class for every test row.
predictions = list(np.argmax(pred_test, axis=1))

# Fill the sample submission, shifting the 0-based class index back by one.
my_sub = pd.read_csv('../input/janatahack-mobility-analysis/sample_submission.csv')
my_sub['Surge_Pricing_Type'] = np.asarray(predictions) + 1.0
my_sub.to_csv('lgb_1_split_submission.csv', index=False)

#### Model 3: Using Cross-Validation to improve the model's generalization performance. We will aggregate the results on test dataset for each fold and average it to make the final submission.

In [None]:
import time

# 5-fold stratified CV: train one model per fold and accumulate its test-set
# class scores, both unweighted and weighted by the fold's validation accuracy.
cv_acc_scores = []
pred_test_full = 0
pred_test_weight = 0
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle= True, random_state=22)

# Start the clock once, before the first fold, so the elapsed time printed
# after each fold really is the running total the message describes
# (previously the timer was reset at the top of every fold).
start = time.time()
for dev_index, val_index in kf.split(X, y):
    dev_X, val_X = X.iloc[dev_index], X.iloc[val_index]
    dev_y, val_y = y.iloc[dev_index], y.iloc[val_index]

    pred_val, acc, pred_test,model = runLGB(dev_X, dev_y, val_X, val_y, test)
    pred_test_full += pred_test
    pred_test_weight += pred_test*acc
    cv_acc_scores.append(acc)
    print(f'Mean Accuracy: {np.mean(cv_acc_scores)}; Split Accuracy: {acc}')
    print(f'Total time in seconds till this epoch: {time.time()-start}')

# Average the accumulated scores over the folds.
pred_test_full /= n_splits
pred_test_weight /= n_splits
print(sum(cv_acc_scores)/n_splits)
# Author's experiment log from earlier runs:
# 0.7050553250493787 depth 8; new feature var3 & cancel1_month #560 sec for 720 rounds 5 splits #0.7057120537 LB
# 0.70577675 depth 8; new feature var3 & cancel1_month #560 sec for 720 rounds 10 splits #0.705574192 LB
# 0.7060730796414302

In [None]:
# Convert the fold-averaged class scores into class indices:
# `predictions` from the plain average, `w_preds` from the accuracy-weighted one.
predictions = list(np.argmax(pred_test_full, axis=1))
w_preds = list(np.argmax(pred_test_weight, axis=1))

In [None]:
# Submission from the plain fold average; class indices are shifted back by one.
my_sub = pd.read_csv('../input/janatahack-mobility-analysis/sample_submission.csv')
my_sub['Surge_Pricing_Type'] = np.asarray(predictions) + 1.0
my_sub.to_csv('lgb_cv_submission.csv', index=False)


In [None]:
# Submission from the accuracy-weighted fold average; indices shifted back by one.
my_sub = pd.read_csv('../input/janatahack-mobility-analysis/sample_submission.csv')
my_sub['Surge_Pricing_Type'] = np.asarray(w_preds) + 1.0
my_sub.to_csv('lgb_weight_cv_submission.csv', index=False)

### Learning which features were instrumental in training the above model.

In [None]:
# Rank features by how often they were used for a split. `model` is whichever
# booster was trained most recently (the last CV fold if the loop above ran).
a = model.feature_importance(importance_type='split')
feature = pd.DataFrame({0: model.feature_name(), 'impo': a}).sort_values(
    by=['impo'], ascending=False
)
feature.head(30)

In [None]:
# Show the last rows of the most recently written submission frame.
my_sub.tail()