In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GroupKFold, GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.metrics import *

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import sys, os
import random 

# Suppress every warning category for the rest of the session, but only when
# the interpreter was started without explicit warning options
# (sys.warnoptions is empty), so a user-supplied -W setting still wins.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from IPython.display import display


def set_seed(seed=2121):
    """Seed the random number sources used by this notebook.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's global generator with it.

    Parameters
    ----------
    seed : int, default 2121
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


set_seed()

In [None]:
# Read the two competition files into the notebook-level frames.
train = pd.read_csv('../input/taxi-pricing-with-mobility-analytics/sigma_cabs.csv')
test = pd.read_csv('../input/taxi-pricing-with-mobility-analytics/test.csv')

# Preview the first rows of each frame, then report both shapes
# (train via print, test as the cell's output value).
display(train.head(), test.head())
print(train.shape)
test.shape

In [None]:
plt.style.use('fivethirtyeight')
# Class balance of the target. The series is passed as `x=` rather than
# positionally: newer seaborn releases reserve the only positional argument
# for `data`, so the positional form no longer means "count this column".
sns.countplot(x=train.Surge_Pricing_Type)

In [None]:
# Summary statistics restricted to the object-dtype columns of train.
train.describe(include=['O'])

In [None]:
# Column dtypes and non-null counts for train.
train.info()

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# every remaining column as numeric.
cats = [col for col in train.columns if train[col].dtype == 'object']
nums = [col for col in train.columns if col not in cats]
cats

In [None]:
# Show the list of non-object (numeric) column names.
nums

### Missing data

In [None]:
# Number of missing values per column, keeping only columns that have at
# least one and ordering them from fewest to most missing.
missingTr = train.isnull().sum().loc[lambda counts: counts > 0].sort_values()
missingTs = test.isnull().sum().loc[lambda counts: counts > 0].sort_values()

plt.style.use('fivethirtyeight')
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# One bar chart per frame: train on the left, test on the right.
for ax, counts, title in zip(axes, (missingTr, missingTs), ('train', 'test')):
    counts.plot.bar(color='k', ax=ax)
    ax.set_title(title)

## **EDA**

In [None]:
sns.set(font_scale=2.1)
# Long format (variable, value) so that each numeric column gets its own facet.
int_flat = pd.melt(train, value_vars=nums)
g = sns.FacetGrid(int_flat, col='variable', col_wrap=4, sharex=False, sharey=False, height=10, aspect=1.2)
# Histogram + KDE per column with a fitted normal curve overlaid.
g = g.map(sns.distplot, 'value', color='teal', kde=True, fit=norm)
# Rotate the x tick labels on every facet. plt.xticks() acts on the current
# axes only, which would leave all facets but the last one unrotated.
for facet_ax in g.axes.flat:
    facet_ax.tick_params(axis='x', labelrotation=90)
# Set after drawing, so this affects subsequent figures rather than this grid.
plt.style.use('fivethirtyeight')
plt.show()

In [None]:
plt.style.use('seaborn-talk')

# One grid row per feature; the columns are
#   (1) per-class KDE, (2) per-class value counts, (3) train vs. test histogram.
grid_features = ['Customer_Since_Months', 'Customer_Rating', 'Cancellation_Last_1Month']
# Colour used for each Surge_Pricing_Type value in the KDE column.
class_colors = {1: "coral", 2: "teal", 3: "grey"}

fig, axes = plt.subplots(3, 3, figsize=(20, 12))
for feature, (kde_ax, count_ax, dist_ax) in zip(grid_features, axes):
    for surge_type, color in class_colors.items():
        sns.kdeplot(train.loc[train.Surge_Pricing_Type == surge_type, feature],
                    color=color, shade=True, ax=kde_ax)
    # Passed as `x=`: newer seaborn reserves the positional slot for `data`.
    sns.countplot(x=train[feature], hue=train['Surge_Pricing_Type'], palette='coolwarm', ax=count_ax)
    # Train (grey) and test (teal) share one axes to compare their distributions.
    sns.distplot(train[feature], kde=False, fit=norm, color='grey', ax=dist_ax)
    sns.distplot(test[feature], kde=False, fit=norm, color='teal', ax=dist_ax)

In [None]:
plt.style.use('seaborn-talk')

# One grid row per feature; the columns are
#   (1) per-class KDE, (2) per-class value counts, (3) train vs. test histogram.
grid_features = ['Var1', 'Var2', 'Var3']
# Colour used for each Surge_Pricing_Type value in the KDE column.
class_colors = {1: "cyan", 2: "teal", 3: "grey"}

fig, axes = plt.subplots(3, 3, figsize=(20, 12))
for feature, (kde_ax, count_ax, dist_ax) in zip(grid_features, axes):
    for surge_type, color in class_colors.items():
        sns.kdeplot(train.loc[train.Surge_Pricing_Type == surge_type, feature],
                    color=color, shade=True, ax=kde_ax)
    # Passed as `x=`: newer seaborn reserves the positional slot for `data`.
    sns.countplot(x=train[feature], hue=train['Surge_Pricing_Type'], palette='bone', ax=count_ax)
    # Train (grey) and test (teal) share one axes to compare their distributions.
    sns.distplot(train[feature], kde=False, fit=norm, color='grey', ax=dist_ax)
    sns.distplot(test[feature], kde=False, fit=norm, color='teal', ax=dist_ax)

In [None]:
# Trip_ID is an identifier, not a feature. Filtering (instead of list.remove)
# keeps this cell safe to re-run: remove() raises once the entry is gone.
cats = [c for c in cats if c != 'Trip_ID']


def analyse_cats(df, cat_cols, compare_df=None):
    """Summarise categorical columns and plot their value counts.

    For every column in ``cat_cols`` a two-panel figure is drawn: value counts
    in ``df`` (titled 'train') next to value counts in ``compare_df``
    (titled 'test').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is summarised and drawn in the left panel.
    cat_cols : iterable of str
        Column names to analyse; each must exist in both frames.
    compare_df : pandas.DataFrame, optional
        Frame drawn in the right panel. When omitted, the notebook-level
        ``test`` frame is used, which keeps two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        One row per column with its name, the array of unique values, the
        number of unique values (NaN counted as a value) and the NaN count.
    """
    if compare_df is None:
        compare_df = test

    rows = []
    for c in cat_cols:
        uniques = df[c].unique()
        rows.append({
            # NOTE(review): the embedded double quotes in this column name look
            # accidental; left untouched because callers may index by it.
            '"feat"': c,
            'uniques': uniques,
            'cardinality': uniques.size,
            'nans': df[c].isnull().sum(),
        })
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
        # `x=` keyword: newer seaborn reserves the positional slot for `data`.
        sns.countplot(x=df[c], ax=axes[0], label='train', palette='Set2')
        sns.countplot(x=compare_df[c], ax=axes[1], label='test', palette='Set2')
        axes[0].set_title('train')
        axes[1].set_title('test')
    return pd.DataFrame(rows, columns=['"feat"', 'uniques', 'cardinality', 'nans'])


sns.set()
plt.style.use('seaborn')
catanadf = analyse_cats(train, cats, compare_df=test)
catanadf

In [None]:
sns.set()
plt.style.use('seaborn-poster')
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))

# Life_Style_Index distribution per frame, each with a fitted normal overlaid.
panels = (('train', train, 'teal'), ('test', test, 'darkred'))
for ax, (panel_title, frame, colour) in zip(axes, panels):
    sns.distplot(frame.Life_Style_Index, bins=50, fit=norm, kde=True, color=colour, ax=ax)
    ax.set_title(panel_title)

In [None]:

# Integer-encode every categorical column. Each encoder is fitted on the
# string form of the train and test values together, so both frames share one
# code table (missing values are encoded via their string representation).
for c in cats:
    train_as_str = train[c].astype(str)
    test_as_str = test[c].astype(str)
    le = LabelEncoder().fit(train_as_str.tolist() + test_as_str.tolist())
    train[c] = le.transform(train_as_str)
    test[c] = le.transform(test_as_str)
    # Original category -> integer code, printed for reference.
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print('target mapping :  ',c ,  le_name_mapping)


In [None]:
plt.style.use('fivethirtyeight')
# Bar height = mean Surge_Pricing_Type per Confidence_Life_Style_Index level,
# split by Gender.
sns.catplot(
    data=train,
    x="Confidence_Life_Style_Index",
    y="Surge_Pricing_Type",
    hue="Gender",
    kind="bar",
    aspect=2.5,
)

In [None]:
# Horizontal box plots of the encoded categorical columns.
# catplot is figure-level and always creates its own figure, so a preceding
# plt.figure() call would only add an empty extra figure to the output.
sns.catplot(data=train[cats], orient="h", kind="box", aspect=2)
plt.show()

In [None]:
# Horizontal box plots of Var1, Var2 and Var3.
# catplot is figure-level and always creates its own figure, so a preceding
# plt.figure() call would only add an empty extra figure to the output.
sns.catplot(data=train[['Var1', 'Var2', 'Var3']], orient="h", kind="box", aspect=2)
plt.show()

In [None]:
plt.style.use('seaborn')
# One row of horizontal box plots per Confidence_Life_Style_Index value.
g = sns.catplot(
    data=train,
    x="Type_of_Cab",
    y="Surge_Pricing_Type",
    row="Confidence_Life_Style_Index",
    kind="box",
    orient="h",
    height=2,
    aspect=4,
)
# NOTE(review): Type_of_Cab presumably holds label-encoded integer codes at
# this point (including 0), which a log x-axis cannot represent - confirm that
# the log scale is intended.
g.set(xscale="log")

In [None]:
# Customer_Rating vs. Life_Style_Index; colour = Type_of_Cab, size = Trip_Distance.
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=train,
    x="Customer_Rating",
    y="Life_Style_Index",
    hue="Type_of_Cab",
    size="Trip_Distance",
    palette='vlag',
)

In [None]:
# Trip_Distance vs. Life_Style_Index; colour = Type_of_Cab, size = Customer_Rating.
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=train,
    x="Trip_Distance",
    y="Life_Style_Index",
    hue="Type_of_Cab",
    size="Customer_Rating",
    palette='BuGn_r',
)

In [None]:
# Cancellation_Last_1Month vs. Life_Style_Index;
# colour = Confidence_Life_Style_Index, size = Customer_Rating.
plt.figure(figsize=(12, 5))
sns.scatterplot(
    data=train,
    x="Cancellation_Last_1Month",
    y="Life_Style_Index",
    hue="Confidence_Life_Style_Index",
    size="Customer_Rating",
    palette='vlag',
)

In [None]:
# Var1 vs. Life_Style_Index; colour = Confidence_Life_Style_Index, size = Customer_Rating.
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=train,
    x="Var1",
    y="Life_Style_Index",
    hue="Confidence_Life_Style_Index",
    size="Customer_Rating",
    palette='BuGn_r',
)

In [None]:
# Var1 vs. Var2; colour = Confidence_Life_Style_Index, size = Customer_Rating.
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=train,
    x="Var1",
    y="Var2",
    hue="Confidence_Life_Style_Index",
    size="Customer_Rating",
    palette='bone',
)

In [None]:
# Pairwise correlations of the numeric columns.
plt.figure(figsize=(15, 10))
# `center` is the data value at which the colormap is centred. The previous
# `center=True` was coerced to 1.0; correlations are symmetric around 0.
sns.heatmap(train[nums].corr(), annot=True, center=0)

In [None]:
# Separate the label from the features and shift it from {1, 2, 3} to
# {0, 1, 2}. The guard makes the cell safe to re-run: once the column has been
# removed, `target` from the first run is kept as is.
if 'Surge_Pricing_Type' in train.columns:
    target = train['Surge_Pricing_Type'].map({1: 0, 2: 1, 3: 2})
    # Series.map turns any label outside the mapping into NaN; fail here with a
    # clear message instead of later inside the CV splitter or the model.
    if target.isnull().any():
        raise ValueError('Surge_Pricing_Type contains values other than 1, 2, 3')

# Remove the label and the row identifier from the feature frames.
# errors='ignore' keeps this a no-op when the columns are already gone.
train = train.drop(columns=['Surge_Pricing_Type', 'Trip_ID'], errors='ignore')
test = test.drop(columns=['Trip_ID'], errors='ignore')

In [None]:
# LightGBM multiclass configuration; `num_class` follows the encoded target.
params = {
    'objective': 'multiclass',
    'boosting': 'gbdt',
    'metric': 'multi_logloss',
    'max_depth': -1,
    'num_leaves': 20,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l2': 2.0,
    'lambda_l1': 2.0,
    'num_class': len(np.unique(target)),
}

import lightgbm as lgb

# Micro-F1 of each validation fold.
scores = []

# Out-of-fold predicted class label for every training row.
oof = np.zeros(len(train))
# Fold-averaged class probabilities for the test rows. Probabilities are
# averaged (not hard labels): the mean of labels such as 0, 2, 2 rounds to
# class 1, which no fold actually predicted.
preds_lgb_proba = np.zeros((len(test), params['num_class']))

feature_importances_gain = pd.DataFrame()
feature_importances_gain['feature'] = train.columns

feature_importances_split = pd.DataFrame()
feature_importances_split['feature'] = train.columns

folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=4242)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target)):
    print("fold :::::::: " , fold_)
    trn_data = lgb.Dataset(train.iloc[train_ind], target.iloc[train_ind])
    val_data = lgb.Dataset(train.iloc[val_ind], target.iloc[val_ind])

    # Up to 1000 rounds, stopping once the validation loss has not improved
    # for 100 consecutive rounds.
    model = lgb.train(params, trn_data, valid_sets=(trn_data, val_data), num_boost_round=1000, verbose_eval=100, early_stopping_rounds=100)

    # predict() yields one probability column per class; the label is the argmax.
    val_proba = model.predict(train.iloc[val_ind], num_iteration=model.best_iteration)
    oof[val_ind] = np.argmax(val_proba, axis=1)

    fold_f1 = f1_score(target.iloc[val_ind], oof[val_ind], average='micro')
    print('f1 :', fold_f1)
    scores.append(fold_f1)

    feature_importances_gain['fold_{}'.format(fold_ + 1)] = model.feature_importance(importance_type='gain')
    feature_importances_split['fold_{}'.format(fold_ + 1)] = model.feature_importance(importance_type='split')

    preds_lgb_proba += model.predict(test, num_iteration=model.best_iteration) / folds.n_splits

# Final test labels in the encoded 0..num_class-1 space, kept as floats as before.
preds_lgb = np.argmax(preds_lgb_proba, axis=1).astype(float)
print('f1 micro mean ---->',np.mean(scores))


In [None]:
# Distinct values present in the LightGBM test predictions.
np.unique(preds_lgb)

In [None]:
# Histogram of the LightGBM test predictions.
plt.figure(figsize=(10, 5))
predicted_labels = pd.Series(preds_lgb)
predicted_labels.hist(color='teal')

In [None]:
# Average the per-fold gain importances and persist the table.
fold_columns = ['fold_{}'.format(fold + 1) for fold in range(folds.n_splits)]
feature_importances_gain['average'] = feature_importances_gain[fold_columns].mean(axis=1)
feature_importances_gain.to_csv('feature_importances.csv')

# Plot at most the 100 features with the highest average gain.
top_by_gain = feature_importances_gain.sort_values(by='average', ascending=False).head(100)
plt.figure(figsize=(15, 10))
sns.barplot(data=top_by_gain, color='teal', x='average', y='feature');
plt.title('TOP feature importance over {} folds average'.format(folds.n_splits));

In [None]:
target_names = target.unique()


def plot_cm(y_true, y_pred, title):
    """Draw an annotated confusion-matrix heatmap.

    Off-diagonal cells show the row percentage and the count (blank when the
    count is zero); diagonal cells show the row percentage and ``count/total``
    for that actual class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Its unique values define the rows/columns.
    y_pred : array-like
        Predicted labels; cast to int before comparison.
    title : str
        Figure title.

    Returns
    -------
    None
    """
    figsize = (12, 8)
    labels = np.unique(y_true)
    y_pred = np.asarray(y_pred).astype(int)

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Every label comes from y_true, so each row total is at least 1.
    row_totals = cm.sum(axis=1)
    cm_perc = cm / row_totals[:, None].astype(float) * 100

    # Object dtype: a fixed-width string array would silently truncate long
    # annotations.
    annot = np.empty(cm.shape, dtype=object)
    for (i, j), count in np.ndenumerate(cm):
        pct = cm_perc[i, j]
        if i == j:
            # row_totals[i] is a scalar, so %d needs no array-to-int conversion.
            annot[i, j] = '%.1f%%\n%d/%d' % (pct, count, row_totals[i])
        elif count == 0:
            annot[i, j] = ''
        else:
            annot[i, j] = '%.1f%%\n%d' % (pct, count)

    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    plt.title(title)
    sns.heatmap(cm, cmap='vlag',  annot=annot, fmt='', ax=ax)


sns.set(font_scale=1.2)
plot_cm(target, oof, 'cm')

## XGBoost 

In [None]:

xgb_params = {
    # softprob returns one probability per class, so fold predictions can be
    # averaged before choosing a label. softmax returns hard labels, whose
    # average across folds is a fractional value that is not a class.
    'objective': 'multi:softprob',
    'max_depth': 5,
    'learning_rate': 0.1,
    'booster': 'gbtree',
    'eval_metric': 'mlogloss',
    'max_leaves': 20,
    'colsample_bytree': 0.8,
    'num_class': len(np.unique(target)),
    'subsample': 0.8,
    'lambda': 2,
    'alpha': 1.2,
}

# Micro-F1 of each validation fold.
xgb_scores = []

# Out-of-fold predicted class label for every training row.
oof_xgb = np.zeros(len(train))
# Fold-averaged class probabilities for the test rows.
pred_xgb_proba = np.zeros((len(test), xgb_params['num_class']))

# Per-fold gain tables, concatenated once after the loop.
importance_frames = []

folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=4242)

# The test matrix is identical for every fold; build it once.
test_dmatrix = xgb.DMatrix(test)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target)):
    print('fold : ----------------------------------------', fold_)
    trn_data = xgb.DMatrix(data=train.iloc[train_ind], label=target.iloc[train_ind])
    val_data = xgb.DMatrix(data=train.iloc[val_ind], label=target.iloc[val_ind])

    # Up to 3000 rounds, stopping once the last eval set ('test', i.e. the
    # validation fold) has not improved for 100 consecutive rounds.
    xgb_model = xgb.train(xgb_params, trn_data, num_boost_round=3000, evals=[(trn_data, 'train'), (val_data, 'test')], verbose_eval=100, early_stopping_rounds=100)

    val_proba = xgb_model.predict(val_data, ntree_limit=xgb_model.best_ntree_limit)
    oof_xgb[val_ind] = np.argmax(val_proba, axis=1)

    fold_f1 = f1_score(target.iloc[val_ind], oof_xgb[val_ind], average='micro')
    print(fold_f1)
    xgb_scores.append(fold_f1)

    importance_score = xgb_model.get_score(importance_type='gain')
    importance_frame = pd.DataFrame({'Importance': list(importance_score.values()), 'Feature': list(importance_score.keys())})
    importance_frame['fold'] = fold_ + 1
    importance_frames.append(importance_frame)

    pred_xgb_proba += xgb_model.predict(test_dmatrix, ntree_limit=xgb_model.best_ntree_limit) / folds.n_splits

importances = pd.concat(importance_frames, axis=0, sort=False)
# Final test labels in the encoded 0..num_class-1 space, kept as floats.
pred_xgb = np.argmax(pred_xgb_proba, axis=1).astype(float)

print('model f1:------------------>', np.mean(xgb_scores))

In [None]:

# Mean gain per feature across the CV folds, as a flat two-column frame.
mean_gain = (
    importances[['Importance', 'Feature']]
    .groupby('Feature')
    .mean()
    .reset_index()
)

# Horizontal bars, most important feature first.
plt.figure(figsize=(17, 10))
sns.barplot(
    data=mean_gain.sort_values('Importance', ascending=False),
    x='Importance',
    y='Feature',
    palette='bone',
)
