In [None]:
#================================================================#
### Installing relevant libraries ###
#================================================================#

!pip install category_encoders

In [None]:
#================================================================#
### Importing all the relevant libraries ###
#================================================================#

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, ShuffleSplit, cross_val_score

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.metrics import roc_curve, plot_roc_curve, auc, classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance

import scipy.stats as stats
from imblearn.over_sampling import RandomOverSampler

from sklearn.ensemble import RandomForestClassifier


In [None]:
#================================================================#
### Reading Train Dataset ###
#================================================================#

data = pd.read_csv('../input/cat-in-the-dat/train.csv')

print(data.shape)

data.head(2)

In [None]:
#================================================================#
### Train Dataset Distribution ###
#================================================================#

data.describe(include='all')

# From Nom 5 to Nom 9, we notice that the variables have high cardinality
# Similar is the case with Ord 5

In [None]:
#================================================================#
### Reading Test Dataset ###
#================================================================#
test = pd.read_csv('../input/cat-in-the-dat/test.csv')

print(test.shape)

test.head(2)

In [None]:
test.describe(include='all')

# Exploratory Data analysis

In [None]:
#================================================================#
### Distribution of Target Variable ###
#================================================================#

data['target'].value_counts()

In [None]:
#===========================================================================#
### Bins vs Target Variable ###
#===========================================================================#

# One panel per binary feature; smallPlots draws into the module-level `ax`.
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(20,5))

for panel in range(5):
  smallPlots(f'bin_{panel}', f'Bin {panel}', panel)

fig.tight_layout()
plt.show()

In [None]:
#===========================================================================#
### Noms vs Target Variable ###
#===========================================================================#

# One panel per low-cardinality nominal feature (nom_0 .. nom_4).
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(25,5))

for panel in range(5):
  smallPlots(f'nom_{panel}', f'Nom {panel}', panel)

fig.tight_layout()
plt.show()

In [None]:
#================================================================#
### Nom 5's Pareto chart ###
#================================================================#

# Nom 5 had high cardinality, first logical check is to see if the data
# distribution is skewed or not, if skewed then we can replace less frequent entries
# In this case the data is distributed linearly i.e. all entries in Nom 5 have 
# almost equal # of IDs corresponding to them

paretoPlots('nom_5', 'id')

In [None]:
#================================================================#
### Nom 6's Pareto chart ###
#================================================================#

# Nom 6 had high cardinality, first logical check is to see if the data
# distribution is skewed or not, if skewed then we can replace less frequent entries
# In this case the data is distributed linearly i.e. all entries in Nom 6 have 
# almost equal # of IDs corresponding to them

paretoPlots('nom_6', 'id')

In [None]:
#================================================================#
### Nom 7's Pareto chart ###
#================================================================#

# Nom 7 had high cardinality, first logical check is to see if the data
# distribution is skewed or not, if skewed then we can replace less frequent entries
# In this case the data is distributed linearly i.e. all entries in Nom 7 have 
# almost equal # of IDs corresponding to them

paretoPlots('nom_7', 'id')

In [None]:
#================================================================#
### Nom 8's Pareto chart ###
#================================================================#

# Nom 8 had high cardinality, first logical check is to see if the data
# distribution is skewed or not, if skewed then we can replace less frequent entries
# In this case the data is distributed linearly i.e. all entries in Nom 8 have 
# almost equal # of IDs corresponding to them

paretoPlots('nom_8', 'id')

In [None]:
#===========================================================================#
### Ord vs Target Variable ###
#===========================================================================#

# One panel per ordinal feature ord_0 .. ord_2.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

for panel in range(3):
  smallPlots(f'ord_{panel}', f'Ord {panel}', panel)

fig.tight_layout()
plt.show()

In [None]:
#===========================================================================#
### Ord vs Target Variable ###
#===========================================================================#

# Two stacked panels: ord_3 on top, ord_4 below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,10))

for row, (feature, label) in enumerate([('ord_3', 'Ord 3'), ('ord_4', 'Ord 4')]):
  smallPlots(feature, label, row)

fig.tight_layout()
plt.show()

In [None]:
#================================================================#
### Ord 5's Pareto chart ###
#================================================================#

# Ord 5 had high cardinality, first logical check is to see if the data
# distribution is skewed or not, if skewed then we can replace less frequent entries
# In this case the data is distributed linearly i.e. all entries in Ord 5 have 
# almost equal # of IDs corresponding to them

paretoPlots('ord_5', 'id')

In [None]:
#===========================================================================#
### Day and Month vs Target Variable ###
#===========================================================================#

# Two stacked panels: day on top, month below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,10))

for row, (feature, label) in enumerate([('day', 'Day'), ('month', 'Month')]):
  smallPlots(feature, label, row)

fig.tight_layout()
plt.show()

# Encoding

There are 3 approaches we can take:

**Count Encoding:**

In count encoding, each categorical value is replaced by its frequency in the dataset. For example, if the entry 'Green' occurs 100 times in column nom_0, then every 'Green' in nom_0 is replaced by 100.

This is done for every specified column, i.e. each categorical entry is replaced by its count within that column.

**Target Encoding:**

Target encoding is similar to count encoding but involves the *target variable*: each category is replaced by the (smoothed) mean of the target over the rows that have that category. Say that in column nom_5 the value 'Green' occurs 100 times, 60 of them with target = 1 and 40 with target = 0; then every 'Green' in nom_5 is replaced by roughly 60/100 = 0.6, regardless of that row's own target value.

**One Hot Encoding**

One-hot encoding (OHE) is the most common and widely used approach, but it suffers with high cardinality: here *nom_9* has more than 11K distinct values, so OHE would create more than 11K columns, which can exhaust memory and crash the system.

OHE can be combined with either count or target encoding: use count or target encoding for the high-cardinality variables and OHE for the low-cardinality ones.

In [None]:
#===========================================================================#
### Running Count encoding on Dataset ###
#===========================================================================#

# Cast every training column to string so that numeric-looking features are
# count-encoded as categories as well.
data_copy = data.copy()
data_copy = data_copy.astype('str')

# Create the encoder
count_enc_all = ce.CountEncoder()

# Fit on every column except the first and the last one
# (presumably `id` and `target` -- confirm against the CSV layout),
# then turn raw counts into proportions of the training rows.
count_encoded_all = count_enc_all.fit_transform(data_copy.iloc[:, 1:-1])
n_train_rows = count_encoded_all.shape[0]
count_encoded_all = count_encoded_all/n_train_rows

# The test set must go through exactly the same preparation as the training
# set: cast to string (the encoder was fitted on string values) and scale by
# the same denominator. Without this, train and test features are not on the
# same scale.
count_test = count_enc_all.transform(test.iloc[:, 1:].astype('str'))
count_test = count_test/n_train_rows

print(count_encoded_all.shape)
print(count_test.shape)

count_encoded_all.head(3)

In [None]:
#===========================================================================#
### Running Target Encoding on Dataset ###
#===========================================================================#

data_copy = data.copy()

# The 23 feature columns to encode: bin_0-4, nom_0-9, ord_0-5, day, month.
cat_features = (
    [f'bin_{k}' for k in range(5)]
    + [f'nom_{k}' for k in range(10)]
    + [f'ord_{k}' for k in range(6)]
    + ['day', 'month']
)

# Fit the encoder on the training features against the training target.
target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(data_copy[cat_features], data_copy['target'])

# Apply the fitted encoder to both the training and the test features.
train_TE = target_enc.transform(data_copy[cat_features])
test_TE = target_enc.transform(test[cat_features])

print(train_TE.shape)

train_TE.head()

In [None]:
#===========================================================================#
### Running Count encoding and OHE on Dataset ###
#===========================================================================#

# ord_0, day and month are cast to strings before one-hot encoding.
data_copy = data.copy().astype({'ord_0': 'str', 'day': 'str', 'month': 'str'})

# High-cardinality columns -> count encoding.
col_list_cat = [f'nom_{k}' for k in range(5, 10)] + ['ord_5']

# Low-cardinality columns -> one-hot encoding.
col_list_oh = (
    [f'bin_{k}' for k in range(5)]
    + [f'nom_{k}' for k in range(5)]
    + [f'ord_{k}' for k in range(5)]
    + ['day', 'month']
)

# Count-encode, then express each count as a share of the training rows.
count_enc = ce.CountEncoder()
raw_counts = count_enc.fit_transform(data_copy[col_list_cat])
count_encoded = raw_counts/raw_counts.shape[0]

oh_encoded = pd.get_dummies(data_copy[col_list_oh])

# Final training frame: dummies, count shares, and the target as last column.
data_enc = pd.concat([oh_encoded, count_encoded, data_copy['target']], axis=1)

print(data_enc.shape)

data_enc.head(2)

In [None]:
#===========================================================================#
### Running Count encoding and OHE on Dataset ###
#===========================================================================#

test_copy = test.copy()

# Same string casts as applied to the training frame before encoding.
test_copy['ord_0'] = test_copy['ord_0'].astype('str')
test_copy['day'] = test_copy['day'].astype('str')
test_copy['month'] = test_copy['month'].astype('str')

# Re-use the encoder fitted on the training data and scale by the number of
# training rows, matching how the training counts were normalised.
count_encoded_test = count_enc.transform(test_copy[col_list_cat])
count_encoded_test = count_encoded_test/count_encoded.shape[0]

oh_encoded_test = pd.get_dummies(test_copy[col_list_oh])

# get_dummies only creates columns for levels present in the frame it is given,
# so align the test dummies to the training dummy columns: levels missing from
# test become all-zero columns, levels unseen in training are dropped, and the
# column order matches what the model was trained on.
oh_encoded_test = oh_encoded_test.reindex(columns=oh_encoded.columns, fill_value=0)

data_enc_test = pd.concat([oh_encoded_test, count_encoded_test], axis=1)

print(data_enc_test.shape)

data_enc_test.head(2)

In [None]:
#===========================================================================#
### Eliminating variables using Chisq ###
#===========================================================================#

# sklearn's chi2 returns a pair: (chi2 statistics, p-values), one per column.
chi2_output = chi2(oh_encoded, data_copy['target'])
chi2_stats, chi2_pvalues = chi2_output[0], chi2_output[1]

# Columns whose p-value is below the 0.05 threshold.
col_1 = [name for name, p_value in zip(oh_encoded.columns, chi2_pvalues)
         if p_value < 0.05]

# Columns whose chi2 *statistic* is below 0.05.
# NOTE(review): 0.05 is a p-value threshold; comparing the statistic against it
# looks unintended (possibly meant p-value >= 0.05) -- confirm before using col_0.
col_0 = [name for name, statistic in zip(oh_encoded.columns, chi2_stats)
         if statistic < 0.05]


In [None]:
contigency_tab = pd.crosstab(data_enc['nom_0_Green'], data_enc['target']) 
contigency_tab

In [None]:
c, p, dof, expected = stats.chi2_contingency(contigency_tab) 

# Print the p-value
print(p)
print(expected)


# Model

## Target Encoded

In [None]:
# model_tuned, dict_model = model_tuning(train_TE, data['target'])

In [None]:
#===========================================================================#
### Training XG Boost Model ###
#===========================================================================#

modelXgb_te = XGBClassifier(max_depth=12, n_estimators=400, reg_lambda=0.1, tree_method='gpu_hist')

modelXgb_te.fit(train_TE, data['target'])

pred_xgb_train = modelXgb_te.predict(train_TE)

print('### Train Summary ###', '\n')
print(classification_report(data['target'], pred_xgb_train), '\n')
print(confusion_matrix(data['target'], pred_xgb_train), '\n')

In [None]:
#===========================================================================#
### Test Prediction ### ~ 77%
#===========================================================================#

pred_xgb_test = modelXgb_te.predict_proba(test_TE)

test_pred_xgb = pd.DataFrame(pred_xgb_test[:,1], columns=['target'])

test_xgb = test_pred_xgb.copy()
test_xgb['pred'] = np.where(test_xgb['target'] >=0.5,1,0)
test_xgb['pred'].value_counts()

In [None]:
test_pred_te_xgb = pd.concat([test['id'], test_pred_xgb], axis=1)
test_pred_te_xgb.to_csv('pred_te_xgb.csv', index=False)

## Only Cat Encoded

In [None]:
#===========================================================================#
### Test Train Split ###
#===========================================================================#

X_train_ce, X_test_ce, y_train_ce, y_test_ce = train_test_split(count_encoded_all, data['target'], test_size=0.01, random_state=12)

ros = RandomOverSampler(random_state=0)
X_resampled_ce, y_resampled_ce = ros.fit_resample(X_train_ce, y_train_ce)

X_resampled_ce = pd.DataFrame(X_resampled_ce, columns= X_train_ce.columns)
print(X_resampled_ce.shape)
X_resampled_ce.head()

In [None]:
# model_tuned, dict_model = model_tuning(count_encoded_all, data['target'])

In [None]:
#===========================================================================#
### Training XG Boost Model ###
#===========================================================================#

modelXgb_ce = XGBClassifier(n_estimators=200, max_depth=12, reg_lambda=0.05, tree_method='gpu_hist')

modelXgb_ce.fit(X_resampled_ce[col_list_oh], y_resampled_ce)

pred_xgb_train_ce = modelXgb_ce.predict(X_train_ce[col_list_oh])
pred_xgb_test_ce = modelXgb_ce.predict(X_test_ce[col_list_oh])
pred_xgb_test_ce_f = modelXgb_ce.predict_proba(count_test[col_list_oh])

print('### Train Summary ###', '\n')
print(classification_report(y_train_ce, pred_xgb_train_ce), '\n')
print(confusion_matrix(y_train_ce, pred_xgb_train_ce), '\n')

print('\n', '### Test Summary ###', '\n')
print(classification_report(y_test_ce, pred_xgb_test_ce), '\n')
print(confusion_matrix(y_test_ce, pred_xgb_test_ce), '\n')

# print('\n', '### Provider 2 Summary ###', '\n')
# print(classification_report(y_cat_enc_p2_broad_1, pred_xgb_test_p2_cat_broad_1), '\n')
# # print(confusion_matrix(y_cat_enc_p2_broad_1, pred_xgb_test_p2_cat_broad_1), '\n')

In [None]:
#===========================================================================#
### Plotting an ROC curve to check performance ###
#===========================================================================#

# A ROC curve is traced by sweeping a threshold over continuous scores, so it
# must be computed from the positive-class probabilities. Feeding the hard 0/1
# predictions collapses the curve to a single operating point and gives a
# misleading AUC.
proba_test_ce = modelXgb_ce.predict_proba(X_test_ce[col_list_oh])[:, 1]

false_positive_rate, true_positive_rate, _ = roc_curve(y_test_ce, proba_test_ce)
roc_auc = auc(false_positive_rate, true_positive_rate)

# plot the curve
plt.plot(false_positive_rate, true_positive_rate,
    'b', label='AUC = %0.3f'% roc_auc)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - count-encoded features (hold-out split)')
plt.legend(loc='lower right')
plt.show()

In [None]:
#===========================================================================#
### Identifying Important Features ###
#===========================================================================#

_, ax = plt.subplots(figsize=(8, 8))
plot_importance(modelXgb_ce, max_num_features=10, ax = ax)
plt.show()

In [None]:
#===========================================================================#
### Test Prediction ###
#===========================================================================#

test_pred = pd.DataFrame(pred_xgb_test_ce_f[:,1], columns=['target'])
test_pred.value_counts()

In [None]:
# test_pred_cat = pd.concat([test['id'], test_pred], axis=1)
# test_pred_cat.to_csv('pred_cat_only.csv')

## Cat and OH

In [None]:
#===========================================================================#
### Test Train Split ###
#===========================================================================#

X_train, X_test, y_train, y_test = train_test_split(data_enc.iloc[:,:-1],
                                                    data_enc['target'], test_size=0.05, random_state=12)

### Random Oversampling to Create a balanced Dataset ###

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# model_tuned, dict_model = model_tuning(data_enc.iloc[:,:-1], data_enc['target'])

In [None]:
#===========================================================================#
### Defining and running the Model ###
#===========================================================================#

modelXgb = XGBClassifier(n_estimators=300, max_depth=10, min_child_weight=1,tree_method='gpu_hist')

modelXgb.fit(X_resampled, y_resampled)

pred_xgb_train = modelXgb.predict(X_train)
pred_xgb_test = modelXgb.predict(X_test)
pred_xgb_test_f = modelXgb.predict_proba(data_enc_test) 

print('### Train Summary ###', '\n')
print(classification_report(y_train, pred_xgb_train), '\n')
print(confusion_matrix(y_train, pred_xgb_train), '\n')

print('\n', '### Test Summary ###', '\n')
print(classification_report(y_test, pred_xgb_test), '\n')
print(confusion_matrix(y_test, pred_xgb_test), '\n')

In [None]:
#===========================================================================#
### Plotting an ROC curve to check performance ###
#===========================================================================#

# A ROC curve is traced by sweeping a threshold over continuous scores, so it
# must be computed from the positive-class probabilities. Feeding the hard 0/1
# predictions collapses the curve to a single operating point and gives a
# misleading AUC.
proba_test = modelXgb.predict_proba(X_test)[:, 1]

false_positive_rate, true_positive_rate, _ = roc_curve(y_test, proba_test)
roc_auc = auc(false_positive_rate, true_positive_rate)

# plot the curve
plt.plot(false_positive_rate, true_positive_rate,
    'b', label='AUC = %0.3f'% roc_auc)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - count + one-hot features (hold-out split)')
plt.legend(loc='lower right')
plt.show()

In [None]:
#===========================================================================#
### Identifying Important Features ###
#===========================================================================#

_, ax = plt.subplots(figsize=(8, 8))
plot_importance(modelXgb, max_num_features=10, ax = ax)
plt.show()

In [None]:
#===========================================================================#
### Test Prediction ### 73%
#===========================================================================#

test_pred_oh_ce = pd.DataFrame(pred_xgb_test_f[:,1], columns=['target'])
test_pred_oh_ce = pd.concat([test['id'], test_pred_oh_ce], axis=1)
test_pred_oh_ce.to_csv('test_xgb_oh_ce.csv', index=False)

test_xgb_oh_ce = test_pred_oh_ce.copy()
test_xgb_oh_ce['pred'] = np.where(test_xgb_oh_ce['target'] >=0.5,1,0)
print(test_xgb_oh_ce['pred'].value_counts())

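# Functions

**Note:** the helper functions defined below (`smallPlots`, `paretoPlots`, `model_tuning`) are called by cells earlier in this notebook, so run the cells in this section first.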

In [None]:
#============================================================================#
## Bar Plots
#============================================================================#

def smallPlots(col, title, i, axes=None, df=None):
  """Draw a bar chart of row counts per (col, target) pair on one subplot.

  Parameters
  ----------
  col : str
      Column to group by; shown on the x axis.
  title : str
      Title set on the chosen subplot.
  i : int
      Position of the subplot inside the axes container.
  axes : indexable of matplotlib Axes, optional
      Container to draw into. When omitted, the notebook-level variable `ax`
      (created by the calling cell via `plt.subplots`) is used, which is the
      original behaviour.
  df : pandas.DataFrame, optional
      Frame holding `col`, 'target' and 'id'. When omitted, the notebook-level
      `data` frame is used, which is the original behaviour.

  Returns
  -------
  None
  """
  # Fall back to the notebook globals only when no explicit object is given,
  # so existing three-argument calls keep working unchanged.
  target_axes = ax if axes is None else axes
  frame = data if df is None else df

  # Number of ids for every (col value, target value) combination.
  plot_data = frame.groupby([col, 'target']).agg({'id':'count'}).reset_index()

  sns.barplot(x=plot_data[col],
              y=plot_data['id'],
              hue=plot_data['target'], ax=target_axes[i])

  target_axes[i].set_title(title)

In [None]:
#============================================================================#
## Pareto Analysis: What % of Column == 80% or 90% of Users
#============================================================================#

def paretoPlots(col, agg_col):
  """Plot a Pareto (cumulative share) curve for one column of `data`.

  Categories of `col` are sorted by how many `agg_col` entries they hold
  (largest first). The x axis is the cumulative percentage of categories and
  the y axis is the cumulative percentage of `agg_col` entries they cover.
  A near-straight diagonal means the entries are spread evenly over the
  categories; a steep initial rise means a few categories dominate.

  Parameters
  ----------
  col : str
      Column of the notebook-level `data` frame whose categories are ranked.
  agg_col : str
      Column that is counted within each category (e.g. 'id').

  Returns
  -------
  None
      The figure is shown via `plt.show()`.
  """
  # Count agg_col per category and rank categories from most to least frequent.
  df_pareto = data.groupby([col]).agg({agg_col: 'count'}).reset_index()
  df_pareto = df_pareto.sort_values(by=[agg_col], ascending=False, ignore_index=True)

  # Cumulative share of categories (1/n, 2/n, ...) and of counted entries.
  # Computed directly with cumsum; no dummy grouping key is needed because
  # the whole frame forms a single group.
  n_categories = len(df_pareto)
  category_pct = np.arange(1, n_categories + 1) / n_categories * 100
  cumulative_pct = df_pareto[agg_col].cumsum() / df_pareto[agg_col].sum() * 100

  fig, ax = plt.subplots(ncols = 1, nrows = 1, figsize = (10, 8))

  ax.plot(category_pct, cumulative_pct)

  ax.set_xlabel('% of ' + col)
  ax.set_ylabel('% of ' + agg_col)
  # The title must be set before the figure is shown, otherwise it never renders.
  ax.set_title('Pareto Analysis for ' + col)

  plt.show()

In [None]:
#===========================================================================#
### XG Boost Hyper Parameter Tuning ###
#===========================================================================#

# Search grids for the staged XGBoost tuning; model_tuning walks them in
# numeric order (1 -> 5), one GridSearchCV run per grid.

# Stage 1: tree shape.
parameters_xgb_1 = dict(max_depth=range(7, 11), min_child_weight=range(0, 3))

# Stage 2: minimum split loss.
# NOTE(review): this grid holds a single candidate (0.0), so the stage does not
# actually search anything -- confirm whether a wider range was intended.
parameters_xgb_2 = dict(gamma=[0.0])

# Stage 3: number of boosting rounds (200, 300, 400, 500).
parameters_xgb_3 = dict(n_estimators=range(200, 600, 100))

# Stage 4: L2 regularisation strength.
parameters_xgb_4 = dict(reg_lambda=[0.0, 0.1, 0.2])

# Stage 5: learning rate.
parameters_xgb_5 = dict(learning_rate=[0.01, 0.05, 0.1, 0.2])

def gridSearchFunction(model, model_params, X, y):
    """Run a 3-fold, accuracy-scored grid search and return the fitted search.

    Parameters
    ----------
    model : estimator
        Estimator handed to GridSearchCV.
    model_params : dict
        Parameter grid to search.
    X, y :
        Training features and labels passed to `fit`.

    Returns
    -------
    The object returned by `GridSearchCV.fit` (exposes e.g. `best_params_`).
    """
    # `%%time` is a cell magic and is only valid as the first line of a cell,
    # not inside a function body; time the fit explicitly instead.
    import time

    grid_search_xgb = GridSearchCV(estimator = model,
                           param_grid = model_params,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = 4)
    started = time.perf_counter()
    grid_search = grid_search_xgb.fit(X, y)
    print('Grid search wall time: %.1f s' % (time.perf_counter() - started))
    return grid_search


def model_tuning(X, y, tree_method='gpu_hist'):
    """
    Please Note: This function requires a lot of resources and time to run

    Tune the XGBoost hyperparameters stage by stage with grid search.
    It starts with max_depth and min_child_weight, then moves to gamma,
    n_estimators, reg_lambda and finally learning_rate. After each stage the
    best values found so far are merged into one dict and a fresh classifier
    is built from them for the next stage.

    Parameters
    ----------
    X, y :
        Training features and labels forwarded to gridSearchFunction.
    tree_method : str, optional
        XGBoost tree construction method used at every stage
        (default 'gpu_hist', the value originally hard-coded here).

    Returns
    -------
    (xgModel, dict_)
        An unfitted XGBClassifier configured with the tuned parameters, and
        the dict of tuned parameters.
    """

    param_array = [parameters_xgb_1, parameters_xgb_2, parameters_xgb_3,
                   parameters_xgb_4, parameters_xgb_5]

    xgModel = XGBClassifier(tree_method=tree_method)
    dict_ = {}

    for model_params in param_array:
        grid_search_op = gridSearchFunction(xgModel, model_params, X, y)
        dict_.update(grid_search_op.best_params_)
        print(dict_)
        # Keep tree_method on every rebuilt estimator; previously it was
        # dropped after the first stage, so later stages silently fell back
        # to the library default.
        xgModel = XGBClassifier(**{'tree_method': tree_method, **dict_})

    return xgModel, dict_

In [None]:
def evaluate(y,y_hat,labels, title):
  """Print a classification report and draw a row-normalised confusion heatmap.

  Parameters
  ----------
  y : true labels.
  y_hat : predicted labels.
  labels : names used for both the rows and the columns of the heatmap.
  title : title placed on the heatmap.

  Returns
  -------
  None
  """
  print(classification_report(y,y_hat))

  # Divide each row by its total so every cell is the share of that true class.
  counts = confusion_matrix(y,y_hat)
  shares = counts.astype('float') / counts.sum(axis=1, keepdims=True)

  cmat = pd.DataFrame(shares, index=pd.Index(labels), columns=labels)

  sns.heatmap(cmat,cmap="YlGnBu", annot=True)
  plt.title(title)