In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import feature_selection
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
import collections
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import ParameterGrid
from imblearn.over_sampling import ADASYN

In [None]:
# Load the labelled training data and the unlabelled test data; in both files
# the "ID" column becomes the row index rather than a feature.
train = pd.read_csv(
    "../input/santander-customer-satisfaction/train.csv",
    index_col="ID",
)
test_X = pd.read_csv(
    "../input/santander-customer-satisfaction/test.csv",
    index_col="ID",
)

# DATASET ANALYSIS

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
test_X.info()

All features are numeric

In [None]:
train.isnull().sum().sum()

In [None]:
test_X.isnull().sum().sum()

There is no missing data in either the train or the test set.

In [None]:
# Bar chart of how many training rows fall into each TARGET class,
# drawn with seaborn's "whitegrid" theme.
sns.set_style("whitegrid")
sns.countplot(data=train, x="TARGET")

In [None]:
# Class counts and each class's share (in percent) of all training rows.
# The columns are built explicitly: from pandas 2.0 onwards the Series returned
# by value_counts() is named "count" rather than "TARGET", so wrapping it in a
# DataFrame and then reading df['TARGET'] raises KeyError.
target_counts = train["TARGET"].value_counts()
df = pd.DataFrame({
    "TARGET": target_counts,
    "percentage": 100 * target_counts / train.shape[0],
})
df

Huge Class Imbalance as shown above

For detailed EDA please see https://www.kaggle.com/cast42/exploring-features#Clusters

# DATA CLEANING

In [None]:
# -999999 in var3 marks an unknown value; replace it with 2, which the original
# note describes as the most common var3 value. Applied to both data sets.
train["var3"] = train["var3"].replace(-999999, 2)
test_X["var3"] = test_X["var3"].replace(-999999, 2)

In [None]:
train.loc[train.var3==-999999].shape

In [None]:
test_X.loc[test_X.var3==-999999].shape

Validation & Train set Split

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed seed for
# repeatability). TARGET is heavily imbalanced, so the split is stratified on it
# to give the training and validation sets the same positive-class rate.
train_X, val_X, train_y, val_y = train_test_split(
    train.drop(columns=["TARGET"]),
    train["TARGET"],
    test_size=0.2,
    random_state=1,
    stratify=train["TARGET"],
)

*Feature Variance Analysis*

In [None]:
# Use a variance threshold (0.01) to remove both constant and quasi-constant
# features. The selector is only constructed here, not yet fitted.
selector = VarianceThreshold(threshold=0.01)

In [None]:
selector.fit(train_X)

In [None]:
constArr=selector.get_support()

In [None]:
# constArr flags the columns the selector keeps, so the columns to discard are
# exactly the ones where the flag is False.
constCol = list(train_X.columns[~constArr])

In [None]:
# constant features
constCol

In [None]:
#check
train_X.ind_var2_0.unique()

In [None]:
# Remove the columns listed in constCol from the train, test and validation
# frames, so that all three keep the same feature set.
train_X = train_X.drop(columns=constCol)
test_X = test_X.drop(columns=constCol)
val_X = val_X.drop(columns=constCol)

In [None]:
# Shapes of the train, test and validation frames, one per line, in that order.
print(train_X.shape, test_X.shape, val_X.shape, sep="\n")

*Remove Duplicate features*

In [None]:
def duplicateColumns(data):
    """Find pairs of columns in ``data`` that hold identical contents.

    Each column is compared, via ``Series.equals``, with every column that
    appears to its right in the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are compared. Column labels must be strings,
        because each pair is reported as a concatenated string.

    Returns
    -------
    list of str
        One ``"left,right"`` entry per matching pair, where ``left`` is the
        earlier column and ``right`` the later one, in left-to-right scan order.
    """
    names = list(data.columns)
    matches = []
    for pos, left in enumerate(names):
        left_values = data[left]
        # Only look at columns to the right so each pair is reported once.
        matches.extend(
            left + ',' + right
            for right in names[pos + 1:]
            if left_values.equals(data[right])
        )
    return matches

In [None]:
# Find identical column pairs in the training frame. Each entry reads
# "first,second"; the second column of every pair is collected for removal.
dupCol = duplicateColumns(train_X)
dCols = [pair.split(',')[1] for pair in dupCol]
dCols

In [None]:
dupCol

Deleting either column of each duplicate pair would be fine; we drop the second column of each pair and keep the first.

In [None]:
dCols=list(set(dCols))

In [None]:
# Remove the duplicated columns listed in dCols from all three frames.
train_X = train_X.drop(columns=dCols)
val_X = val_X.drop(columns=dCols)
test_X = test_X.drop(columns=dCols)

*Removing Features that are highly correlated to each other*

In [None]:
def correlation(dataset, threshold):
    """Select columns to drop so that no retained pair is too strongly correlated.

    Columns are scanned left to right. A column is marked for removal when the
    absolute value of its pairwise correlation (``DataFrame.corr`` with its
    default settings) with an earlier column *that is still being kept*
    exceeds ``threshold``.

    Columns already marked for removal are not used as comparison partners.
    Otherwise a chain such as A~B~C (A and C only weakly related) would drop
    both B and C, even though removing B alone already breaks every strongly
    correlated pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        A column is flagged when an absolute correlation is strictly greater
        than this value.

    Returns
    -------
    set
        Labels of the columns to drop. NaN correlations (for example from a
        constant column) never exceed the threshold, so they never flag a column.
    """
    corr_matrix = dataset.corr().abs()
    columns = corr_matrix.columns
    # Read the matrix once as an array instead of one .iloc lookup per cell.
    strength = corr_matrix.to_numpy()

    col_corr = set()
    for i in range(len(columns)):
        for j in range(i):
            if columns[j] in col_corr:
                # The partner is being removed anyway, so it cannot make
                # column i redundant.
                continue
            if strength[i, j] > threshold:
                col_corr.add(columns[i])
                break  # one strong kept partner is enough to flag column i
    return col_corr

In [None]:
corrCol=list(correlation(train_X,0.8))

In [None]:
len(corrCol)

In [None]:
# Remove the highly correlated columns listed in corrCol from all three frames.
train_X = train_X.drop(columns=corrCol)
val_X = val_X.drop(columns=corrCol)
test_X = test_X.drop(columns=corrCol)

We have finished using filter method to select features

*Scale the data*

In [None]:
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to the train, validation and test features.
scaler.fit(train_X)
train_sca_X = scaler.transform(train_X)
val_sca_X = scaler.transform(val_X)
test_sca_X = scaler.transform(test_X)

# Oversample Data

*oversample data with smote*

In [None]:
pd.DataFrame(train_sca_X,columns=train_X.columns,index=train_X.index)

In [None]:
# Oversample the scaled training rows with SMOTE (fixed seed). Only the
# training data is resampled; the validation and test sets are not passed in.
sm = SMOTE(random_state=42)
train_res_X, train_res_y = sm.fit_resample(X=train_sca_X, y=train_y)

In [None]:
train_res_y.value_counts()

# Modelling with xgboost

In [None]:
baseline_xgb_clf = XGBClassifier(random_state=20)

In [None]:
# Train the baseline model on the oversampled data, monitoring AUC on the
# validation set and stopping once it has not improved for 20 rounds.
# early_stopping_rounds / eval_metric are configured on the estimator: passing
# them to fit() is deprecated since xgboost 1.6 and rejected by newer 2.x
# releases. (Setting them this way requires xgboost >= 1.6.)
# NOTE(review): the validation set also drives early stopping, so the AUC later
# measured on it is an optimistic estimate.
baseline_xgb_clf.set_params(early_stopping_rounds=20, eval_metric="auc")
baseline_xgb_clf.fit(train_res_X, train_res_y, eval_set=[(val_sca_X, val_y)])

In [None]:
pred_y = baseline_xgb_clf.predict_proba(val_sca_X)[:,1]

In [None]:
roc_auc_score(val_y,pred_y)

The Above Model has performed relatively well. Let's see if we could improve results by hyperparameter tuning

# Hyperparameter Tuning

Using a grid search, scored by ROC AUC on the hold-out validation set, to find the best hyperparameters

In [None]:
# param_grid = {"learning_rate"    : [0.05, 0.10] ,
#  "max_depth"        : [5, 6, 8, 10],
#  "min_child_weight" : [ 1, 3, 5],
#  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4],
#  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

In [None]:
xgb_clf=XGBClassifier()

In [None]:
# best_score=0
# for g in ParameterGrid(param_grid):
#      xgb_clf.set_params(**g)
#      xgb_clf.fit(train_res_X,train_res_y,early_stopping_rounds=20,eval_metric="auc",eval_set=[(val_sca_X, val_y)])
#      pred_y = xgb_clf.predict_proba(val_sca_X)[:,1]
#      score=roc_auc_score(val_y,pred_y)
#      if score > best_score:
#          best_score = roc_auc_score(val_y,pred_y)
#          best_grid = g

In [None]:
# Hard-coded hyperparameter choice, so the slow grid search does not have to be
# re-run. NOTE(review): presumably recorded from an earlier run of the
# commented-out search; confirm before relying on it.
best_grid = dict(
    colsample_bytree=0.7,
    gamma=0.1,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
)

In [None]:
# Apply the chosen hyperparameters and retrain with AUC-based early stopping
# (patience of 20 rounds) on the validation set.
# early_stopping_rounds / eval_metric are set on the estimator: passing them to
# fit() is deprecated since xgboost 1.6 and rejected by newer 2.x releases.
# (Setting them this way requires xgboost >= 1.6.)
xgb_clf.set_params(**best_grid, early_stopping_rounds=20, eval_metric="auc")
xgb_clf.fit(train_res_X, train_res_y, eval_set=[(val_sca_X, val_y)])

We have finished hyperparameter tuning. Now we should make a prediction on the test set

In [None]:
pred_y = xgb_clf.predict_proba(test_sca_X)[:,1]

In [None]:
# Submission table: the test-set IDs next to the scores currently held in pred_y.
subs = pd.DataFrame({'ID': test_X.index, 'TARGET': pred_y})

A second grid search to refine the best hyperparameters

In [None]:
# Candidate values for the second grid search; every combination is tried.
param_grid = {
    "learning_rate": [0.03],
    "max_depth": [6],
    "min_child_weight": [0, 1],
    "gamma": [0.1],
    "n_estimators": [150, 200, 250],
    "colsample_bytree": [0.8, 0.85],
}

In [None]:
len(list(ParameterGrid(param_grid)))

In [None]:
# Manual grid search: fit one model per parameter combination and keep the
# combination with the highest ROC AUC on the validation set.
# early_stopping_rounds / eval_metric are set once on the estimator: passing
# them to fit() is deprecated since xgboost 1.6 and rejected by newer 2.x
# releases. (Setting them this way requires xgboost >= 1.6.)
# NOTE(review): after the loop xgb_clf holds the model from the LAST combination
# tried, not the best one; refit with best_grid before predicting.
xgb_clf.set_params(early_stopping_rounds=20, eval_metric="auc")
best_score = 0
for g in ParameterGrid(param_grid):
    xgb_clf.set_params(**g)
    # verbose=False keeps the per-round evaluation log of every fit out of the
    # cell output.
    xgb_clf.fit(train_res_X, train_res_y, eval_set=[(val_sca_X, val_y)], verbose=False)
    pred_y = xgb_clf.predict_proba(val_sca_X)[:, 1]
    score = roc_auc_score(val_y, pred_y)
    if score > best_score:
        # Reuse the score computed above instead of recomputing the ROC AUC.
        best_score = score
        best_grid = g

In [None]:
best_score

In [None]:
best_grid

In [None]:
# best_grid2={'colsample_bytree': 0.8,
# 'gamma': 0.0,
#  'learning_rate': 0.03,
#  'max_depth': 6,
#  'min_child_weight': 1,
#  'n_estimators': 350}

In [None]:
#best_score2=0.8262192223595399

In [None]:
#subs.to_csv('submission.csv', index=False)

We arrive at a validation ROC AUC of about 0.8265. It is not the best possible result, and I might come back to this problem later, but this should be a fairly good notebook for showing how to tackle a classification problem with imbalanced classes.