# **Santander's Customer Satisfaction**

## Exploratory Data Analysis

In [None]:
from google.colab import files

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc, f1_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
files.upload()

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
train = pd.read_csv("train.csv")

In [None]:
train_dummy = train.copy()

**A copy of the dataset is used so that the original dataset remains unchanged.**

In [None]:
train_dummy.head(7)

In [None]:
train_dummy.describe()

**Summary of dataset.**

Here are the inferences made:

1. All values are numeric.
2. Multiple columns have only zeros in all the rows.
3. Some duplicate columns are also there like "imp_op_var41_efect_ult1" and "imp_op_var39_efect_ult1".
4. Outliers need to be imputed or removed.
5. Normalisation of data is required.

In [None]:
train_dummy.dtypes

**Datatypes of columns(All columns are not shown because of large dataset).**

In [None]:
sns.heatmap(train_dummy.isnull())

**No missing value in dataset.**

In [None]:
train_dummy.isnull().sum()

In [None]:
train_dummy.info()

In [None]:
train_dummy['TARGET'].unique()

In [None]:
#Distribution of the TARGET labels
target = train_dummy.TARGET
fig, ax = plt.subplots()
ax.hist(target)
plt.show()

**There are many more "0"s than "1"s in the target column. That means most customers are satisfied.**

In [None]:
#Count each class with vectorised comparisons instead of a Python-level loop over every row.
target_values = train_dummy.TARGET
c1 = int((target_values == 0).sum())   #rows with TARGET == 0 (reported below as satisfied)
c2 = int((target_values == 1).sum())   #rows with TARGET == 1

#Share of TARGET == 0 among the rows that are either 0 or 1
x=c1/(c1+c2)

print("No. of 0s in target: "+str(c1))
print("No. of 1s in target: "+str(c2))
print("Percentage of customer satisfied: "+str(x*100)+"%")

**Almost 96% of total customers are satisfied.**

Hence Dataset is unbalanced.

In [None]:
#Collect every column whose values are all zero.
#The comparison is vectorised per column, which avoids iterating over each cell in Python.
zero_col = [col for col in train_dummy if (train_dummy[col] == 0).all()]

for col in zero_col:
  print(col+" has all zero values")
  print("===============================================")
print(zero_col)

In [None]:
train_dummy = train_dummy.drop(columns=zero_col)

**All columns having only 0s are dropped from data set.**

In [None]:
train_dummy.shape

**Hence, dataset size became less.**

In [None]:
#Find columns whose values are identical to those of an earlier column.
#A column that is already flagged is skipped both as reference and as candidate, so a group of
#three or more identical columns is reported once per redundant column instead of repeatedly.
duplicate_columns = []
already_flagged = set()
colu = train_dummy.columns
for i in range(len(colu)-1):
  if colu[i] in already_flagged:
    continue
  val = train_dummy[colu[i]].values
  for j in range(i+1,len(colu)):
    if colu[j] in already_flagged:
      continue
    if np.array_equal(val,train_dummy[colu[j]].values):
      duplicate_columns.append(colu[j])
      already_flagged.add(colu[j])
counter = len(duplicate_columns)
print("Duplicate columns in the dataset: ",duplicate_columns)
print("No. of duplicate columns: ",counter)

**So, We need to remove all 29 duplicate columns too.**

In [None]:
train_dummy = train_dummy.drop(columns= duplicate_columns)

**All 29 columns have been dropped.**

In [None]:
train_dummy.shape

**Size after dropping duplicate columns.**

In [None]:
#Pairwise correlation matrix of all remaining columns (DataFrame.corr with its default settings).
#`cor` is reused by the heat-map and correlation-filter cells below.
cor = train_dummy.corr()
print(cor)

**Correlation coefficients of all columns.**

In [None]:
#Draw the correlation matrix as tiles of at most 50x50 columns, one figure per tile.
#The tile grid is derived from the actual matrix size, so every column is covered and
#a matrix narrower than 250 columns no longer raises an out-of-bounds error.
tile = 50
n_corr_cols = cor.shape[1]
for row_start in range(0, n_corr_cols, tile):
  for col_start in range(0, n_corr_cols, tile):
    #Slices clip at the matrix edge, so the last tile may be smaller than 50x50
    corr = cor.iloc[row_start:row_start+tile, col_start:col_start+tile]
    fig, a = plt.subplots(figsize = (15, 10))
    sns.heatmap(corr,linewidths=0.5,ax = a)

**Correlation heat maps have been displayed.**

In [None]:
#Drop the later column of every highly correlated pair.
#  * The absolute value is used, so strongly negative correlations count as redundant too.
#  * ID and TARGET are never dropped: a later cell drops them by name and would fail otherwise.
CORR_THRESHOLD = 0.9
protected_cols = {'ID', 'TARGET'}
col_cor = set()
for i in range(len(cor.columns)):
  col_name = cor.columns[i]
  if col_name in protected_cols:
    continue
  for j in range(i):
    #Only compare against earlier columns that are themselves being kept
    if (abs(cor.iloc[i,j])>=CORR_THRESHOLD) and (cor.columns[j] not in col_cor):
      col_cor.add(col_name)
      break

#errors='ignore' keeps the cell re-runnable after the columns have already been removed
train_dummy = train_dummy.drop(columns=list(col_cor), errors='ignore')

print("cols are: ",col_cor)

**Columns are removed having r-value>=0.9.**

In [None]:
train_dummy.describe()

In [None]:
train_dummy.shape

In [None]:
#Z-scores of every column, one array per column in column order.
#The former `temp = []` reset was dead code and has been removed.
z_score = [stats.zscore(train_dummy[co]) for co in train_dummy]

print(z_score)

**Z-scores have been computed for every column.**

In [None]:
#First and third quartile of every column, and their difference (inter-quartile range)
Q1, Q3 = (train_dummy.quantile(q) for q in (0.25, 0.75))
IQR = Q3-Q1
print("The IQR of all data: ",IQR)

**IQR value detected for the training data set.**

In [None]:
type(IQR)

In [None]:
#Remove rows that contain an outlier according to the 1.5*IQR rule.
#ID and TARGET are excluded from the test: TARGET is almost always 0, so its IQR is 0 and
#every TARGET == 1 row would otherwise be discarded as an "outlier".
feature_cols = [c for c in train_dummy.columns if c not in ('ID', 'TARGET')]
features = train_dummy[feature_cols]
#Select the bounds with the same labels so the comparison is between identically labelled objects
lower_bound = (Q1 - 1.5*IQR)[feature_cols]
upper_bound = (Q3 + 1.5*IQR)[feature_cols]
outlier_rows = ((features < lower_bound) | (features > upper_bound)).any(axis=1)
#NOTE(review): for a column whose IQR is 0, every value different from its quartile is flagged,
#so sparse columns can still remove most rows - check the resulting shape.
train_d_outliers = train_dummy[~outlier_rows]
print(train_d_outliers)
type(train_d_outliers)

**train_d_outliers** is the data frame from which all rows containing outliers have been removed.

## PCA

Here we have two datasets:
1. Dataset with outliers.
2. Dataset without outliers.

We shall perform PCA on both datasets. But before PCA, we need to normalize the data.

In [None]:
#Remove the identifier and the label so that only feature columns remain.
non_feature_cols = ['ID','TARGET']
train_dummy_drop = train_dummy.drop(columns = non_feature_cols)
train_d_outliers_drop = train_d_outliers.drop(columns = non_feature_cols)
(train_dummy_drop.shape, train_d_outliers_drop.shape)

The ID and TARGET columns are dropped from the datasets before performing PCA.

In [None]:
#Standardizing the data with outliers.
#StandardScaler rescales every feature (column) to zero mean and unit variance, which is what
#PCA needs. preprocessing.normalize would instead rescale every row (sample) to unit norm,
#leaving the features on their original, very different scales relative to each other.
train_dummy_norm = preprocessing.StandardScaler().fit_transform(train_dummy_drop)
train_dummy_norm = pd.DataFrame(train_dummy_norm)

#Standardizing the data without outliers (scaler fitted separately on this dataset)
train_d_outliers_norm = preprocessing.StandardScaler().fit_transform(train_d_outliers_drop)
train_d_outliers_norm = pd.DataFrame(train_d_outliers_norm)

print(train_dummy_norm)
print(train_d_outliers_norm)

In [None]:
train_dummy_norm.shape, train_d_outliers_norm.shape

### PCA of dataset having outliers **"train_dummy_norm"**:

In [None]:
#Project the dataset with outliers onto its first 7 principal components.
#random_state is fixed so the result is reproducible when PCA picks its randomized solver.
pca_santander_w = PCA(n_components = 7, random_state = 20)
PC_santander_w = pca_santander_w.fit_transform(train_dummy_norm)

In [None]:
#Wrap the component scores in a DataFrame with columns PC1 ... PC7
pc_names_w = ['PC' + str(k) for k in range(1, 8)]
pca_dataFrame_w = pd.DataFrame(PC_santander_w, columns = pc_names_w)
pca_dataFrame_w

In [None]:
#Variance ratio 
print(pca_santander_w.explained_variance_ratio_)

### PCA of dataset without outliers **"train_d_outliers_norm"**:

In [None]:
#Project the dataset without outliers onto its first 7 principal components.
#random_state is fixed so the result is reproducible when PCA picks its randomized solver.
pca_santander_wo = PCA(n_components = 7, random_state = 20)
PC_santander_wo = pca_santander_wo.fit_transform(train_d_outliers_norm)

In [None]:
#Wrap the component scores in a DataFrame with columns PC1 ... PC7
pc_names_wo = ['PC' + str(k) for k in range(1, 8)]
pca_dataFrame_wo = pd.DataFrame(PC_santander_wo, columns = pc_names_wo)
pca_dataFrame_wo

In [None]:
#Variance ratio 
print(pca_santander_wo.explained_variance_ratio_)

## Data Modelling

Before modelling, we need to add both the 'ID' and 'TARGET' columns back to the normalized PCA dataset.

In [None]:
#Adding the ID and TARGET to the "pca_dataFrame_w" dataframe again.
#.values assigns by position, so the result does not depend on the two frames sharing an index.
pca_dataFrame_w['ID'] = train_dummy['ID'].values
pca_dataFrame_w['TARGET'] = train_dummy['TARGET'].values
pca_dataFrame_w = pca_dataFrame_w[['ID','PC1','PC2','PC3','PC4','PC5','PC6','PC7','TARGET']]
pca_dataFrame_w

In [None]:
#Adding the ID and TARGET to the "pca_dataFrame_wo" dataframe again.
#train_d_outliers keeps the row labels of the original data (rows were filtered out), while
#pca_dataFrame_wo is numbered 0..n-1. A plain column assignment aligns on those labels and
#would attach NaN / the wrong customer's label, so the values are assigned by position.
pca_dataFrame_wo['ID'] = train_d_outliers['ID'].values
pca_dataFrame_wo['TARGET'] = train_d_outliers['TARGET'].values
pca_dataFrame_wo = pca_dataFrame_wo[['ID','PC1','PC2','PC3','PC4','PC5','PC6','PC7','TARGET']]
pca_dataFrame_wo

Now we will do test and train split to our training dataset before building the model.

In [None]:
#Assigning the TARGET column to y and remaining columns to x. 
x = pca_dataFrame_w.drop(columns=['ID','TARGET'])
y = pca_dataFrame_w['TARGET']

#Splitting the training dataset into test and train.
#stratify=y keeps the class ratio of the heavily imbalanced TARGET identical in both parts.
x_train, x_test, y_train, y_test = train_test_split(x , y , test_size = 0.3 , random_state = 20 , stratify = y)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

### **1. Logistic regression**

In [None]:
#Importing logistic regression from sklearn
from sklearn.linear_model import LogisticRegression

#Fit a logistic regression with default settings on the training split
mlr = LogisticRegression().fit(x_train, y_train)
mlr

In [None]:
#Testing model on x_test
y_pred = mlr.predict(x_test)

In [None]:
#Checking accuracy
confusion_matrix(y_test , y_pred)

In [None]:
#Accuracy
print("Accuracy of the model is: ",accuracy_score(y_test, y_pred)*100,"%")

In [None]:
#Classification report
print("Overall report:")
print(classification_report(y_test,y_pred))

Now we will hypertune our logistic regression parameters and again predict the result.

In [None]:
#Search space for LogisticRegression.
#It is split into one grid per solver family so that only valid solver/penalty pairs are
#sampled (the single cross-product grid mostly produced failing fits):
#  lbfgs / newton-cg / sag -> l2 only;  liblinear -> l1, l2;  saga -> l1, l2, elasticnet.
#elasticnet additionally needs l1_ratio. The unpenalised option is not listed because its
#spelling differs between scikit-learn versions; the largest C values are close to it anyway.
_lr_common = {
    'C' : np.logspace(-4, 4, 20),
    'max_iter' : [100, 1000,2500, 5000]
}
params_lr = [
    {**_lr_common, 'solver' : ['lbfgs','newton-cg','sag'], 'penalty' : ['l2']},
    {**_lr_common, 'solver' : ['liblinear'], 'penalty' : ['l1', 'l2']},
    {**_lr_common, 'solver' : ['saga'], 'penalty' : ['l1', 'l2']},
    {**_lr_common, 'solver' : ['saga'], 'penalty' : ['elasticnet'], 'l1_ratio' : [0.25, 0.5, 0.75]},
]

In [None]:
mlr_random = RandomizedSearchCV(mlr,param_distributions=params_lr,scoring='roc_auc',n_jobs=-1,cv=5,verbose=True)

In [None]:
clf_random_mlr = mlr_random.fit(x_train,y_train)

In [None]:
clf_random_mlr.best_estimator_

In [None]:
#Tuned logistic regression; values copied from the best estimator of a previous search run.
#Only the non-default settings are passed (the penalty stays at its default, l2); echoing the
#full repr also passed `multi_class`, which newer scikit-learn releases deprecate.
mlr_hyp = LogisticRegression(C=11.288378916846883, max_iter=1000, solver='saga')

In [None]:
mlr_hyp.fit(x_train,y_train)

In [None]:
y_pred_mlr_hyp = mlr_hyp.predict(x_test)

In [None]:
cm_mlr = confusion_matrix(y_test,y_pred_mlr_hyp) 
cm_mlr

In [None]:
accuracy_mlr = accuracy_score(y_test,y_pred_mlr_hyp)
accuracy_mlr

In [None]:
 print(classification_report(y_test,y_pred_mlr_hyp))

In [None]:
f1_mlr = f1_score(y_test,y_pred_mlr_hyp)
f1_mlr

Here we can observe that even after tuning the hyperparameters, the evaluation metrics remain unchanged.

Let's see the performance using the ROC curve.

In [None]:
y_prob_mlr = mlr_hyp.predict_proba(x_test)[:,1]
y_prob_mlr

In [None]:
fpr_mlr , tpr_mlr , thr_mlr = roc_curve(y_test,y_prob_mlr)
fpr_mlr , tpr_mlr , thr_mlr

In [None]:
auc_mlr = auc(fpr_mlr,tpr_mlr)
print("Area under the ROC curve is: ",auc_mlr)

In [None]:
#ROC curve of the tuned logistic regression; the dashed line is the chance diagonal
fig_mlr, ax_mlr = plt.subplots(figsize=(7,7))
ax_mlr.set_title("ROC of Logistic regression")
ax_mlr.plot(fpr_mlr, tpr_mlr, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_mlr)
ax_mlr.legend(loc='upper left')
ax_mlr.plot([0,1], linestyle='--')
ax_mlr.set_xlabel("False Positive Rate")
ax_mlr.set_ylabel("True Positive Rate")

### **2. Decision tree**

In [None]:
#Importing the decision tree classifier
from sklearn.tree import DecisionTreeClassifier

#Fit a decision tree with default settings on the training split
DT = DecisionTreeClassifier().fit(x_train, y_train)
DT

In [None]:
#Testing model on x_test
y_pred_dt = DT.predict(x_test)

In [None]:
#Confusion matrix
confusion_matrix(y_test,y_pred_dt)

In [None]:
#Accuracy
accuracy_dt = accuracy_score(y_test,y_pred_dt)
accuracy_dt

In [None]:
#Classification report
print(classification_report(y_test,y_pred_dt))

In [None]:
f1_dt = f1_score(y_test,y_pred_dt)
f1_dt

Now we will see the model performance using ROC.

In [None]:
y_prob_dt = DT.predict_proba(x_test)[:,1]
y_prob_dt

In [None]:
fpr_dt , tpr_dt , thr_dt = roc_curve(y_test,y_prob_dt)
fpr_dt , tpr_dt , thr_dt

In [None]:
auc_dt = auc(fpr_dt,tpr_dt)
print("Area under the curve : ",auc_dt)

In [None]:
#ROC curve of the default decision tree; the dashed line is the chance diagonal
fig_dt, ax_dt = plt.subplots(figsize=(7,7))
ax_dt.set_title("ROC of Decision Tree")
ax_dt.plot(fpr_dt, tpr_dt, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_dt)
ax_dt.legend(loc='upper left')
ax_dt.plot([0,1], linestyle='--')
ax_dt.set_xlabel("False Positive Rate")
ax_dt.set_ylabel("True Positive Rate")

Now we will hypertune our Decision tree parameters and again predict the result.

In [None]:
#Search space for DecisionTreeClassifier.
#  max_depth         : integers 1..32 (np.linspace produced floats, which are not valid depths)
#  min_samples_split : integers 2..10 (floats above 1.0 are invalid; 2 is the smallest valid count)
#  min_samples_leaf  : fractions of the training set, 0.1 .. 0.5
#  max_features      : 1 .. number of features, inclusive
params_dt = [
    {'splitter' : ['best', 'random'],
     'max_depth' : list(range(1, 33)),
     'min_samples_split' : list(range(2, 11)),
     'min_samples_leaf' : np.linspace(0.1, 0.5, 10, endpoint=True),
     'max_features' : list(range(1,x_train.shape[1]+1)),
    }
]

In [None]:
dt_random = RandomizedSearchCV(DT,param_distributions=params_dt,scoring='roc_auc',n_jobs=-1,cv=5,verbose=True)

In [None]:
clf_random_dt = dt_random.fit(x_train,y_train)

In [None]:
clf_random_dt.best_estimator_

In [None]:
#Tuned decision tree; values copied from the best estimator of a previous search run.
#Only non-default settings are passed: the echoed repr contained `min_impurity_split` and
#`presort`, which current scikit-learn no longer accepts, and a float max_depth.
#min_samples_split=1.0 is a fraction (all samples), exactly as found by the search.
DT_hyp = DecisionTreeClassifier(max_depth=5, max_features=5,
                       min_samples_leaf=0.3666666666666667,
                       min_samples_split=1.0,
                       splitter='random')

In [None]:
DT_hyp.fit(x_train,y_train)

In [None]:
y_pred_dt_hyp = DT_hyp.predict(x_test)

In [None]:
confusion_matrix(y_test,y_pred_dt_hyp)

In [None]:
accuracy_dt_hyp = accuracy_score(y_test,y_pred_dt_hyp)
accuracy_dt_hyp

In [None]:
print(classification_report(y_test,y_pred_dt_hyp))

In [None]:
f1_dt_hyp = f1_score(y_test,y_pred_dt_hyp)
f1_dt_hyp

Now let's see model performance after hyperparameter tuning by ROC.

In [None]:
y_prob_dt_hyp = DT_hyp.predict_proba(x_test)[:,1]
y_prob_dt_hyp

In [None]:
fpr_dt_hyp , tpr_dt_hyp , thr_dt_hyp = roc_curve(y_test,y_prob_dt_hyp)

In [None]:
auc_dt_hyp = auc(fpr_dt_hyp,tpr_dt_hyp)
auc_dt_hyp

In [None]:
#ROC curve of the tuned decision tree; the dashed line is the chance diagonal
fig_dt_hyp, ax_dt_hyp = plt.subplots(figsize=(7,7))
ax_dt_hyp.set_title("ROC of Decision Tree(After tuning Hyperparameters)")
ax_dt_hyp.plot(fpr_dt_hyp, tpr_dt_hyp, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_dt_hyp)
ax_dt_hyp.legend(loc='upper left')
ax_dt_hyp.plot([0,1], linestyle='--')
ax_dt_hyp.set_xlabel("False Positive Rate")
ax_dt_hyp.set_ylabel("True Positive Rate")

### **3. Naive Bayes**

In [None]:
#Importing Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB

#Fit a Gaussian naive Bayes model with default settings on the training split
GNB = GaussianNB().fit(x_train, y_train)
GNB

In [None]:
#Testing model on x_test
y_pred_nb = GNB.predict(x_test)

In [None]:
#Confusion matrix
confusion_matrix(y_test,y_pred_nb)

In [None]:
#Accuracy
accuracy_nb = accuracy_score(y_test,y_pred_nb)
accuracy_nb

In [None]:
#Classification report
print(classification_report(y_test,y_pred_nb))

In [None]:
f1_nb = f1_score(y_test,y_pred_nb)
f1_nb

Now let's go for ROC.

In [None]:
y_prob_nb = GNB.predict_proba(x_test)[:,1]
y_prob_nb

In [None]:
fpr_nb , tpr_nb , thr_nb = roc_curve(y_test,y_prob_nb)
fpr_nb , tpr_nb , thr_nb

In [None]:
auc_nb = auc(fpr_nb,tpr_nb)
auc_nb

In [None]:
#ROC curve of Gaussian naive Bayes; the dashed line is the chance diagonal
fig_nb, ax_nb = plt.subplots(figsize=(7,7))
ax_nb.set_title("ROC of Naive Bayes")
ax_nb.plot(fpr_nb, tpr_nb, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_nb)
ax_nb.legend(loc='upper left')
ax_nb.plot([0,1], linestyle='--')
ax_nb.set_xlabel("False Positive Rate")
ax_nb.set_ylabel("True Positive Rate")

We will not do hyperparameter tuning for this model, because for this dataset Gaussian naive bayes is not appropriate.

In case we do need to tune the hyperparameters, below are the parameters that can be tuned using RandomizedSearchCV or GridSearchCV.

In [None]:
#Search space for GaussianNB.
#The previous cell was not valid Python (keyword syntax inside a dict literal) and listed
#C / gamma / kernel, which are SVM parameters. The hyperparameter GaussianNB exposes for
#tuning is var_smoothing, searched here on a log scale from 1 down to 1e-9.
params_nb = [
             {'var_smoothing' : np.logspace(0, -9, num=100)}
]

### **4. Random Forest**

In [None]:
#importing required package 
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Fit a random forest of 100 trees on the training split
RF = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
RF

In [None]:
#Testing model on x_test
y_pred_rf = RF.predict(x_test)

In [None]:
#Confusion matrix
confusion_matrix(y_test, y_pred_rf)

In [None]:
#Accuracy
accuracy_score(y_test, y_pred_rf)

In [None]:
#Overall score
print(classification_report(y_test, y_pred_rf))

Now we will do hyperparameter tuning of random forest classifier.

In [None]:
#Search space for RandomForestClassifier.
#'auto' was removed from max_features in recent scikit-learn releases (for classifiers it
#meant the same as 'sqrt'), so the two candidates are now 'sqrt' and 'log2'.
#The integer grids are built with range() to avoid float truncation.
params_rf = [
    {'n_estimators': list(range(200, 2001, 200)),
    'max_features': ['sqrt', 'log2'],
    'max_depth': list(range(10, 111, 10)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]}
]

In [None]:
rf_random = RandomizedSearchCV(RF,param_distributions=params_rf,scoring='roc_auc',n_jobs=-1,cv=2,verbose=True)

In [None]:
clf_random_rf = rf_random.fit(x_train,y_train)

In [None]:
y_pred_rf_hyp = clf_random_rf.predict(x_test)

In [None]:
confusion_matrix(y_test,y_pred_rf_hyp)

In [None]:
accuracy_rf = accuracy_score(y_test,y_pred_rf_hyp)
accuracy_rf

In [None]:
print(classification_report(y_test,y_pred_rf_hyp))

In [None]:
f1_rf = f1_score(y_test,y_pred_rf_hyp)
f1_rf

Now let's observe the model performance using ROC.

In [None]:
y_prob_rf_hyp = clf_random_rf.predict_proba(x_test)[:,1]
y_prob_rf_hyp

In [None]:
fpr_rf,tpr_rf,thr_rf = roc_curve(y_test,y_prob_rf_hyp)
fpr_rf,tpr_rf,thr_rf

In [None]:
auc_rf = auc(fpr_rf,tpr_rf)
auc_rf

In [None]:
#ROC curve of the tuned random forest; the dashed line is the chance diagonal
fig_rf, ax_rf = plt.subplots(figsize=(7,7))
ax_rf.set_title("ROC of Random Forest")
ax_rf.plot(fpr_rf, tpr_rf, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_rf)
ax_rf.legend(loc='upper left')
ax_rf.plot([0,1], linestyle='--')
ax_rf.set_xlabel("False Positive Rate")
ax_rf.set_ylabel("True Positive Rate")

### **5. SVM**

In [None]:
#importing packages
from sklearn.svm import SVC

In [None]:
#Fit a support vector classifier with default settings on the training split
clf = SVC().fit(x_train, y_train)
clf

In [None]:
#Applying model on x_test
y_pred_svm = clf.predict(x_test)  

In [None]:
#confusion matrix
confusion_matrix(y_test,y_pred_svm)

In [None]:
#Accuracy
accuracy_score(y_test,y_pred_svm)

In [None]:
#Overall score
print(classification_report(y_test,y_pred_svm))

Now we will do the hyperparameter tuning of SVM algo.

In [None]:
#Search space for the RBF-kernel SVM: regularisation strength C and kernel width gamma
svm_search_space = {
    'kernel': ['rbf'],
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}
params_svm = [svm_search_space]

In [None]:
random_svm = RandomizedSearchCV(clf,param_distributions=params_svm,scoring='roc_auc',n_jobs=-1,cv=5,verbose=True)

In [None]:
clf_random_svm = random_svm.fit(x_train,y_train)

In [None]:
clf_random_svm.best_estimator_

In [None]:
#Tuned SVM; values copied from the best estimator of a previous search run.
#Every other argument of the echoed repr equals the SVC default and is therefore omitted.
#probability=True is what makes predict_proba available for the ROC curve.
svm_best_settings = dict(C=1000, gamma=0.001, kernel='rbf', probability=True)
svm_hyp = SVC(**svm_best_settings)

In [None]:
svm_hyp.fit(x_train,y_train)

In [None]:
y_pred_svm_hyp = svm_hyp.predict(x_test)

In [None]:
confusion_matrix(y_test,y_pred_svm_hyp)

In [None]:
accuracy_svm = accuracy_score(y_test,y_pred_svm_hyp)
accuracy_svm

In [None]:
print(classification_report(y_test,y_pred_svm_hyp))

In [None]:
f1_svm = f1_score(y_test,y_pred_svm_hyp)
f1_svm

Now let's see the model performance using ROC.

In [None]:
y_prob_svm = svm_hyp.predict_proba(x_test)[:,1]
y_prob_svm

In [None]:
fpr_svm, tpr_svm, thr_svm = roc_curve(y_test,y_prob_svm)
fpr_svm, tpr_svm, thr_svm

In [None]:
auc_svm = auc(fpr_svm,tpr_svm)
auc_svm

In [None]:
#ROC curve of the tuned SVM; the dashed line is the chance diagonal
fig_svm, ax_svm = plt.subplots(figsize=(7,7))
ax_svm.set_title("ROC of SVM")
ax_svm.plot(fpr_svm, tpr_svm, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_svm)
ax_svm.legend(loc='upper left')
ax_svm.plot([0,1], linestyle='--')
ax_svm.set_xlabel("False Positive Rate")
ax_svm.set_ylabel("True Positive Rate")

Here also after tuning the parameters, still the model performance remains unchanged.

### **6. XGBoost**

In [None]:
#Setting parameters for xgboost
params = {
    "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}

In [None]:
#importing necessary pacakges
import xgboost

In [None]:
classifier = xgboost.XGBClassifier()

In [None]:
random_search = RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

Here, we have done random search to find out the exact parameters for xgboost. In this method we passed classifier, all the selected parameters. 

In [None]:
random_search.fit(x_train,y_train)

In [None]:
print(random_search.best_estimator_)
print("====================================================================================================")
print(random_search.best_params_)

The parameters above are suitable for our dataset and should not cause the model to overfit.

In [None]:
#Tuned XGBoost classifier; values copied from the best estimator of a previous search run.
#Only the settings that matter are passed. The echoed repr also contained the legacy
#arguments `silent`, `nthread` and `seed` and `missing=None`.
#NOTE(review): those are believed to be rejected or warned about by newer xgboost
#releases - confirm against the installed version.
classifier = xgboost.XGBClassifier(colsample_bytree=0.7, gamma=0.4,
              learning_rate=0.15, max_depth=3,
              min_child_weight=5, n_estimators=100, n_jobs=1,
              objective='binary:logistic', random_state=0)

In [None]:
#10-fold cross-validation of the XGBoost classifier on the training split.
#No `scoring` argument is passed, so the estimator's default scorer is used
#(the randomized search above used 'roc_auc' instead).
from sklearn.model_selection import cross_val_score
score_train = cross_val_score(classifier,x_train,y_train,cv=10)
score_train

In [None]:
score_train.mean()

In [None]:
classifier.fit(x_train,y_train)
y_pred_xgb = classifier.predict(x_test)

We have built the model and predicted the output for test data.

In [None]:
#Accuracy
accuracy_xgb = accuracy_score(y_test,y_pred_xgb)
accuracy_xgb

In [None]:
#confusion matrix
confusion_matrix(y_test,y_pred_xgb)

In [None]:
#Overall score
print(classification_report(y_test,y_pred_xgb))

In [None]:
probability_xgb = classifier.predict_proba(x_test)

In [None]:
#Column 0 holds the probability of class 0 (satisfied), column 1 that of class 1 (dis-satisfied).
print("Probability of satisfaction:",probability_xgb[:,0])
print("============================================================================================================")
#The nested print(print(...)) printed an extra "None" line; a single print is enough.
print("Probability of dis-satisfaction:",probability_xgb[:,1])

In [None]:
f1_xgb = f1_score(y_test,y_pred_xgb)
f1_xgb

Now let's go for ROC.

In [None]:
y_prob_xgb = probability_xgb[:,1]
y_prob_xgb

In [None]:
fpr_xgb,tpr_xgb,thr_xgb = roc_curve(y_test,y_prob_xgb)
fpr_xgb,tpr_xgb,thr_xgb

In [None]:
auc_xgb = auc(fpr_xgb,tpr_xgb)
auc_xgb

In [None]:
#ROC curve of the tuned XGBoost model; the dashed line is the chance diagonal
fig_xgb, ax_xgb = plt.subplots(figsize=(7,7))
ax_xgb.set_title("ROC of XGBoost")
ax_xgb.plot(fpr_xgb, tpr_xgb, linestyle='solid', label='Area Under the Curve = %0.3f' % auc_xgb)
ax_xgb.legend(loc='upper left')
ax_xgb.plot([0,1], linestyle='--')
ax_xgb.set_xlabel("False Positive Rate")
ax_xgb.set_ylabel("True Positive Rate")

## Model Comparison

In [None]:
#One row per model: (name, accuracy, area under the ROC curve, f1-score)
comparison_rows = [
    ('Logistic Regression', accuracy_mlr,    auc_mlr,    f1_mlr),
    ('Decision Tree',       accuracy_dt,     auc_dt,     f1_dt),
    ('Decision Tree(Hyp)',  accuracy_dt_hyp, auc_dt_hyp, f1_dt_hyp),
    ('Naive Bayes',         accuracy_nb,     auc_nb,     f1_nb),
    ('Random Forest',       accuracy_rf,     auc_rf,     f1_rf),
    ('SVM',                 accuracy_svm,    auc_svm,    f1_svm),
    ('XGBoost',             accuracy_xgb,    auc_xgb,    f1_xgb),
]
table = pd.DataFrame(comparison_rows, columns=['Model', 'Accuracy', 'AUC', 'f1-Score'])

#Shade the numeric columns so the best and worst models stand out
table.style.background_gradient(cmap='hot_r', text_color_threshold=0.5)