In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# **Data Wrangling:**

In [None]:
# Read the credit-card CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv")
df.head(n=5)

We check the dataset for null values and find 
that there are none. We also remove the ID 
column from the data, because it contains just 
a serial number of the observations and carries 
no information.

In [None]:
# Drop the 'ID' column. Rebinding `df` instead of mutating it in place, together
# with errors='ignore', keeps this cell idempotent: re-running it after 'ID' has
# already been removed no longer raises KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [None]:
# Count missing values per column (the original run reported none).
df.isna().sum()

In [None]:
df.shape   # (number of rows, number of columns) of the DataFrame.

In [None]:
df.columns # Column labels currently present in the DataFrame

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

# **Correlation Matrix Plot**
We can demonstrate that no variables 
are strongly correlated with the Target variable 
(default). The ‘PAY_’ variables have a strong 
correlation between them and have a weak 
positive correlation with the target 
variable(default). All the ‘BILL_AMT’ 
variables have a good positive correlation 
between them. Also, ‘LIMIT_BAL’ has a good 
positive correlation with ‘BILL_AMT’ 
variables.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
# on an explicitly created 25x15-inch figure.
corr = df.corr()
fig, ax = plt.subplots(figsize=(25, 15))
sns.heatmap(corr, annot=True, ax=ax)

# **Splitting data into train and test**
We split the data into 75% training and 25% 
test sets using train_test_split from 
sklearn.model_selection, creating x_train, 
x_test, y_train and y_test.
We then rescale the data using MinMaxScaler 
from sklearn.preprocessing: x_train and x_test 
are transformed into xtrain_scaler and 
xtest_scaler, because several variables span 
very large ranges, for example ‘LIMIT_BAL’ 
and the ‘BILL_AMT’ and ‘PAY_AMT’ columns. 
MinMaxScaler maps every variable into the 
range 0-1. Because all variables then share the 
same 0-1 range, they influence the model on a 
comparable scale, which prevents variables 
with large raw values from dominating the 
result.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Predictor columns used by every model in this notebook.
x = df[['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
# Target column.
y = df['default.payment.next.month']

# Hold out 25% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=3)

# Learn each feature's min/max from the training rows only, then apply that SAME
# mapping to the test rows. Fitting a second scaler on x_test (as before) maps
# train and test with different min/max values, so identical raw values end up
# at different scaled positions and test-set statistics leak into preprocessing.
# Test values outside the training range may fall slightly outside [0, 1].
scaler = MinMaxScaler()
xtrain_scaler = scaler.fit_transform(x_train)
xtest_scaler = scaler.transform(x_test)

# **Machine Learning Models:**
This project will use 5 Machine Learning models:
# **1. KNN:**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

> ***Checking for which K, model is generating more accuracy. Using normal train and test data.***

In [None]:
# Sweep K = 1..29 on the unscaled features and record the test accuracy of each K.
k = np.arange(1, 30, 1)
k_val_acc = []
for n_neighbors in k:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred)   # computed once, stored and printed
    k_val_acc.append(acc)
    print("Accuracy for K = {0} is: ".format(n_neighbors), acc)

m = max(k_val_acc)
# Positions are 1-based so they coincide with the K values tried above.
best_ks = [pos for pos, score in enumerate(k_val_acc, start=1) if score == m]

print("We got max accuracy of {0} when K = {1}". format(m, best_ks))

> ***Checking for which K, model is generating more accuracy. Using transformed train and test data.***

In [None]:
# Sweep K = 1..29 on the MinMax-scaled features, recording the test accuracy of
# each K and remembering the best classifier seen so far.
k = np.arange(1, 30, 1)
k_val_acc_mms = []
best_acc, best_knn, best_pred = -1.0, None, None
for n_neighbors in k:
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
    candidate.fit(xtrain_scaler, y_train)              # Fitting model with xtrain_scaler and y_train
    candidate_pred = candidate.predict(xtest_scaler)   # Predicting the results
    acc = metrics.accuracy_score(y_test, candidate_pred)
    k_val_acc_mms.append(acc)
    print("Accuracy for K = {0} is: ".format(n_neighbors), acc)
    # Strict '>' keeps the smallest K when several K values tie for the best accuracy.
    if acc > best_acc:
        best_acc, best_knn, best_pred = acc, candidate, candidate_pred

m = max(k_val_acc_mms)

print("We got max accuracy of {0} when K = {1}". format(m, [pos for pos, score in enumerate(k_val_acc_mms, start=1) if score == m]))

# Later cells report on `knn` and `y_pred_mms`. Previously these were whatever
# the LAST loop iteration left behind (K = 29), which is only the best model by
# coincidence. Bind them explicitly to the best-scoring classifier instead.
# NOTE(review): K is selected on the test set, so the reported accuracy is optimistic.
knn, y_pred_mms = best_knn, best_pred

**Confusion Matrix of model with transformed data:**

In [None]:
# Confusion-matrix plot and text report for the KNN model bound to `knn`,
# using the predictions stored in `y_pred_mms`.
plot_confusion_matrix(knn, xtest_scaler, y_test)

conf_metr = metrics.confusion_matrix(y_test, y_pred_mms)
print("Confusion Matrix: \n {}".format(conf_metr))
print(metrics.classification_report(y_test,y_pred_mms))

# Headline metrics, printed in a fixed order.
for _metric_label, _metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Recall/Sensitivity/True Positive Rate:", metrics.recall_score),
    ("Precision:", metrics.precision_score),
):
    print(_metric_label, _metric_fn(y_test, y_pred_mms))

In [None]:
# ROC curve of the classifier bound to `knn`, evaluated on the scaled test set.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator — confirm the installed version.
metrics.plot_roc_curve(knn, xtest_scaler, y_test)

First, we checked for which K, our model has 
the best accuracy. We performed KNN two 
times, once with normal train and test sets and 
a second time with transformed 
(MinMaxScaler) train and test sets.
1. During our first model, we found results as:
* K: 28
* Accuracy: 78.56 %
2. During our second model of KNN with transformed sets:
* K: 29
* Accuracy: 81.78 %
3. After plotting confusion matrix with transformed sets, we were able to get:
* Accuracy: 81.78 % 
* Precision: 63.40 %
* Recall: 33.06 %

The ROC curve shows an “Area Under the Curve” (AUC) 
of 0.75, which is a decent result.

# **2. Logistic Regression:**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

> ***HyperParameter Tuned model with normal train and test data:***

In [None]:
# Grid-search C for a liblinear logistic regression on the unscaled features
# (5-fold cross-validation on the training split).
c_val = [0.001,0.01,0.1,0.5,1.0]
hyperParam = [dict(C=c_val)]
logreg = LogisticRegression(solver='liblinear')

gsv = GridSearchCV(estimator=logreg, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(x_train, y_train)               # fit() returns the search object itself
logreg_pred = gsv.best_estimator_.predict(x_test)    # predictions of the refitted best estimator

print("Best HyperParameter: ", gsv.best_params_)
# score() delegates to the refitted best estimator: this is its test-set accuracy.
print("Best Accuracy :", gsv.score(x_test, y_test))

> ***HyperParameter Tuned model with transformed training and testing data:***

In [None]:
# Grid-search C for a liblinear logistic regression on the MinMax-scaled
# features (5-fold cross-validation on the training split).
c_val = [0.001,0.01,0.1,0.5,1.0]
hyperParam = [dict(C=c_val)]
logreg = LogisticRegression(solver='liblinear')

gsv = GridSearchCV(estimator=logreg, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(xtrain_scaler, y_train)                # fit() returns the search object itself
logreg_pred_mms = gsv.best_estimator_.predict(xtest_scaler) # predictions of the refitted best estimator

print("Best HyperParameter: ", gsv.best_params_)
# score() delegates to the refitted best estimator: this is its test-set accuracy.
print("Best Accuracy :", gsv.score(xtest_scaler, y_test))

**Confusion Matrix of model with transformed data:**

In [None]:
# Confusion-matrix plot and text report for the tuned logistic regression
# (search object `gsv`, predictions in `logreg_pred_mms`).
plot_confusion_matrix(gsv,xtest_scaler, y_test)

conf_metr = confusion_matrix(y_test, logreg_pred_mms)
print("Confusion Matrix: \n {}".format(conf_metr))
print(metrics.classification_report(y_test,logreg_pred_mms))

# Headline metrics, printed in a fixed order.
for _metric_label, _metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Recall/Sensitivity/True Positive Rate:", metrics.recall_score),
    ("Precision:", metrics.precision_score),
):
    print(_metric_label, _metric_fn(y_test, logreg_pred_mms))

In [None]:
# ROC curve of the fitted grid-search object `gsv`, evaluated on the scaled test set.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator — confirm the installed version.
metrics.plot_roc_curve(gsv, xtest_scaler, y_test)

Next, Logistic Regression was used for this 
dataset. This model is tuned by giving ‘C’ 
[0.001,0.01,0.1,0.5,1.0] and assigning the
solver as ‘liblinear’. This model is 
hyperparameter tuned using sklearn’s 
GridSearchCV. We performed this model two 
times, once with normal train and test sets and 
a second time with transformed 
(MinMaxScaler) train and test sets.
1. During our first model, we found results as:
* C: 0.001
* Accuracy: 78.81 %
2. During our second model with transformed sets:
* C: 1.0
* Accuracy: 82.68 %
3. After plotting confusion matrix with transformed sets, we were able to get:
* Accuracy: 82.68 % 
* Precision: 68.74 %
* Recall: 33.37 %

The ROC curve shows an “Area Under the Curve” (AUC) of 0.73. The AUC can be used to compare this model with the other models. Larger values on the Y-axis (true positive rate) indicate more true positives and fewer false negatives.

# **3. Decision Tree:**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

> ***Unpruned Tree with normal train and test data:***

In [None]:
# Fully grown (unpruned) decision tree trained on the unscaled features.
dtree_up = DecisionTreeClassifier().fit(x_train, y_train)   # fit() returns the estimator itself
dtree_pred_up = dtree_up.predict(x_test)
unpruned_accuracy = metrics.accuracy_score(y_test, dtree_pred_up)
print("Accuracy is: ", unpruned_accuracy)

> ***Pruned Tree after applying model with normal train and test data:***

In [None]:
# Sweep max_depth = 1..19 on the unscaled features and record the test accuracy of each depth.
d = np.arange(1, 20, 1)
depth = []
for max_depth in d:
    dtree = DecisionTreeClassifier(max_depth=max_depth)
    dtree.fit(x_train, y_train)
    dtree_pred = dtree.predict(x_test)
    acc = metrics.accuracy_score(y_test, dtree_pred)   # computed once, stored and printed
    depth.append(acc)
    print("Accuracy when max_depth = {0}: ".format(max_depth), acc)

m = max(depth)
# Positions are 1-based so they coincide with the max_depth values tried above.
best_depths = [pos for pos, score in enumerate(depth, start=1) if score == m]

print("We got max accuracy of {0} when max_depth = {1}". format(m, best_depths))

> ***Pruned Tree applying model with transformed training and testing data:***

In [None]:
# Sweep max_depth = 1..19 on the MinMax-scaled features, recording the test
# accuracy of each depth and remembering the best tree seen so far.
d = np.arange(1, 20, 1)
depth_mms = []
best_acc, best_tree, best_pred = -1.0, None, None
for max_depth in d:
    candidate = DecisionTreeClassifier(max_depth=max_depth)
    candidate.fit(xtrain_scaler, y_train)             # Fitting model with xtrain_scaler and y_train
    candidate_pred = candidate.predict(xtest_scaler)  # Predicting the results
    acc = metrics.accuracy_score(y_test, candidate_pred)
    depth_mms.append(acc)
    print("Accuracy when max_depth = {0}: ".format(max_depth), acc)
    # Strict '>' keeps the shallowest tree when several depths tie for the best accuracy.
    if acc > best_acc:
        best_acc, best_tree, best_pred = acc, candidate, candidate_pred

m = max(depth_mms)

print("We got max accuracy of {0} when max_depth = {1}". format(m, [pos for pos, score in enumerate(depth_mms, start=1) if score == m]))

# Later cells (confusion matrix, ROC curve, tree rendering) use `dtree` and
# `dtree_pred_mms`. Previously these were left as the LAST tree of the loop
# (max_depth = 19), not the best one, so the reported metrics described a
# different, overfitted model. Bind them to the best-scoring tree instead.
# NOTE(review): depth is selected on the test set, so the reported accuracy is optimistic.
dtree, dtree_pred_mms = best_tree, best_pred

**Confusion Matrix of Pruned Tree model with transformed training and testing data:**

In [None]:
# Confusion-matrix plot and text report for the tree bound to `dtree`,
# using the predictions stored in `dtree_pred_mms`.
plot_confusion_matrix(dtree, xtest_scaler, y_test)

conf_metr = metrics.confusion_matrix(y_test, dtree_pred_mms)
print("Confusion Matrix: \n {}".format(conf_metr))
print(metrics.classification_report(y_test,dtree_pred_mms))

# Headline metrics, printed in a fixed order.
for _metric_label, _metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Recall/Sensitivity/True Positive Rate:", metrics.recall_score),
    ("Precision:", metrics.precision_score),
):
    print(_metric_label, _metric_fn(y_test, dtree_pred_mms))

In [None]:
# ROC curve of the tree bound to `dtree`, evaluated on the scaled test set.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator — confirm the installed version.
metrics.plot_roc_curve(dtree, xtest_scaler, y_test)

**Visualizing Decision Tree**

In [None]:
# Install pydotplus into the running kernel's environment. The explicit %pip
# magic does not depend on IPython's automagic setting being enabled.
%pip install pydotplus

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
import pydotplus

In [None]:
# Render the tree bound to `dtree` with Graphviz, save it as a PNG and show it inline.
feat = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

# With out_file=None, export_graphviz returns the DOT source as a string,
# so no intermediate text buffer is needed.
dot_source = export_graphviz(
    dtree,
    out_file=None,
    feature_names=feat,
    class_names=['0','1'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_source)
graph.write_png('Credit_Card_Tree.png')
Image(graph.create_png())

Next, Decision Tree is used for this dataset. We 
have used 3 variants of models: 1st Full tree with 
normal test and train sets, 2nd Pruned tree with 
normal test and train sets, and 3rd Pruned tree 
with transformed (MinMaxScaler) train and test 
sets.
1. Decision Tree (Unpruned) with normal test and train sets:
* Accuracy: 73.34 %
2. Decision Tree – pruned by trying max_depth values from 1 to 19, using the normal train and test sets.
* Accuracy: 83.04 %
* max_depth: 4
3. Decision Tree – pruned by trying max_depth values from 1 to 19, this time using the transformed (MinMaxScaler) train and test sets.
* Accuracy: 83.06 %
* max_depth: 3
4. After plotting the confusion matrix (Fig. 6) with transformed sets, we were able to get:
* Accuracy: 57.21 % 
* Precision: 24.61 %
* Recall: 47.10 %

The ROC curve shows an “Area Under the Curve” (AUC) of 0.55. The AUC can be used to compare this model with the other models. These results are poor: the ROC plot clearly shows that the area under the curve is small.

# **4. Random Forest:**

In [None]:
from sklearn.ensemble import RandomForestClassifier

> ***Simple Random Forest with transformed data:***

In [None]:
# Random forest with default settings, trained on the MinMax-scaled features.
rf = RandomForestClassifier().fit(xtrain_scaler, y_train)   # fit() returns the estimator itself
rf_pred = rf.predict(xtest_scaler)
rf_accuracy = metrics.accuracy_score(y_test, rf_pred)
print("Accuracy: {0}".format(rf_accuracy))

> ***Hyperparameter Tuned Random Forest with transformed data:***

In [None]:
# Grid-search the number of trees for a depth-3 random forest on the
# MinMax-scaled features (5-fold cross-validation on the training split).
estimators = [10,50,80,100,150,200,250,300]
hyperParam = [dict(n_estimators=estimators)]
rf = RandomForestClassifier(max_depth=3,random_state=5)

gsv = GridSearchCV(estimator=rf, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(xtrain_scaler, y_train)               # fit() returns the search object itself
rf_pred_mms = gsv.best_estimator_.predict(xtest_scaler)    # predictions of the refitted best estimator

print("Best HyperParameter: ", gsv.best_params_)
# score() delegates to the refitted best estimator: this is its test-set accuracy.
print("Best Accuracy :", gsv.score(xtest_scaler, y_test))

**Confusion Matrix of Hyperparameter Tuned Random Forest model with transformed data:**

In [None]:
# Confusion-matrix plot and text report for the tuned random forest
# (search object `gsv`, predictions in `rf_pred_mms`).
plot_confusion_matrix(gsv, xtest_scaler, y_test)

conf_metr = confusion_matrix(y_test, rf_pred_mms)
print("Confusion Matrix: \n {}".format(conf_metr))
print(metrics.classification_report(y_test,rf_pred_mms))

# Headline metrics, printed in a fixed order.
for _metric_label, _metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Recall/Sensitivity/True Positive Rate:", metrics.recall_score),
    ("Precision:", metrics.precision_score),
):
    print(_metric_label, _metric_fn(y_test, rf_pred_mms))

In [None]:
# ROC curve of the fitted grid-search object `gsv`, evaluated on the scaled test set.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator — confirm the installed version.
metrics.plot_roc_curve(gsv, xtest_scaler, y_test)

Next, we used Random Forest. We used two 
models of Random Forest for the dataset. Both 
times, we used transformed (MinMaxScaler) 
train and test sets. 1st model is Simple Random 
Forest and 2nd model is hyperparameter tuned 
Random Forest model.
1. During our first simple model, we found results as:
* Accuracy: 79.44 %
2. For the second model we used hyperparameter tuning: we set max_depth to 3 (because depth 3 gave the best accuracy for the decision tree), random_state = 5, and searched n_estimators over the values [10,50,80,100,150,200,250,300]. This model is hyperparameter tuned using sklearn’s GridSearchCV.
* Accuracy: 81.80 %
* n_estimator: 10
3. After plotting confusion matrix with transformed sets, we were able to get:
* Accuracy: 81.80 % 
* Precision: 69.05 %
* Recall: 25.44 %

The ROC curve shows an “Area Under the Curve” (AUC) of 0.77. The AUC can be used to compare this model with the other models. Larger values on the Y-axis (true positive rate) indicate more true positives and fewer false negatives.

# **5. SVC (Support Vector Classifier):**

In [None]:
from sklearn.svm import SVC

> ***Tuned Model with transformed data:***

In [None]:
# Grid-search the SVC kernel on the MinMax-scaled features
# (5-fold cross-validation on the training split).
kernels = ['rbf','linear','poly','sigmoid']
hyperParam = [dict(kernel=kernels)]
svc = SVC()

gsv = GridSearchCV(estimator=svc, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(xtrain_scaler, y_train)                # fit() returns the search object itself
svc_pred_mms = gsv.best_estimator_.predict(xtest_scaler)    # predictions of the refitted best estimator

print("Best HyperParameter: ", gsv.best_params_)
# score() delegates to the refitted best estimator: this is its test-set accuracy.
print("Best Accuracy :", gsv.score(xtest_scaler, y_test))

**Confusion Matrix Tuned Model with transformed data:**

In [None]:
# Confusion-matrix plot and text report for the tuned SVC
# (search object `gsv`, predictions in `svc_pred_mms`).
plot_confusion_matrix(gsv, xtest_scaler, y_test)

conf_metr = confusion_matrix(y_test, svc_pred_mms)
print("Confusion Matrix: \n {}".format(conf_metr))
print(metrics.classification_report(y_test,svc_pred_mms))

# Headline metrics, printed in a fixed order.
for _metric_label, _metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Recall/Sensitivity/True Positive Rate:", metrics.recall_score),
    ("Precision:", metrics.precision_score),
):
    print(_metric_label, _metric_fn(y_test, svc_pred_mms))

In [None]:
# ROC curve of the fitted grid-search object `gsv`, evaluated on the scaled test set.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator — confirm the installed version.
metrics.plot_roc_curve(gsv, xtest_scaler, y_test)

Next, we used Support Vector Machine for this dataset. This model is tuned using 4 kernel values [‘rbf’, ‘linear’, ‘poly’ and ‘sigmoid’].
1. Used this model with transformed (MinMaxScaler) train and test sets.
* Accuracy: 81.64 %
* Kernel: poly
2. After plotting confusion matrix with transformed sets, we were able to get:
* Accuracy: 81.64 % 
* Precision: 69.71 %
* Recall: 23.48 %

The ROC curve shows an “Area Under the Curve” (AUC) of 0.69. The AUC can be used to compare this model with the other models; here the ROC curve shows that the area under the curve is comparatively small.


# **Bagging with all classifiers using Cross Validation:**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Default-configured base classifiers, created in a fixed order.
knn, lg, dt, rf, svc = (
    KNeighborsClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
)
clf_array = [knn, lg, dt, rf, svc]

# For each classifier, compare its plain 10-fold CV accuracy with the accuracy
# of a bagged ensemble built around it.
for clf in clf_array:
    clf_name = clf.__class__.__name__
    cc_scores = cross_val_score(clf, x, y, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, max_samples=0.25, max_features=10, random_state=3)
    bagging_scores = cross_val_score(bagging_clf, x, y, cv=10, n_jobs=-1)

    for template, scores in (
        ("Accuracy of: {1:.3f}, std: (+/-) {2:.3f} [{0}]", cc_scores),
        ("Accuracy of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n", bagging_scores),
    ):
        print(template.format(clf_name, scores.mean(), scores.std()))

In [None]:
# Hard-voting ensemble over the five base classifiers, then 10-fold CV accuracy
# for each base classifier and for the ensemble on the training split.
named_models = [('KNN', knn), ('Logistic Regression', lg), ('Decision Tree', dt), ('Random Forest', rf), ('SVC', svc)]
eclf = VotingClassifier(estimators=list(named_models), voting='hard')

for label, clf in named_models + [('Ensemble', eclf)]:
    scores = cross_val_score(clf, x_train, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Next, we used the bagging method, which trains 
each model on different random subsets of the 
data and combines their predictions to determine 
the result. We used all 5 ML models (KNN, 
Logistic Regression, Decision Tree, Random 
Forest and SVM). We configured BaggingClassifier 
for each of the 5 ML models with max_samples=0.25, 
max_features=10 and random_state=3. For 
cross_val_score, we passed cv = 10 and 
n_jobs = -1. The results show that with bagging 
the accuracy increases for KNN, Logistic 
Regression and Decision Tree, and the standard 
deviation decreases for KNN, Decision Tree 
and Random Forest.
To choose the best classifier, we use 
Sklearn’s VotingClassifier, which combines 
different ML classifiers and performs a vote 
across all of them.
From the results we can observe that 
RandomForest had the best accuracy, 81.30 %, 
with a very low standard deviation of 0.01.

Note: Accuracy and Std deviation may change slightly after each run.

# **Boosting with all classifiers using Cross Validation:**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import EnsembleVoteClassifier

In [None]:
# Base classifiers are re-created here as in the earlier cells; they are not
# referenced again in this cell.
knn, lg, dt, rf, svc = (
    KNeighborsClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
)

# The three boosting classifiers that are actually evaluated.
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()
boost_array = [ada_boost, grad_boost, xgb_boost]

# Hard-voting ensemble over the three boosters.
eclf = EnsembleVoteClassifier(clfs=list(boost_array), voting='hard')
labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

# 10-fold CV accuracy for each booster and for the ensemble.
for clf, label in zip(boost_array + [eclf], labels):
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print("Accuracy: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Lastly, we used the boosting technique. 
Boosting is not random: the models are built 
sequentially, and each model depends on the 
performance of the previous models. We used 
the Ada Boost Classifier, the Gradient Boosting 
Classifier and the XG Boost Classifier. The 5 
ML models (KNN, Logistic, Decision Tree, 
Random Forest and SVM) are instantiated in 
this cell as well, but only the 3 boosters are 
evaluated. Each booster is scored with 
cross_val_score using cv = 10 and 
scoring = ‘accuracy’. We also combined all 3 
boosters (Ada boost, Gradient boost and XG 
Boost) in mlxtend’s ‘EnsembleVoteClassifier’ 
with voting = ‘hard’, which performs a 
majority vote across the classifiers and is 
scored in the same way.
According to the results, Gradient boost came 
out to be best with 82.10 % accuracy and 0.011 
standard deviation.

Note: Accuracy and Std deviation may change slightly after each run.

# **Conclusion:**
The first dataset is the Credit Card dataset, 
which contains records from Taiwan from April 2005 to September 2005. This dataset was 
highly imbalanced and needed resampling or 
transformation. We used MinMaxScaler to 
transform our train and test data. After applying 
5 machine learning models with normal data 
and transformed data, it can be easily observed 
that all models performed well 
with the transformed train and test data. KNN, 
Logistic Regression and RandomForest gave 
the best accuracy, around 82 %. Logistic 
Regression was best with 82.68 % accuracy, 
precision 84.27 % and recall 95.92 % (these two 
figures appear to be for the non-default class; 
for the default class, precision was 68.74 % and 
recall 33.37 %, as reported above). After 
applying bagging, it was observed that 
RandomForest came out to be the best with 
81.30 % accuracy and a very low 0.01 standard 
deviation. Lastly, we applied three boosting 
methods: Ada boost, Gradient boost and XG 
boost. Out of these three, Gradient boost 
performed best when both accuracy and 
standard deviation were compared with the 
other boosting methods. For Gradient boost, 
we got 82.10 % accuracy and 0.011 
standard deviation.

Note: Accuracy and Std deviation may change slightly after each run.