In [2]:
# Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [7]:
train=pd.read_csv('C:\\Users\\sarit\\Downloads\\Algorithms_projects\\Churn_Modelling.csv')

In [8]:
train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
# converting gender to 0 and 1 
# drop non numeric columns 

Gender=pd.get_dummies(train['Gender'],drop_first=True)
train=pd.concat([train,Gender], axis=1)
train.drop(['RowNumber','Surname','Geography','Gender'], axis=1, inplace=True)

In [10]:
x=train.drop('Exited',axis=1)  # independent
y=train.Exited  # dependent

In [11]:
x.head()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Male
0,15634602,619,42,2,0.0,1,1,1,101348.88,0
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,42,8,159660.8,3,1,0,113931.57,0
3,15701354,699,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,43,2,125510.82,1,1,1,79084.1,0


In [12]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [13]:
# split data into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=101)

In [14]:
from sklearn.ensemble import AdaBoostClassifier

In [15]:
model = AdaBoostClassifier(random_state=1)

In [16]:
model.fit(X_train,y_train)

AdaBoostClassifier(random_state=1)

In [17]:
pred_1 = model.predict(X_test)

In [18]:
model.score(X_train,y_train) # training Accuracy

0.85625

In [19]:
print(accuracy_score(pred_1,y_test)) # testing Accuracy

0.8535


In [20]:
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [21]:
print(classification_report(pred_1,y_test))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91      1724
           1       0.48      0.74      0.58       276

    accuracy                           0.85      2000
   macro avg       0.72      0.80      0.75      2000
weighted avg       0.89      0.85      0.87      2000



# Adaboost using Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,
                                                      max_features=4,
                                                      min_samples_leaf=20),
                                n_estimators=200)

In [24]:
cls=classifier.fit(X_train,y_train)

In [25]:
pred_2 = classifier.predict(X_test)

In [26]:
print(accuracy_score(pred_2,y_test))

0.8555


In [27]:
print(classification_report(pred_2,y_test))

              precision    recall  f1-score   support

           0       0.96      0.87      0.91      1728
           1       0.48      0.75      0.58       272

    accuracy                           0.86      2000
   macro avg       0.72      0.81      0.75      2000
weighted avg       0.89      0.86      0.87      2000



In [28]:
from sklearn.model_selection import StratifiedKFold

In [29]:
st=StratifiedKFold(n_splits=10)
from sklearn.model_selection import cross_val_score

In [30]:
results=cross_val_score(cls,x,y,cv=st)

In [31]:
results

array([0.846, 0.846, 0.858, 0.837, 0.852, 0.847, 0.852, 0.852, 0.85 ,
       0.843])

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
lr=LogisticRegression()

In [34]:
classifier=AdaBoostClassifier(lr, n_estimators=200)

In [35]:
classifier.fit(X_train,y_train)

AdaBoostClassifier(base_estimator=LogisticRegression(), n_estimators=200)

In [36]:
pred10=classifier.predict(X_test)

In [37]:
print(classification_report(pred10,y_test))

              precision    recall  f1-score   support

           0       1.00      0.79      0.88      2000
           1       0.00      0.00      0.00         0

    accuracy                           0.79      2000
   macro avg       0.50      0.39      0.44      2000
weighted avg       1.00      0.79      0.88      2000



In [38]:
from imblearn.over_sampling import SMOTE
sm=SMOTE()


In [39]:
x_sm,y_sm=sm.fit_resample(x,y)

In [40]:
x_train,x_test,y_train,y_test=train_test_split(x_sm,y_sm,test_size=.2,random_state=51)

In [41]:
classifier.fit(x_train,y_train)

AdaBoostClassifier(base_estimator=LogisticRegression(), n_estimators=200)

In [42]:
pred=classifier.predict(x_test)

In [43]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.46      0.60      0.52      1243
           1       0.69      0.56      0.62      1943

    accuracy                           0.57      3186
   macro avg       0.58      0.58      0.57      3186
weighted avg       0.60      0.57      0.58      3186



In [44]:
classifier.fit(x_train,y_train)

AdaBoostClassifier(base_estimator=LogisticRegression(), n_estimators=200)

In [45]:
classifier.predict(x_test)

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [46]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.46      0.60      0.52      1243
           1       0.69      0.56      0.62      1943

    accuracy                           0.57      3186
   macro avg       0.58      0.58      0.57      3186
weighted avg       0.60      0.57      0.58      3186



# XGBoost

In [47]:
!pip install xgboost



In [48]:
import xgboost as xgb

In [49]:
xg_model = xgb.XGBClassifier(random_state=1,
                             learning_rate=0.01)

In [51]:
xg_model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, ...)

In [52]:
pred_3 = xg_model.predict(x_test)

In [53]:
print(accuracy_score(pred_3,y_test))

0.8025737602008789


In [54]:
new_xbg_model = xgb.XGBClassifier(base_score=0.2, booster='gbtree', colsample_bylevel=0.5,
                              colsample_bynode=0.5, colsample_bytree=0.5, gamma=0, gpu_id=-1,
                              importance_type='gain', interaction_constraints='',
                              learning_rate=0.05, max_delta_step=0, max_depth=3,
                              min_child_weight=1, monotone_constraints='()',
                              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
                              objective='binary:logistic', random_state=211, reg_alpha=0,
                              reg_lambda=1, scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1, verbosity=None)

In [55]:
new_xbg_model.fit(x_train,y_train)

XGBClassifier(base_score=0.2, booster='gbtree', callbacks=None,
              colsample_bylevel=0.5, colsample_bynode=0.5, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=211,
              reg_alpha=0, reg_lambda=1, ...)

In [56]:
pred_4 = new_xbg_model.predict(x_test)

In [57]:
print(accuracy_score(pred_4,y_test))

0.8182674199623352
