In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sea 

In [59]:
import tensorflow as tf 
from tensorflow import keras

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,KFold,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [3]:
df=pd.read_csv('Churn_Modelling.csv')

In [6]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
df.drop(['RowNumber','CustomerId','Surname'],axis=1,inplace=True)

In [7]:
df.shape

(10000, 11)

In [8]:
df.nunique()

CreditScore         460
Geography             3
Gender                2
Age                  70
Tenure               11
Balance            6382
NumOfProducts         4
HasCrCard             2
IsActiveMember        2
EstimatedSalary    9999
Exited                2
dtype: int64

In [9]:
df['Gender'].value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

In [10]:
df_categorical=df.select_dtypes(['object'])

In [11]:
df_categorical.shape

(10000, 2)

In [12]:
for i in df_categorical.columns:
    df[i]=pd.get_dummies(df_categorical[i])

In [13]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,1,1,42,2,0.0,1,1,1,101348.88,1
1,608,0,1,41,1,83807.86,1,0,1,112542.58,0
2,502,1,1,42,8,159660.8,3,1,0,113931.57,1
3,699,1,1,39,1,0.0,2,0,0,93826.63,0
4,850,0,1,43,2,125510.82,1,1,1,79084.1,0


In [18]:
x=df.drop('Exited',axis=1)
y=df['Exited']

In [19]:
x.shape,y.shape

((10000, 10), (10000,))

In [20]:
std=StandardScaler()
var1=std.fit_transform(x)
x_std=pd.DataFrame(var1,columns=x.columns)

In [21]:
x_std.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.326221,0.997204,1.095988,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886
1,-0.440036,-1.002804,1.095988,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534
2,-1.536794,0.997204,1.095988,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687
3,0.501521,0.997204,1.095988,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918
4,2.063884,-1.002804,1.095988,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276


In [27]:
x_train,x_test,y_train,y_test=train_test_split(x_std,y,random_state=0)

In [35]:
models_list=[('LR',LogisticRegression()),('RF',RandomForestClassifier(n_estimators=4)),('DT',DecisionTreeClassifier()),
             ('KNN',KNeighborsClassifier())]

In [36]:
models_list

[('LR', LogisticRegression()),
 ('RF', RandomForestClassifier(n_estimators=4)),
 ('DT', DecisionTreeClassifier()),
 ('KNN', KNeighborsClassifier())]

In [25]:
kfold=KFold(n_splits=5,shuffle=True,random_state=0)

In [26]:
model=LogisticRegression()

In [30]:
cv=cross_val_score(model,x_train,y_train,cv=kfold)

In [31]:
cv

array([0.814     , 0.79133333, 0.81666667, 0.82266667, 0.80666667])

In [32]:
cv.max()

0.8226666666666667

In [33]:
results=[]
names=[]

In [37]:
for name,model in models_list:
    kfold=KFold(n_splits=10,shuffle=True,random_state=0)
    cv_results=cross_val_score(model,x_train,y_train,cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(f"{name} :{cv_results.mean()} : {cv_results.std()}")

LR :0.8102666666666668 : 0.012663859338026991
RF :0.8336 : 0.008979977728257465
DT :0.7818666666666666 : 0.011142710621747301
KNN :0.8258666666666666 : 0.009394087975364559


In [38]:
n_estimator=np.array([50,100,150,200])
max_features=np.array([3,5,7,9])
param_grid=dict(n_estimators=n_estimator,max_features=max_features)

In [39]:
model=RandomForestClassifier()

In [41]:
kfold=KFold(n_splits=5,shuffle=True,random_state=0)

In [42]:
grid=GridSearchCV(model,param_grid=param_grid,scoring='accuracy',cv=kfold)

In [43]:
grid_result=grid.fit(x_train,y_train)

In [45]:
grid_result.best_score_

0.8542666666666665

In [44]:
grid_result.best_params_

{'max_features': 3, 'n_estimators': 200}

In [46]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.850933 (0.006148) with: {'max_features': 3, 'n_estimators': 50}
0.854000 (0.008475) with: {'max_features': 3, 'n_estimators': 100}
0.853067 (0.005682) with: {'max_features': 3, 'n_estimators': 150}
0.854267 (0.007690) with: {'max_features': 3, 'n_estimators': 200}
0.849600 (0.004763) with: {'max_features': 5, 'n_estimators': 50}
0.849867 (0.007971) with: {'max_features': 5, 'n_estimators': 100}
0.851733 (0.007138) with: {'max_features': 5, 'n_estimators': 150}
0.851067 (0.008760) with: {'max_features': 5, 'n_estimators': 200}
0.850667 (0.009522) with: {'max_features': 7, 'n_estimators': 50}
0.850133 (0.006116) with: {'max_features': 7, 'n_estimators': 100}
0.848933 (0.006148) with: {'max_features': 7, 'n_estimators': 150}
0.849467 (0.006984) with: {'max_features': 7, 'n_estimators': 200}
0.848000 (0.006505) with: {'max_features': 9, 'n_estimators': 50}
0.849333 (0.005249) with: {'max_features': 9, 'n_estimators': 100}
0.848667 (0.005530) with: {'max_features': 9, 'n_estimators': 150}

In [47]:
from xgboost import XGBClassifier

In [48]:
model=XGBClassifier()

In [49]:
model.get_params().keys()

dict_keys(['objective', 'use_label_encoder', 'base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'gpu_id', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'monotone_constraints', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

In [50]:
kfold=KFold(n_splits=8,shuffle=True,random_state=0)
n_estimators=np.array([50,100,125,150,175,200])
param_grid=dict(n_estimators=n_estimators)

In [51]:
param_grid

{'n_estimators': array([ 50, 100, 125, 150, 175, 200])}

In [52]:
grid=GridSearchCV(model,param_grid=param_grid,scoring='accuracy',cv=kfold)

In [53]:
grid_result=grid.fit(x_train,y_train)





































































































































































































In [55]:
grid_result.best_params_

{'n_estimators': 50}

In [57]:
grid_result.best_score_

0.8490670504012943

In [58]:
x_train,x_test,y_train,y_test=train_test_split(x_std,y,test_size=0.2,random_state=0)

In [60]:
import keras 
from keras.models import Sequential
from keras.layers import Dense,LeakyReLU,ReLU,Dropout

In [69]:
model=Sequential()

model.add(Dense(units=6,activation='relu',input_dim=10))
model.add(Dense(6,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [70]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 6)                 66        
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 7         
Total params: 115
Trainable params: 115
Non-trainable params: 0
_________________________________________________________________


In [71]:
history=model.fit(x_train,y_train,validation_split=0.33,batch_size=10,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [75]:
y_pred_train=model.predict(x_train)

In [76]:
y_pred_train=(y_pred_train>0.5)

In [77]:
accuracy_score(y_train,y_pred_train)

0.856125

In [78]:
y_pred_test=model.predict(x_test)
y_pred_test=(y_pred_test>0.5)

In [79]:
accuracy_score(y_test,y_pred_test)

0.8555