In [76]:
import pandas as pd
import numpy as np

Churn = pd.read_csv(r'../input/bank-customers/Churn Modeling.csv')

Churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [77]:
Churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [78]:
X = Churn.iloc[:, 3 : 13]

In [79]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [80]:
y = Churn['Exited']

In [81]:
geography = pd.get_dummies(X['Geography'], drop_first = True)
gender = pd.get_dummies(X['Gender'], drop_first = True)

In [82]:
X = pd.concat([X, geography], axis = 1)
X = X.drop(['Geography'], axis = 1)
X['Gender'] = gender

In [83]:
num_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [84]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

for i in num_features:
    k = np.array(X[i])
    k = k.reshape(-1, 1)
    K = scaler.fit_transform(k)
    X[i] = K

In [85]:
X.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain
0,0.538,0,0.324324,0.2,0.0,0.0,1,1,0.506735,0,0
1,0.516,0,0.310811,0.1,0.334031,0.0,0,1,0.562709,0,1
2,0.304,0,0.324324,0.8,0.636357,0.666667,1,0,0.569654,0,0
3,0.698,0,0.283784,0.1,0.0,0.333333,0,0,0.46912,0,0
4,1.0,0,0.337838,0.2,0.500246,0.0,1,1,0.3954,0,1


In [86]:
from imblearn.over_sampling import SMOTE
from collections import Counter

sampler = SMOTE(sampling_strategy = 0.8)

Xs, ys = sampler.fit_resample(X, y)

Counter(ys)

Counter({1: 6370, 0: 7963})

In [87]:
X = Xs
y = ys

In [88]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

model = RandomForestClassifier(n_estimators = 1000)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

ac = accuracy_score(predictions, y_test)

cr = classification_report(predictions, y_test)

print(cr)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1595
           1       0.86      0.89      0.88      1272

    accuracy                           0.89      2867
   macro avg       0.89      0.89      0.89      2867
weighted avg       0.89      0.89      0.89      2867



In [90]:
from sklearn.svm import SVC

model2 = SVC(gamma = 1)

model2.fit(X_train, y_train)

predictions2 = model2.predict(X_test)

ac2 = accuracy_score(predictions2, y_test)

cr2 = classification_report(predictions2, y_test)

print(cr2)

              precision    recall  f1-score   support

           0       0.86      0.77      0.82      1724
           1       0.70      0.81      0.75      1143

    accuracy                           0.79      2867
   macro avg       0.78      0.79      0.78      2867
weighted avg       0.80      0.79      0.79      2867



In [91]:
from sklearn.linear_model import LogisticRegression

model3 = LogisticRegression()

model3.fit(X_train, y_train)

predictions3 = model3.predict(X_test)

ac3 = accuracy_score(predictions3, y_test)

cr3 = classification_report(predictions3, y_test)

print(cr3)

              precision    recall  f1-score   support

           0       0.78      0.71      0.74      1700
           1       0.63      0.71      0.66      1167

    accuracy                           0.71      2867
   macro avg       0.70      0.71      0.70      2867
weighted avg       0.72      0.71      0.71      2867



In [92]:
from joblib import dump

dump(model, r'./RFModel.joblib')
dump(model2, r'./svc.joblib')
dump(model3, r'./LogReg.joblib')

['./LogReg.joblib']