In [3]:
import pandas as pd

In [4]:
import sklearn

In [5]:
df = pd.read_csv(r"H:\bank_churn_prediction\Bank_Customer_Churn_Prediction.csv")

In [6]:
df.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
df.drop(['customer_id','country','gender','age'],axis = 1,inplace= True)

In [8]:
df.head()

Unnamed: 0,credit_score,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,2,0.0,1,1,1,101348.88,1
1,608,1,83807.86,1,0,1,112542.58,0
2,502,8,159660.8,3,1,0,113931.57,1
3,699,1,0.0,2,0,0,93826.63,0
4,850,2,125510.82,1,1,1,79084.1,0


In [9]:
df.columns


Index(['credit_score', 'tenure', 'balance', 'products_number', 'credit_card',
       'active_member', 'estimated_salary', 'churn'],
      dtype='object')

In [10]:
X = df.drop( 'churn',axis = 1)
y = df[ 'churn']

In [11]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()

In [12]:
X[['credit_score','balance','estimated_salary']] = st.fit_transform(X[['credit_score','balance','estimated_salary']])

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=23)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [16]:
log = LogisticRegression()
log.fit(X_train,y_train)
y_pred = log.predict(X_test)
accuracy_score(y_test,y_pred)

0.802

## Feature Selection

In [17]:
from sklearn.feature_selection import RFE

In [18]:
# for logistics regression
rfe = RFE(estimator=log, n_features_to_select=7) 
rfe = rfe.fit(X.values, y.values) 

# mask of selected features
print(rfe.support_)
# The feature ranking, such that ranking_[i] corresponds to the ranking position of the i-th feature 
print(rfe.ranking_)

[ True  True  True  True  True  True  True]
[1 1 1 1 1 1 1]


In [19]:
mask = rfe.support_.tolist()
selected_feats = [b for a,b in zip(mask, X.columns) if a]
selected_feats

['credit_score',
 'tenure',
 'balance',
 'products_number',
 'credit_card',
 'active_member',
 'estimated_salary']

## Hyper Paramter Tuning

In [20]:
import numpy as np

In [21]:
# Obtaining class weights based on the class samples imbalance ratio
_, num_samples = np.unique(y_train, return_counts=True)
weights = np.max(num_samples)/num_samples

# Define weight dictionnary
weights_dict = dict()
class_labels = [0,1]

# Weights associated with classes
for a,b in zip(class_labels,weights):
    weights_dict[a] = b

weights_dict

{0: 1.0, 1: 3.8750761730652044}

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
para_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10,20,50, 100, 1000],  # Regularization parameter
    'solver': ['liblinear', 'newton-cg', 'sag', 'saga',"lbfgs"],
    'penalty':['none', 'elasticnet', 'l1', 'l2'],
    "max_iter": [100,500,1000,2000]}

grid_search = GridSearchCV( log, para_grid, cv=10, verbose=True, n_jobs=-1) #scoring='accuracy',
best_GS = grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Accuracy Score: {grid_search.best_score_:.4f}")

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


3240 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "h:\bank_churn_prediction\venvst\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "h:\bank_churn_prediction\venvst\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "h:\bank_churn_prediction\venvst\lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "h:\bank_churn_prediction\venvst\lib\site-packages\sklearn\linear_model\

Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'none', 'solver': 'newton-cg'}
Best Accuracy Score: 0.7949


In [24]:
# Best Hyperparameters: {'C': 0.001, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# Best Accuracy Score: 0.7949

In [25]:
log_tuned = LogisticRegression(C=0.001, max_iter=100, penalty="l1",class_weight=weights_dict, solver="liblinear" )
log_tuned.fit(X_train,y_train)
y_pred = log_tuned.predict(X_test)
accuracy_score(y_test,y_pred)

0.802

In [26]:
log_tuned = LogisticRegression(C=1, max_iter=100, penalty="l2", solver="liblinear" )
log_tuned.fit(X_train,y_train)
y_pred = log_tuned.predict(X_test)
accuracy_score(y_test,y_pred)

0.802

### Deploy the Project in Streamlit

In [27]:
import pickle

In [28]:
pickle.dump(log, open('model_churn.pkl','wb'))

In [29]:
model_final = pickle.load(open('model_churn.pkl','rb'))

In [30]:
df.head()

Unnamed: 0,credit_score,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,2,0.0,1,1,1,101348.88,1
1,608,1,83807.86,1,0,1,112542.58,0
2,502,8,159660.8,3,1,0,113931.57,1
3,699,1,0.0,2,0,0,93826.63,0
4,850,2,125510.82,1,1,1,79084.1,0


In [31]:
model_final.predict([[699,1,0.00,2,0,0,93826.63]])[0]




1

In [32]:
log_tuned.predict([[619,2,0.00,1,1,1,101348.88]])[0]



1

In [33]:
sklearn.__version__

'1.3.1'