In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,PrecisionRecallDisplay
import xgboost as xgb

In [None]:
# Load the churn dataset from a path relative to the notebook's working directory.
df=pd.read_csv('../datasets/Churn_Modelling.csv',index_col=False)

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
df.columns

In [None]:
# Print the value distribution of every column of df, one column at a time.
# NOTE(review): this loops over all columns, so any identifier-like column with
# mostly unique values will produce a long, low-value listing.
for col in df.columns:
    print(df[col].value_counts())

In [None]:
# Distinct values, then summary statistics, of the product-count column.
df['NumOfProducts'].unique()

In [None]:
df['NumOfProducts'].describe()

In [None]:
# Histograms (100 bins each) of the numeric columns for a first look at the distributions.
df.hist(figsize=(20,12),bins=100)

In [None]:
# Work on a copy without the three identifier/name columns; df itself is left untouched.
df_churn=df.drop(['RowNumber','CustomerId','Surname'],axis=1)

In [None]:
df_churn.describe()

In [None]:
# Number of customers per 'Exited' value, split by gender.
sns.countplot(x='Exited',hue='Gender',data=df_churn)

In [None]:
# Pairwise correlations between the numeric columns only.
# df_churn still contains the string columns 'Geography' and 'Gender'; calling
# .corr() on the whole frame raises on pandas >= 2.0, so select numeric dtypes first.
corr=df_churn.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix; the colour scale is capped at 0.8.
f,ax=plt.subplots(figsize=(20,12))
sns.heatmap(data=corr,annot=True,vmax=0.8,cmap='Blues',ax=ax)

In [None]:
# Exit counts split by three customer attributes, one panel per attribute.
# A 3x1 grid is used: the previous 4x1 layout (subplot 411..413) only filled three
# of its four rows and left the bottom quarter of the figure empty.
fig,axes=plt.subplots(3,1,figsize=(20,12))
for panel,column in zip(axes,['HasCrCard','IsActiveMember','Tenure']):
    sns.countplot(x=column,hue='Exited',data=df_churn,ax=panel)
# Keep axis labels of neighbouring panels from overlapping.
fig.tight_layout()

In [None]:
# Exit counts per country, drawn on a large single-axes figure.
fig_geo,ax_geo=plt.subplots(figsize=(20,12))
sns.countplot(data=df_churn,x='Geography',hue='Exited',ax=ax_geo)

In [None]:
# Compare the estimated-salary density of customers who exited with those who stayed.
plt.figure(figsize=(20,12))
# `fill=True` replaces the deprecated `shade=True` argument of seaborn.kdeplot.
for exited_flag,label,color in [(1,'Exited','r'),(0,'Stayed','b')]:
    sns.kdeplot(df_churn.loc[df_churn['Exited']==exited_flag,'EstimatedSalary'],label=label,color=color,fill=True)

plt.xlabel('EstimatedSalary')
# The per-curve labels are only visible once a legend is drawn.
plt.legend()

### Feature Engineering

In [None]:
# Re-check column dtypes before encoding: the object-typed columns need converting to numbers.
df_churn.info()

1. `toarray()` converts the sparse matrix returned by `OneHotEncoder.fit_transform` into a dense NumPy `ndarray`.

In [None]:
#We can use OneHotEncoder to transform categorical variables into numerical variables.
onehot=OneHotEncoder()
# Everything except the two categorical columns is already numeric.
X_number=df_churn.drop(['Geography','Gender'],axis=1)
# fit_transform returns a sparse matrix; toarray() makes it dense so it fits in a DataFrame.
X_category=onehot.fit_transform(df_churn[['Geography','Gender']]).toarray()
# Name the encoded columns (e.g. 'Geography_France') and reuse df_churn's index.
# Unnamed columns would be the integers 0..n-1, which gives the combined feature frame
# mixed int/str column labels (rejected by recent scikit-learn) and hides their meaning.
X_cat=pd.DataFrame(X_category,columns=onehot.get_feature_names_out(),index=df_churn.index)

In [None]:
# Put the encoded categorical columns and the numeric columns side by side.
# concat(axis=1) aligns rows on the index, so both frames must share the same index.
X_all=pd.concat([X_cat,X_number],axis=1)

In [None]:
X_all.head()

In [None]:
#split the dataset
# Target vector, taken from the un-encoded frame.
y=df_churn['Exited']
# Remove the target from the feature matrix. errors='ignore' keeps this cell re-runnable:
# without it a second run raises KeyError because 'Exited' has already been dropped.
X_all=X_all.drop(columns=['Exited'],errors='ignore')

In [None]:
# Rescale every feature with MinMaxScaler (default output range [0, 1]).
# NOTE(review): the scaler is fitted on all rows of X_all, so its min/max are computed
# from every row, including any that are later held out for testing — confirm intended.
scaler=MinMaxScaler()
X_all_scaled=scaler.fit_transform(X_all)

In [None]:
# fit_transform returns a NumPy array, so column names are not carried over.
X_all_scaled

In [None]:
#splitting the X_all
# Split the *unscaled* features first, then fit the scaler on the training rows only and
# apply it to the test rows. Scaling before the split lets test-set minima/maxima leak
# into preprocessing. random_state and test_size are unchanged, so the row partition is
# the same as before; only the scaling statistics differ.
X_train_raw,X_test_raw,y_train,y_test=train_test_split(X_all,y,test_size=0.3,random_state=0)
split_scaler=MinMaxScaler().fit(X_train_raw)
X_train=split_scaler.transform(X_train_raw)
X_test=split_scaler.transform(X_test_raw)

In [None]:
# Sanity check: number of held-out labels (test_size=0.3 of the rows).
y_test.shape

In [None]:
#Logistic Regression
# Baseline model with default hyperparameters; fit() returns the fitted estimator itself.
logreg=LogisticRegression().fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [None]:
# Share of test rows classified correctly, expressed as a percentage.
logreg_accuracy=100*accuracy_score(y_test,y_pred)
print('Accuracy for the logistic regression is : {:.2f}%'.format(logreg_accuracy))

In [None]:
#confusion matrix heatmap
con_matrix=confusion_matrix(y_test,y_pred)
# fmt='d' prints the cell counts as plain integers; with the default format, counts
# in the thousands are rendered in scientific notation (e.g. 2.3e+03).
sns.heatmap(con_matrix,annot=True,fmt='d')

In [None]:
#classification report
# Per-class precision, recall and F1 for the current predictions in y_pred.
logreg_report=classification_report(y_test,y_pred)
print(logreg_report)

### RandomForest Classification HyperParameter Tuning

In [None]:
## We can try with another model
# Random Forest
# 100 trees with a fixed seed; fit() hands back the fitted forest.
rf=RandomForestClassifier(random_state=0,n_estimators=100).fit(X_train,y_train)

# From here on y_pred holds the random-forest predictions.
y_pred=rf.predict(X_test)


In [None]:
#confusion matrix heatmap
con_matrix=confusion_matrix(y_test,y_pred)
# fmt='d' prints the cell counts as plain integers; with the default format, counts
# in the thousands are rendered in scientific notation (e.g. 2.3e+03).
sns.heatmap(con_matrix,annot=True,fmt='d')

In [None]:
# Per-class precision, recall and F1 for the current predictions in y_pred.
rf_report=classification_report(y_test,y_pred)
print(rf_report)

In [None]:
# Share of test rows classified correctly, expressed as a percentage.
rf_accuracy=100*accuracy_score(y_test,y_pred)
print('Accuracy for the Random Forest Classifier is : {:.2f}%'.format(rf_accuracy))

#### Using Scikit-Learn’s RandomizedSearchCV method, we can define a grid of hyperparameter ranges, and randomly sample from the grid, performing K-Fold CV with each combination of values.

In [None]:
from pprint import pprint
# Fresh, unfitted forest used only to list the available hyperparameters and their defaults.
# Note that this rebinds `rf`, so the name no longer refers to any previously fitted model.
rf=RandomForestClassifier(random_state=42)

pprint(rf.get_params())

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter explored by the randomized search.
# Number of trees: ten evenly spaced values over [0, 1000], with 0 removed because a
# forest needs at least one tree (n_estimators=0 makes every such candidate fail).
n_estimators=[int(x) for x in np.linspace(start=0,stop=1000,num=10) if int(x)>0]
# Number of features considered when looking for the best split.
max_features=['sqrt','log2']
# Maximum tree depth: 10, 20, ..., 110, plus None for unlimited depth.
max_depth=[int(x) for x in np.linspace(start=10,stop=110,num=11)]
max_depth.append(None)
# Minimum samples required to split an internal node / to be at a leaf.
min_samples_split=[2,5,10]
min_samples_leaf=[1,2,4]
# Whether each tree is built on a bootstrap sample of the training rows.
bootstrap=[True,False]

random_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
}
pprint(random_grid)

In [None]:
# Randomized search: 100 sampled hyperparameter combinations, each scored with 3-fold CV.
# The base estimator is seeded as well; otherwise the forests themselves are random and the
# selected parameters change from run to run even though the sampling (random_state=42) is fixed.
rf_new=RandomForestClassifier(random_state=42)
rf_random=RandomizedSearchCV(estimator=rf_new,param_distributions=random_grid,n_iter=100,cv=3,random_state=42)

In [None]:
# Run the search on the training split. This is the slow cell of the notebook:
# it fits one forest per sampled combination and per CV fold.
rf_random.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the search.
best_params=rf_random.best_params_

In [None]:
best_params

In [None]:
# Refit a forest with the hyperparameters selected by the randomized search.
# They are unpacked from best_params rather than copied by hand, so this cell cannot
# silently drift from the search result when the search is re-run.
rfc=RandomForestClassifier(**best_params,random_state=0)
rfc.fit(X_train,y_train)

# Test-set predictions of the tuned forest (kept separate from y_pred).
y_predc=rfc.predict(X_test)


In [None]:
# Share of test rows the tuned forest classifies correctly, as a percentage.
rfc_accuracy=100*accuracy_score(y_test,y_predc)
print('Accuracy for the Random Forest Classifier is : {:.2f}%'.format(rfc_accuracy))

### XGBoost Classifier

In [None]:
# Gradient-boosted trees via XGBoost, using the library's default hyperparameters.
model=xgb.XGBClassifier()
model.fit(X_train,y_train)

In [None]:
# Hard class predictions for the test split.
pred_xgb=model.predict(X_test)

In [None]:
## Evaluation of this model
xgb_accuracy=100*accuracy_score(y_test,pred_xgb)
print('Accuracy of XGBoost classifier on test set: {:.2f}%'.format(xgb_accuracy))

In [None]:
# Confusion matrix of the XGBoost predictions on the test split.
cm=confusion_matrix(y_test,pred_xgb)
# fmt='d' prints the cell counts as plain integers; with the default format, counts
# in the thousands are rendered in scientific notation (e.g. 2.3e+03).
sns.heatmap(cm,annot=True,fmt='d')

In [None]:
# Per-class precision, recall and F1 for the XGBoost predictions.
xgb_report=classification_report(y_test,pred_xgb)
print(xgb_report)

#### Summary of per-class metrics on the test set

| Class (`Exited`) | Model | Precision | Recall | F1-score |
|------|------|------|------|------|
| 0 | Logistic Regression | 0.88 | 0.96 | 0.92 |
| 1 | Logistic Regression | 0.76 | 0.51 | 0.61 |
| 0 | RandomForestClassifier | 0.88 | 0.96 | 0.92 |
| 1 | RandomForestClassifier | 0.76 | 0.51 | 0.61 |
| 0 | RandomForestClassifier | 0.88 | 0.96 | 0.92 |
| 0 | XGBoost Classifier | 0.89 | 0.95 | 0.91 |
| 1 | XGBoost Classifier | 0.72 | 0.53 | 0.61 |

In [None]:
# Look at the raw frame's columns and dtypes again before building the second pipeline.
df.info()

In [None]:
# Second modelling pass: dict-based features with a train / validation / test split.
# Start from the frame loaded at the top of the notebook instead of reading the CSV again:
# the second read used a different relative path ('datasets/...' vs '../datasets/...'),
# so only one of the two could resolve from any given working directory.
df_churn2=df.drop(['RowNumber','CustomerId','Surname'],axis=1)

# Feature columns, grouped by type; 'Exited' is the target and is in neither list.
categorical=['Geography','Gender']
numerical=['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Hold out 30% of the rows as the test set.
df_train_all,df_test=train_test_split(df_churn2,test_size=0.3,random_state=0)

# Note: y_test, y_train and X_train below rebind the names used by the earlier models.
y_train_all=df_train_all.Exited.astype(int).values
y_test=df_test.Exited.astype(int).values

# Split the remaining rows again (70/30) into training and validation sets.
df_train,df_val=train_test_split(df_train_all,test_size=0.3,random_state=0)
y_train=df_train.Exited.astype(int).values
y_val=df_val.Exited.astype(int).values

##Use Dict Vectorizer to transform categorical variables into numerical variables

# The class DictVectorizer can be used to convert feature arrays represented as lists of standard 
# Python dict objects to the NumPy/SciPy representation used by scikit-learn estimators.

# One dict per training row, restricted to the feature columns (the target is excluded).
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

# Fit the vectorizer on the training rows only and encode them.
dv=DictVectorizer()
X_train=dv.fit_transform(train_dicts)

In [None]:
# Forest (100 trees, fixed seed) trained on the DictVectorizer features of the training rows.
rf = RandomForestClassifier(n_estimators = 100, random_state = 0)
rf.fit(X_train, y_train)


In [None]:
# Encode the validation rows with the vectorizer fitted on the training rows, then keep
# the second column of predict_proba, i.e. the predicted probability of the second class.
val_dicts=df_val[categorical+numerical].to_dict(orient='records')
X_val=dv.transform(val_dicts)
y_pred = rf.predict_proba(X_val)[:,1]

In [None]:
y_pred

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the predicted probabilities on the validation set.
roc_auc_score(y_val, y_pred)

In [None]:
from sklearn.pipeline import make_pipeline

# Bundle vectorizer and forest into one estimator that is fitted on, and later fed,
# plain feature dicts.
pipeline=make_pipeline(
DictVectorizer(),RandomForestClassifier(n_estimators=100,random_state=0)
)
pipeline.fit(train_dicts,y_train)

In [None]:
# Same validation check as above, this time going through the pipeline end to end.
y_pred=pipeline.predict_proba(val_dicts)[:,1]

roc_auc_score(y_val,y_pred)

### Saving the model


In [None]:
import pickle
from pathlib import Path

# Serialize the fitted pipeline so it can be loaded later for inference.
model_path=Path('models/pipeline.bin')
# Create the target folder first: open(..., 'wb') does not create missing directories
# and would raise FileNotFoundError when 'models/' does not exist yet.
model_path.parent.mkdir(parents=True,exist_ok=True)
with open(model_path,'wb') as f:
    pickle.dump(pipeline,f)

### Testing out the model


In [None]:
# Feature values for one example customer, keyed by the training column names.
# The target 'Exited' is deliberately not part of the payload: it is what the model
# predicts and it is not one of the feature columns, so passing it along was misleading.
# (This example customer's recorded label was Exited = 0.)
customer = {'CreditScore': 597,
 'Geography': 'Germany',
 'Gender': 'Female',
 'Age': 35,
 'Tenure': 8,
 'Balance': 131101.04,
 'NumOfProducts': 1,
 'HasCrCard': 1,
 'IsActiveMember': 1,
 'EstimatedSalary': 192852.67}

In [None]:
customer

In [None]:
# Predicted probability for this one customer: row 0, column 1 of predict_proba.
# NOTE(review): a single dict (not a list of dicts) is passed to the pipeline —
# confirm the installed scikit-learn's DictVectorizer accepts a lone mapping.
pipeline.predict_proba(customer)[0,1]