## Churn prediction
Churn prediction is a problem when customers/employees leave company for different reason. First, employers want to predict who is likely to leave, and further determine how to minimize this problem. This small example illustrate such problem when the customers of a big international bank, who decided to leave (Exited) from the bank.
First model here is an attempt to use neural network to built predictive model using Keras withou additional features. Dataset from kaggle Churn prediction

This is an exploration of applying neural networks to predict customer churn using binary classification.  To evaluate the model, we use cross validation.  

In [2]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

seed = 7
np.random.seed(seed)

Using TensorFlow backend.


In [3]:
df = pd.read_csv("/home/tri/Downloads/Churn_Modelling.csv")
df.drop(['RowNumber','Surname'],axis=1,inplace=True)
#df=df.apply(LabelEncoder().fit_transform)
fields= ['Geography','Gender']
for f in fields:
    df[[f]]=df[[f]].apply(LabelEncoder().fit_transform)
df.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [4]:
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(11, input_dim=11, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [5]:
#Baseline without encoding
x=df.values[:,0:11].astype(float)
y=df.values[:,11]
est = KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=20, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
res = cross_val_score(est, x, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (res.mean()*100, res.std()*100))

Baseline: 44.06% (29.03%)


In [6]:
# pipeline without label encoding
np.random.seed(seed)
estimators = [('standardize', StandardScaler()),\
              ('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=20, verbose=0))]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, x, y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Standardized: 85.25% (0.75%)


In [9]:
# with label encoding
encoded_y  =  LabelEncoder().fit_transform(y)
estimators = [('standardize', StandardScaler()),\
              ('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=20, verbose=0))]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Standardized: 84.86% (0.98%)


 This code credits kaggler phillipo //www.kaggle.com/filippoo/deep-learning-az-ann

In [11]:
def create_smaller():
    model = Sequential()
    model.add(Dense(5, input_dim=11, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimators = [('standardize', StandardScaler()),\
              ('mlp', KerasClassifier(build_fn= create_smaller, nb_epoch=10, batch_size=20, verbose=0))]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
print("Smaller: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Smaller: 83.61% (1.45%)


In [12]:
def create_larger():
    # create model
    model = Sequential()
    model.add(Dense(11, input_dim=11, activation='relu'))
    model.add(Dense(5, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
estimators = [('standardize', StandardScaler()),\
              ('mlp', KerasClassifier(build_fn= create_larger, nb_epoch=10, batch_size=20, verbose=0))]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Larger: 85.27% (0.96%)


## Other predictive modeling

In [4]:
df = pd.read_csv("/home/tri/Downloads/Churn_Modelling.csv")
df.drop(['CustomerId','RowNumber','Surname'],axis=1,inplace=True)
fields= ['Geography','Gender']
for f in fields:
    df[[f]]=df[[f]].apply(LabelEncoder().fit_transform)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['Exited'])],df['Exited'],test_size=0.3,\
                                                    random_state=42,stratify=df['Exited'])

## Second model is a logic regression

In [14]:
from sklearn.linear_model import LogisticRegression

estimators = [('standardize', StandardScaler()),\
              ('logistic',LogisticRegression(solver='liblinear',C=100) )]
pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
#results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
#print("Logistic Regression: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Logistic Regression: 80.84% (0.86%)


In [12]:
from sklearn.ensemble import RandomForestClassifier
estimators = [('standardize', StandardScaler()),('clf',RandomForestClassifier(criterion='gini') )]
#estimators = [('standardize', StandardScaler()),('clf',RandomForestClassifier(criterion='gini',\
# class_weight={0:0.3},1:0.7) )]
para = {'clf__n_estimators':(100,1000),
        'clf__max_depth':(5,10),
        'clf__min_samples_split':(2,3),
        'clf__min_samples_leaf':(1,2)
       }
pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
#results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
#print("Random Forest: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
cv = GridSearchCV(pipeline, para,cv=3,scoring='accuracy',n_jobs=-1)
cv.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__n_estimators': (100, 1000), 'clf__min_samples_leaf': (1, 2), 'clf__max_depth': (5, 10), 'clf__min_samples_split': (2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [13]:
print('Best param')
best_para= cv.best_estimator_.get_params()
pred = cv.predict(X_test)
print('Accuracy ',accuracy_score(pred,y_test))

Best param
Accuracy  0.868333333333


In [22]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Setup the pipeline with the required steps: steps
steps = [('standardize', StandardScaler()),
         ('SVM',SVC())]  
para = {'SVM__C':[1, 10, 100], 'SVM__gamma':[0.1, 0.01]}
pipeline = Pipeline(steps)

In [27]:
cv = GridSearchCV(pipeline, para,cv=3)
cv.fit(X_train,y_train)
y_pred = cv.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.859333333333
             precision    recall  f1-score   support

          0       0.86      0.98      0.92      2389
          1       0.84      0.38      0.52       611

avg / total       0.86      0.86      0.84      3000



In [15]:
from sklearn.ensemble import GradientBoostingClassifier
estimators = [('standardize', StandardScaler()),('clf',GradientBoostingClassifier(loss='deviance',learning_rate=0.5) )]
#estimators = [('standardize', StandardScaler()),('clf',RandomForestClassifier(criterion='gini',\
# class_weight={0:0.3},1:0.7) )]
para = {'clf__n_estimators':(1000,2000),
        'clf__max_depth':(3,5),
        'clf__min_samples_split':(2,3),
        'clf__min_samples_leaf':(1,2)
       }
pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
#results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
#print("Random Forest: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
cv = GridSearchCV(pipeline, para,cv=3,scoring='accuracy',n_jobs=-1)
cv.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_de...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__n_estimators': (1000, 2000), 'clf__min_samples_leaf': (1, 2), 'clf__max_depth': (3, 5), 'clf__min_samples_split': (2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [17]:
print('Best param',cv.best_estimator_.get_params())
pred = cv.predict(X_test)
print('Accuracy ',accuracy_score(pred,y_test))

Best param {'clf__min_impurity_split': None, 'clf__criterion': 'friedman_mse', 'steps': [('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))], 'clf__min_impurity_decrease': 0.0, 'clf__max_leaf_nodes': None, 'clf__max_features': None, 'clf__random_state': None, 'clf__init': None, 'clf__loss': 'deviance', 'clf__subsample': 1.0, 'standardize': StandardScaler(copy=True, with_mean=True, with_std=True), 'clf__learning_rate': 0.5, 'standardize__copy': True, 'standardize__with_mean': True, 'clf__min

Addition, we need to implement hyperparameter

In [21]:
import xgboost as xgb
estimators = [('standardize', StandardScaler()),('clf',xgb.XGBClassifier(learning_rate=0.05) )]
#estimators = [('standardize', StandardScaler()),('clf',RandomForestClassifier(criterion='gini',\
# class_weight={0:0.3},1:0.7) )]
para = {'clf__n_estimators':(1000,2000),
        'clf__max_depth':(2,4)
    
               }
pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
#results = cross_val_score(pipeline, x, encoded_y, cv=kfold)
#print("Random Forest: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
cv = GridSearchCV(pipeline, para,cv=3,scoring='accuracy',n_jobs=-1)
cv.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__n_estimators': (1000, 2000), 'clf__max_depth': (2, 4)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [22]:
print('Best param',cv.best_estimator_.get_params())
pred = cv.predict(X_test)
print('Accuracy ',accuracy_score(pred,y_test))

Best param {'clf__base_score': 0.5, 'steps': [('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))], 'clf__colsample_bytree': 1, 'clf__scale_pos_weight': 1, 'clf__colsample_bylevel': 1, 'clf__silent': True, 'clf__reg_lambda': 1, 'clf__reg_alpha': 0, 'clf__subsample': 1, 'standardize': StandardScaler(copy=True, with_mean=True, with_std=True), 'clf__learning_rate': 0.05, 'clf__missing': None, 'standardize__copy': True, 'standardize__with_mean': True, 'clf__gamma': 0, 'clf': XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=2,
       min_child_wei