# Model

The purpose of this script is to model churn based on engineered features.
It splits the transformed feature data into a training set and a test set, fits a classifier on the training set, and evaluates it on the test set.

Run this script after features.ipynb

Requires the features_all.p file produced by features.ipynb
Produces results.p, results.tsv, results_int.p and results_int.tsv (plus the plots auc.png and cm.png)

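Features are: email, call, meeting, email_first, call_first, meeting_first, email_last, call_last, meeting_last, avg_interval, period_duration_sum, period_count and days_since_last_touch, plus tenant_id as a categorical variable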

Scaling is performed using MinMaxScaler

Categorical variables (e.g. tenant_id) are encoded using OneHotEncoder

Train_test_split was used to split the data into a training set and testing set

Several modeling approaches are tried including \
LogisticRegression, LinearSVC, KNeighborsClassifier, RandomForestClassifier

Cross-validation was performed (usually using built-in settings of the modeling object)

Evaluation was done using confusion matrix, ROC curve (and AUC metric)



# Load Libraries

In [None]:
#Import libraries
%matplotlib inline
import logging
import collections
import datetime as dt
import sys
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.figure 
import seaborn as sns

from sklearn.preprocessing import *
from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.pipeline import Pipeline

# Load data

Note data cleaning was done in previous scripts.

In [None]:
# Load the feature table written by the feature-engineering notebook.
# The file is opened in a `with` block so the handle is closed as soon as
# the data has been read.
# NOTE: unpickling can execute arbitrary code -- only load files you
# produced yourself.
with open("features_all.p", "rb") as features_file:
    dfFeatures = pickle.load(features_file)
dfFeatures.head()

In [None]:
# Here's an opportunity to subset for particular groups of interest
#dfFeatures = dfFeatures[dfFeatures.period_count==1] # clients who have only had one subscription

# Data cleaning (double check)

In [None]:
# Check for empty values
# print dfFeatures.info()
# dfFeatures = dfFeatures.dropna()
# print dfFeatures.info()

In [None]:
# Fix the dtype of every column used downstream.
# Identifiers are cast to str so they are handled as labels, not numbers.
# Each column is cast from ITSELF: the previous version filled
# 'period_duration_mean' from 'period_duration_sum' (copy/paste slip), which
# silently replaced the mean with the sum.
# NOTE(review): casting a mean to int truncates any fractional part --
# confirm that int is really the intended dtype for 'period_duration_mean'.
column_dtypes = [
    (str, ['tenant_id', 'client_id']),
    (float, ['call', 'email', 'meeting', 'avg_interval',
             'call_first', 'email_first', 'meeting_first',
             'call_last', 'email_last', 'meeting_last']),
    (int, ['period_duration_sum', 'period_duration_mean', 'period_count',
           'active_count', 'churned', 'days_since_last_touch']),
]
for target_dtype, column_names in column_dtypes:
    for column_name in column_names:
        dfFeatures[column_name] = dfFeatures[column_name].astype(target_dtype)

dfFeatures.info()

In [None]:
#dfFeatures.ix[3,:]

In [None]:
# Define labels, target and feature list.
# Despite their names, X_lbl holds the tenant ids and y_lbl the client ids;
# y is the churn flag as a plain array.
X_lbl = dfFeatures['tenant_id']
y_lbl = dfFeatures['client_id']
y = dfFeatures['churned'].as_matrix()

# Feature sets tried previously (kept for reference):
#   agg, first & last:
#     ['email','call','meeting','email_first','call_first','meeting_first','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count']
#   agg & last:
#     ['email','call','meeting','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count']
#   just last:
#     ['email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count']
#   just agg:
#     ['email','call','meeting','avg_interval','days_since_last_touch','period_count','period_duration_sum']
#   just agg without period count:
#     ['email','call','meeting','avg_interval','days_since_last_touch','period_duration_sum']
#   just agg with period duration mean:
#     ['email','call','meeting','avg_interval','period_duration_sum','period_duration_mean','period_count']

# Feature set currently in use.
xlist = [
    'email', 'call', 'meeting',
    'email_first', 'call_first', 'meeting_first',
    'email_last', 'call_last', 'meeting_last',
    'avg_interval', 'period_duration_sum', 'period_count',
    'days_since_last_touch',
]

print(xlist)
X = dfFeatures[xlist]

# Assess Class Balance

In [None]:
# Assess balance between classes: count the rows of each class and print
# the churned share as a percentage rounded to one decimal place.
churnedCount = (dfFeatures.churned == 1).sum()  # Class 1 churn
engagedCount = (dfFeatures.churned == 0).sum()  # Class 0 not churn
for class_count in (churnedCount, engagedCount):
    print(class_count)
print(str(round(float(churnedCount) / float(churnedCount + engagedCount) * 100, 1)) + '%')

# Scale data

In [None]:
# Scale data
# Numeric features are rescaled with MinMaxScaler (StandardScaler was also tried).
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Building the frame from the scaled array gives it a fresh 0..n-1 index.
dfFeaturesScaled = pd.DataFrame(X_scaled, columns=xlist)

# Attach the target and the identifiers POSITIONALLY (.values). Assigning the
# Series directly would align on dfFeatures' index, so after any row filtering
# the ids would come out as NaN -- and the fillna below would hide that.
dfFeaturesScaled['churned'] = y
dfFeaturesScaled['tenant_id'] = X_lbl.values
dfFeaturesScaled['client_id'] = y_lbl.values
dfFeaturesScaled = dfFeaturesScaled.fillna(999)

y = dfFeaturesScaled.churned.values

# tenant_id goes first so that it is column 0 of the feature matrix
# (the position the one-hot encoder is configured for).
# NOTE: this mutates xlist in place, so re-running this cell without
# re-running the feature-list cell would insert tenant_id twice.
xlist.insert(0, 'tenant_id')
X = dfFeaturesScaled[xlist]

# Keep DataFrame/Series copies for the dashboard export at the end.
X_original = X
y_original = y
y_original_lbl = dfFeaturesScaled.client_id

In [None]:
# Convert the feature DataFrame to a plain array before splitting/modelling.
X=X.as_matrix()

In [None]:
# Display the feature matrix as the cell output.
X

In [None]:
# Display the target vector as the cell output.
y

# Create train and test sets

In [None]:
# Split data into a training set and a test set (default 75% / 25%).
# stratify=y keeps the churned/engaged ratio the same in both sets, and a
# fixed random_state makes the split -- and every metric and export computed
# from it -- reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

In [None]:
# Show contents of raw data, training set and test set:
# each array is printed under its name, with a blank line between sections.
named_arrays = [
    ('X', X),
    ('y', y),
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
]
for position, (array_name, array_values) in enumerate(named_arrays):
    if position > 0:
        print('')  # blank separator line
    print(array_name)
    print(array_values)

In [None]:
# Check sizes of raw data, training set and test set:
# print each array's name followed by its shape.
shape_report = (
    ('X', X),
    ('y', y),
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for array_name, array_values in shape_report:
    print(array_name)
    print(array_values.shape)

# Encode Categoricals

In [None]:
# Encode categoricals
# Only column 0 is one-hot encoded; presumably that is tenant_id, which is
# placed first in the feature list -- verify if the column order changes.
enc = OneHotEncoder(categorical_features=[0,])
# The encoder is fitted on the full X (train + test rows), not on X_train only.
# NOTE(review): X_enc receives the return value of fit(), which in
# scikit-learn is normally the fitted estimator itself rather than encoded
# data -- confirm before using X_enc as a matrix.
X_enc = enc.fit(X)
# Encoded versions of the two splits, used for training and evaluation.
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

# Feature Exploration

In [None]:
# Chi-squared statistic on feature importance.
# Computed on the un-encoded training matrix; prints the per-column test
# statistics first, then the matching p-values.
from sklearn.feature_selection import chi2
scores, pvalues = chi2(X_train, y_train)
for chi2_output in (scores, pvalues):
    print(chi2_output)

In [None]:
# Recursive feature elimination
# RFE is asked to keep a single feature, so ranking_ gives a full ordering
# of the columns (rank 1 = the feature that survives).
clf = LogisticRegressionCV(class_weight='balanced', cv=5)
rfe = RFE(clf, n_features_to_select=1)
rfe.fit(X_train, y_train)

# Column positions and names first, then the RFE mask and ranks, so the four
# printed rows can be read against each other.
print(range(len(xlist)))
print(xlist)
print(rfe.support_)
print(rfe.ranking_)
# Result recorded from an earlier run (2 features kept, by definition):
# after RFE the most important features were
# period_duration_sum, period_duration_mean
# NOTE(review): period_duration_mean is not in the current xlist -- re-check
# this conclusion against the current output.

In [None]:
#Feature correlation
# Heatmap of the pairwise correlations of the unscaled feature columns.
# NOTE(review): xlist[0:-1] drops only the LAST entry of xlist. If
# 'tenant_id' has already been inserted at position 0, the slice keeps
# tenant_id and leaves out the final feature -- confirm this is intended
# (xlist[1:] would drop tenant_id instead).
ax = sns.heatmap(dfFeatures[xlist[0:-1]].corr())


# Train a model

In [None]:
# Several options make sense for modeling, including logistic regression, linear support vector classifier,
# K-nearest neighbors, and random forest.
# Exactly one `clf = ...` line should be active; the others are kept as ready-made alternatives.

clf = LogisticRegressionCV(cv=5,class_weight='balanced') #,penalty='l1',solver='liblinear')
#clf = LinearSVC(class_weight='balanced')
#clf = KNeighborsClassifier() #NO CLASS WEIGHT
#clf = RandomForestClassifier(class_weight='balanced') # n_estimators=15,


# Pipeline methods can be used to bundle feature selection process and classification
# clf = Pipeline([
#   ('feature_selection', SelectFromModel(linear_model.LogisticRegressionCV(cv=5,class_weight='balanced',penalty='l2'))),
#   ('classification', svm.LinearSVC(class_weight='balanced'))
# ])

# clf = Pipeline([
#   ('feature_selection', SelectFromModel(linear_model.LogisticRegressionCV(cv=5,class_weight='balanced',penalty='l2'))),
#   ('classification', ensemble.RandomForestClassifier(n_estimators=15,class_weight='balanced'))
# ])

def train_and_evaluate(clf, X_train, y_train):
    """Fit ``clf`` on the training data and report its cross-validation scores.

    The cross-validation scores used to be computed and then thrown away;
    they are now printed so that the "evaluate" half of this function
    actually produces output.

    Parameters:
        clf: an unfitted scikit-learn style estimator (``fit`` is called on it).
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The same ``clf`` object, fitted on the full training set.
    """
    clf.fit(X_train, y_train)

    # Uses cross_val_score's default folds and the estimator's default
    # scorer; pass e.g. cv=KFold(n_splits=5, shuffle=True) here to change that.
    scores = cross_val_score(clf, X_train, y_train)
    print('cross-validation scores')
    print(scores)
    print('mean: %0.3f (+/- %0.3f)' % (scores.mean(), scores.std()))

    return clf

#scores

In [None]:
# Train model
# Fits the selected classifier on the one-hot encoded training matrix
# (the trailing comment shows the un-encoded alternative).
train_and_evaluate(clf,X_train_enc,y_train) #X_train,y_train)

# Evaluate Performance on a Test Set

In [None]:
# Testing and evaluation # including ROC curve

def _plot_roc_curve(y_test, y_scores, filename='auc.png'):
    """Compute the ROC curve / AUC for ``y_scores`` and save the plot to ``filename``."""
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    lw = 2
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=lw,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig(filename)
    plt.close()


def TestAndEvaluate(X_test, y_test, clf):
    """Evaluate a fitted classifier on the test set and return its predictions.

    Prints whatever the classifier supports (decision function values,
    class-1 probabilities, hard predictions, score, feature importances,
    coefficients) and saves a ROC curve to ``auc.png`` when continuous
    scores are available.

    Optional capabilities are probed with ``getattr``/``hasattr`` instead of
    catching every exception, so an unsupported method is skipped quietly
    while a genuine failure inside a supported method is no longer hidden.
    The diagnostics that are purely informational (score, ROC plot, feature
    selection summary) stay best-effort: a failure there is logged and the
    predictions are still returned.

    Parameters:
        X_test: test feature matrix, in the same encoding used for training.
        y_test: true labels of the test set.
        clf: fitted classifier; must provide ``predict``.

    Returns:
        The output of ``clf.predict(X_test)``.

    Raises:
        Whatever ``clf.predict`` raises -- without predictions there is
        nothing meaningful to return.
    """
    # Continuous outputs (available for e.g. logistic regression, LinearSVC).
    y_decision_function = None
    decision_function = getattr(clf, 'decision_function', None)
    if decision_function is not None:
        y_decision_function = decision_function(X_test)
        print('y_decision_function')
        print(y_decision_function)

    # Probability of the positive class (available for e.g. KNN, random forest).
    y_predict_proba = None
    predict_proba = getattr(clf, 'predict_proba', None)
    if predict_proba is not None:
        y_predict_proba = predict_proba(X_test)[:, 1]
        print('y_predict_proba')
        print(y_predict_proba)

    # Hard class predictions: required, so errors propagate to the caller.
    y_predict = clf.predict(X_test)
    print('y_predict')
    print(y_predict)

    try:
        y_score = clf.score(X_test, y_test)
        print('y_score')
        print(y_score)
    except Exception as err:
        logging.exception(err)

    # As before, probabilities win over decision-function values for the ROC
    # curve when the classifier offers both.
    roc_scores = y_predict_proba if y_predict_proba is not None else y_decision_function
    if roc_scores is None:
        logging.warning('No continuous scores available; ROC curve skipped')
    else:
        try:
            _plot_roc_curve(y_test, roc_scores)
        except Exception as err:
            logging.exception(err)

    try:
        features = SelectFromModel(clf, prefit=True)
        print(features)
    except Exception as err:
        logging.exception(err)

    # These attributes are not available for all types of classifiers.
    for heading, attribute in (('Feature importances', 'feature_importances_'),
                               ('coef', 'coef_')):
        if hasattr(clf, attribute):
            print(heading)
            print(getattr(clf, attribute))

    return y_predict

In [None]:
# Confusion matrix

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues,
                          classes=('engaged', 'churned'), filename='cm.png'):
    """Save a row-normalised confusion-matrix heat map to ``filename``.

    Parameters:
        cm: confusion matrix as a 2-D numpy array (rows = true class,
            columns = predicted class); it is normalised so every row sums
            to 1 before plotting.
        title: plot title.
        cmap: matplotlib colour map.
        classes: tick labels, in the same order as the rows/columns of ``cm``.
        filename: path of the image file to write.
    """
    print('plot_confusion_matrix')
    # Divide each row by its total, so cells show the share of each true class.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    tick_marks = np.arange(len(classes))

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False)
    # Called after ticks and labels exist so the layout can make room for them.
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()
    return

def matrix_and_pars(ytest,ypred):
    """Print precision, recall and F1 for the churned class and plot the confusion matrix.

    The metrics are computed for class 1 (churned), the class this model is
    meant to detect. The previous version used ``cm[0][0]`` as the true
    positives, which reported the figures for class 0 (engaged) under the
    names "precision" and "recall".

    Parameters:
        ytest: true labels (0 = engaged, 1 = churned).
        ypred: predicted labels.
    """
    print('matrix_and_pars')
    # labels=[0, 1] pins the 2x2 layout even if one class is missing from the
    # data: rows are the true class, columns the predicted class.
    cm = confusion_matrix(ytest, ypred, labels=[0, 1])
    true_positives = float(cm[1][1])
    false_positives = float(cm[0][1])
    false_negatives = float(cm[1][0])

    # Each ratio falls back to 0.0 when its denominator is empty (e.g. the
    # model never predicts churn) instead of producing nan/inf.
    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    F1 = 2*precision*recall/(recall+precision) if (recall+precision) else 0.0

    print('recall: %0.3f precision: %0.3f F1: %0.3f' % (recall, precision, F1))
    print('%d %d' % (cm[0][0], cm[0][1]))
    print('%d %d' % (cm[1][0], cm[1][1]))
    plot_confusion_matrix(cm)
    return

In [None]:
# Evaluate the trained classifier on the encoded test set and keep its
# predictions for the confusion matrix.
#y_pred = measure_performance(X_test_enc, y_test, clf)
y_pred = TestAndEvaluate(X_test_enc, y_test, clf)


In [None]:
# Generate confusion matrix
# Compares the true test labels with the predictions stored in y_pred.

matrix_and_pars(y_test,y_pred)

# Output of data for dashboard

In [None]:
#Run on all data (for dashboard)
# Score every row (train + test) with the fitted classifier.
X_all = enc.transform(X)
y_predict = clf.predict(X_all)
y_predict_proba = clf.predict_proba(X_all)

# reset_index() adds an 'index' column (0..n-1) that the merges below use
# as the common key.
dfYOriginalLbl = pd.DataFrame(y_original_lbl).reset_index()
dfYOriginal = pd.DataFrame(y_original).reset_index()
dfOriginal = X_original.reset_index()

# Predictions: hard class, plus one probability column per class
# (column 0 -> churn_no, column 1 -> churn_yes).
dfYPredict = pd.DataFrame(y_predict).rename(columns={0: 'churn_pred'})
dfYPredictProba = pd.DataFrame(y_predict_proba).rename(
    columns={0: 'churn_no', 1: 'churn_yes'})

# Features + predictions are joined by row position, then the client id and
# the actual label are merged in on the shared 'index' column.
dfResult = dfYPredictProba.join(dfYPredict)
dfResult = dfOriginal.join(dfResult)
dfResult = pd.merge(dfYOriginalLbl, dfResult)
dfResult = pd.merge(dfResult, dfYOriginal)
dfResult = dfResult.rename(columns={0: 'churn_actual'})
dfResult = dfResult.drop('index', axis=1)

# `with` guarantees the pickle is flushed and the handle closed.
with open("results.p", "wb") as results_file:
    pickle.dump(dfResult, results_file)
dfResult.to_csv('results.tsv', sep='\t')
dfResult.head()


In [None]:
# Second export: the same predictions attached to the unscaled feature table.
# NOTE(review): the merge matches on the columns the two frames share
# (presumably client_id) and the joins below are positional, so this
# assumes one row per client in dfFeatures, in the original order -- verify.
dfResult = pd.merge(dfYOriginalLbl, dfFeatures)
dfResult = dfResult.join(dfYPredictProba)
dfResult = dfResult.join(dfYPredict)
# Kept from the first export; it only has an effect if a column named 0 exists.
dfResult = dfResult.rename(columns={0: 'churn_actual'})
dfResult = dfResult.drop('index', axis=1)

# `with` guarantees the pickle is flushed and the handle closed.
with open("results_int.p", "wb") as results_file:
    pickle.dump(dfResult, results_file)
dfResult.to_csv('results_int.tsv', sep='\t')
dfResult.head()

# Conclusion

The likelihood of churn is driven by the total duration of the relationship more than by other factors and, while predictions based on this feature are reliable, cadence of communication remains a challenging area for exploration.