In [None]:
# The purpose of this script is to split the transformed feature data into a training and test set
# Requires features.p file from features.ipynb
# Categorical variables (e.g. tenant_id) are encoded
# ...
# ...

In [None]:
#Import libraries
%matplotlib inline
import logging
import collections
import datetime as dt
import sys
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.figure 
import seaborn as sns

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import mysql.connector

from sklearn.preprocessing import *
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.model_selection import *
# from sklearn.linear_model import *
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

In [None]:
# Create database engine
# NOTE(review): dbname and username are assigned here but the URL and the
# connect() call below use literal values instead -- confirm whether they are
# still needed.
dbname = 'cs'
username = 'rjf'
#engine = create_engine('postgresql://rjf@localhost:5432/cs_db')
# NOTE(review): this URL embeds a literal user:password pair, whereas the
# connector below takes its password from the PASSWD environment variable.
# 5432 is the conventional PostgreSQL port but the dialect here is MySQL --
# confirm the port and move the credentials out of the notebook.
engine = create_engine('mysql+mysqlconnector://mydb_user:rjf@localhost:5432/cs', echo=False)
print engine.url

# Raises KeyError when the PASSWD environment variable is not set.
passwd = os.environ["PASSWD"]
# Connect to database
conn = mysql.connector.connect(
         user='rjf',
         password=passwd,
         host='localhost',
         database='cs')

In [None]:
#Load features
# NOTE: unpickling can execute code embedded in the file -- only load a
# features.p that was produced by a trusted run of features.ipynb.
# The context manager closes the file handle as soon as loading finishes.
with open("features.p", "rb") as features_file:
    dfFeatures = pickle.load(features_file)
dfFeatures.head()

In [None]:
# Here's an opportunity to subset for particular groups of interest
#dfFeatures = dfFeatures[dfFeatures.period_count==1] # clients who have only had one subscription

In [None]:
# Check for empty values
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print only appended a stray "None" to the output.
dfFeatures.info()
dfFeatures = dfFeatures.dropna()
# Same report after dropping rows with missing values, for comparison.
dfFeatures.info()

In [None]:
# Assess balance between classes
#dfFeatures.churned.unique()
# Row count per class label: first class 1 (churn), then class 0 (not churn)
for churn_class in (1, 0):
    print(np.sum(dfFeatures.churned == churn_class))
# Noting here that the classes are imbalanced (6.5% churners, others 93.5% considered engaged for now)

In [None]:
#Fix type of variable
# dfFeatures['tenant_id'] = dfFeatures.tenant_id.astype(int)
# Identifiers are stored as strings.
dfFeatures['tenant_id'] = dfFeatures.tenant_id.astype(str)
dfFeatures['client_id'] = dfFeatures.client_id.astype(str)

dfFeatures['call'] = dfFeatures.call.astype(float)
dfFeatures['email'] = dfFeatures.email.astype(float)
dfFeatures['meeting'] = dfFeatures.meeting.astype(float)
# dfFeatures['call_first'] = dfFeatures.call_first.astype(float)
# dfFeatures['email_first'] = dfFeatures.email_first.astype(float)
# dfFeatures['meeting_first'] = dfFeatures.meeting_first.astype(float)
# dfFeatures['call_last'] = dfFeatures.call_last.astype(float)
# dfFeatures['email_last'] = dfFeatures.email_last.astype(float)
# dfFeatures['meeting_last'] = dfFeatures.meeting_last.astype(float)

dfFeatures['avg_interval'] = dfFeatures.avg_interval.astype(float)
dfFeatures['period_duration_sum'] = dfFeatures.period_duration_sum.astype(int)
# Cast the mean column from its own values.  It used to be filled from
# period_duration_sum (copy-paste slip), which silently replaced every mean
# with the sum.
if 'period_duration_mean' in dfFeatures.columns:
    # NOTE(review): int matches the dtype this cell always requested, but it
    # truncates fractional means -- confirm float is not wanted here.
    dfFeatures['period_duration_mean'] = dfFeatures['period_duration_mean'].astype(int)
else:
    # Input without a mean column: keep producing the column exactly as
    # before so the downstream result schema does not change.
    dfFeatures['period_duration_mean'] = dfFeatures['period_duration_sum'].astype(int)
dfFeatures['period_count'] = dfFeatures.period_count.astype(int)
dfFeatures['days_since_last_touch'] = dfFeatures.days_since_last_touch.astype(int)
dfFeatures['active_count'] = dfFeatures.active_count.astype(int)
dfFeatures['churned'] = dfFeatures.churned.astype(int)
dfFeatures.info()

In [None]:
#dfFeatures.ix[3,:]

In [None]:
# Define data types
#y=dfFeatures[['client_id','churned']]
# Row labels carried next to the model matrices; they are not model inputs.
y_lbl = dfFeatures['client_id']
X_lbl = dfFeatures['tenant_id']
# .values yields the same ndarray as Series.as_matrix(), which is deprecated
# and no longer exists in pandas >= 1.0.
y = dfFeatures.churned.values
#REORDER
xlist=['email','call','meeting','email_first','call_first','meeting_first','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count']
# Alternative feature sets tried earlier:
#xlist=('email','call','meeting','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count')
#xlist=['email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count']
#xlist=['email','call','meeting','avg_interval','days_since_last_touch','period_count','period_duration_sum'] #
#xlist=['email','call','meeting','avg_interval','days_since_last_touch','period_duration_sum'] #
# Feature frame restricted to, and ordered by, xlist.
X = dfFeatures[xlist] #,'active_count']]
#X = dfFeatures[['email','call','meeting','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count']] #,'active_count']]
#X = dfFeatures[['email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count','days_since_last_touch']] #,'active_count']]
#X.set_index('client_id')

In [None]:
# Number of model features; used further down to slice the coefficient vector
# and to position the bars of the importance plot.
xlen = len(xlist)

In [None]:
# Scale data
# Tried both MinMaxScaler() and StandardScaler()
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so the test rows influence the scaling applied to the training rows.
scaler = MinMaxScaler() #StandardScaler()
X_scaled = scaler.fit_transform(X)
# X_scaled is a plain array, so this frame gets a fresh 0..n-1 index.
dfFeaturesScaled = pd.DataFrame(X_scaled,columns=xlist)#,'active_count'])
#dfFeaturesScaled = pd.DataFrame(X_scaled,columns=['email','call','meeting','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count'])#,'active_count'])
#dfFeaturesScaled = pd.DataFrame(X_scaled,columns=['email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count','days_since_last_touch'])#,'active_count'])
dfFeaturesScaled['churned'] = y
# Attach the id labels by position (.values).  Assigning the Series directly
# aligns on dfFeatures' index labels, which may no longer be 0..n-1 after the
# earlier dropna(); rows then received another row's ids or NaN (and the NaN
# rows were silently removed by the dropna() below).
dfFeaturesScaled['tenant_id'] = X_lbl.values
dfFeaturesScaled['client_id'] = y_lbl.values
# Kept as a safety net; with positional labels it should not remove rows.
dfFeaturesScaled = dfFeaturesScaled.dropna()
dfFeatures['tenant_id'] = dfFeatures.tenant_id.astype(str)
dfFeatures['client_id'] = dfFeatures.client_id.astype(str)
#dfFeaturesScaled.head()
# .values replaces the deprecated Series.as_matrix(); same ndarray.
y = dfFeaturesScaled.churned.values
X = dfFeaturesScaled[xlist] #,'active_count']]
#X = dfFeaturesScaled[['email','call','meeting','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count','tenant_id']] #,'active_count']]
#X = dfFeaturesScaled[['email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count','days_since_last_touch','tenant_id']] #,'active_count']]
# Snapshots used by the result-assembly cells near the end of the notebook.
X_original=X
y_original=y
y_original_lbl=dfFeaturesScaled.client_id

In [None]:
# Convert the scaled feature frame to a plain ndarray for scikit-learn.
# Reading from X_original (the DataFrame snapshot taken in the scaling cell)
# keeps this cell re-runnable: X itself is rebound to an array here, and an
# array has neither as_matrix() nor .values-as-frame semantics on a second run.
# .values replaces the deprecated DataFrame.as_matrix().
X = X_original.values

In [None]:
# Display the target vector
y

In [None]:
#Split data into test set and training set
# stratify=y keeps the churn ratio the same in both splits (the classes are
# heavily imbalanced), and the fixed random_state makes the split -- and every
# metric computed from it -- reproducible across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Print the full data set and every split: a name line, then the values,
# with one blank line between consecutive sections.
named_arrays = (
    ('X', X),
    ('y', y),
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for position, (array_name, array_values) in enumerate(named_arrays):
    if position > 0:
        print('')
    print(array_name)
    print(array_values)

In [None]:
# Check sizes of test and training set
# One name line followed by the array's shape, for each split in turn.
for split_name, split_data in (('X_train', X_train),
                               ('X_test', X_test),
                               ('y_train', y_train),
                               ('y_test', y_test)):
    print(split_name)
    print(split_data.shape)

In [None]:
# Encode categoricals
# NOTE(review): X only holds the columns listed in xlist, so column index 0 is
# the scaled 'email' feature rather than an id column such as tenant_id --
# confirm this is the column that was meant to be one-hot encoded.
enc = OneHotEncoder(categorical_features=[0,])
# The encoder is fitted on the full X (training and test rows together).
# X_enc holds whatever fit() returns -- presumably the fitted encoder itself,
# not transformed data; TODO confirm.  It is not used by the cells below.
X_enc = enc.fit(X)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

In [None]:
# Several options make sense for modeling, including logistic regression, linear support vector classifier,
# K-nearest neighbors, and random forest
# class_weight='balanced' is requested wherever the estimator accepts it,
# because the churn class is a small minority (see the class-balance cell).

clf = linear_model.LogisticRegressionCV(cv=10,class_weight='balanced') #,penalty='l1',solver='liblinear')
#clf = svm.LinearSVC(class_weight='balanced')
#clf = neighbors.KNeighborsClassifier() #NO CLASS WEIGHT
#clf = ensemble.RandomForestClassifier(class_weight='balanced') # n_estimators=15,


# Pipeline methods can be used to bundle feature selection process and classification
# clf = Pipeline([
#   ('feature_selection', SelectFromModel(linear_model.LogisticRegressionCV(cv=5,class_weight='balanced',penalty='l2'))),
#   ('classification', svm.LinearSVC(class_weight='balanced'))
# ])

# clf = Pipeline([
#   ('feature_selection', SelectFromModel(linear_model.LogisticRegressionCV(cv=5,class_weight='balanced',penalty='l2'))),
#   ('classification', ensemble.RandomForestClassifier(n_estimators=15,class_weight='balanced'))
# ])

def train_and_evaluate(clf, X_train, y_train):
    """Fit ``clf`` on the training data and report its cross-validation scores.

    Parameters
    ----------
    clf : estimator
        Classifier to fit; it is fitted in place.
    X_train : array-like or sparse matrix
        Training feature matrix.
    y_train : array-like
        Training labels.

    Returns
    -------
    The same ``clf`` object, now fitted on the full training set.
    """
    clf.fit(X_train,y_train)
    #sample weight available for LogisticRegressionCV, linearSVC, RandomForestClassifier
    #cv = KFold(n_splits=5,shuffle=True,random_state=None)
    # cross_val_score is called with its default folds and the estimator's
    # default scorer.  The scores used to be computed and then dropped; they
    # are now reported so the (costly) computation is not wasted.
    scores = cross_val_score(clf, X_train, y_train) #,cv=cv)
    print('Cross-validation scores: %s' % (scores,))
    print('Mean CV score: %0.3f (std %0.3f)' % (scores.mean(), scores.std()))

    return clf

#scores

In [None]:
# Testing and evaluation
# (The module-level `global` declarations that used to sit here had no effect
# and were removed; every result below is local to the function.)

def TestAndEvaluate(X_test, y_test, clf):
    """Evaluate a fitted classifier on held-out data.

    Prints whichever of ``decision_function``, ``predict_proba``, ``predict``
    and ``score`` the classifier offers, saves a ROC curve to ``auc.png`` when
    a continuous score is available, and prints ``feature_importances_`` /
    ``coef_`` for classifiers that expose them.

    Parameters
    ----------
    X_test : array-like or sparse matrix
        Held-out features, encoded the same way as the training matrix.
    y_test : array-like
        True labels for ``X_test``.
    clf : fitted classifier
        Must implement ``predict``; all other capabilities are optional and
        are simply skipped when missing.

    Returns
    -------
    The hard class predictions, ``clf.predict(X_test)``.
    """
    # Continuous per-sample score for the ROC curve.  When a classifier offers
    # both kinds, the class-1 probability wins (it is assigned last).
    roc_scores = None

    # Optional capabilities are detected with hasattr instead of by catching
    # the resulting exception, so a classifier that merely lacks a method no
    # longer produces a logged traceback.
    if hasattr(clf, 'decision_function'): #Regression, linearSVC
        y_decision_function = clf.decision_function(X_test)
        print('y_decision_function')
        print(y_decision_function)
        roc_scores = y_decision_function

    if hasattr(clf, 'predict_proba'): #KNeighbors, Random forest
        y_predict_proba = clf.predict_proba(X_test)[:,1]
        print('y_predict_proba')
        print(y_predict_proba)
        roc_scores = y_predict_proba

    # predict is mandatory: a failure here is raised to the caller directly
    # rather than surfacing later as an unbound-variable error at return.
    y_predict = clf.predict(X_test)
    print('y_predict')
    print(y_predict)

    try:
        y_score = clf.score(X_test,y_test)
        print('y_score')
        print(y_score)
    except Exception as err:
        logging.exception(err)

    if roc_scores is not None:
        # Plotting stays best-effort: a failure to draw or save the figure is
        # logged and must not discard the predictions.
        try:
            fpr, tpr, _ = metrics.roc_curve(y_test, roc_scores)
            roc_auc = metrics.auc(fpr,tpr)
            plt.figure()
            lw = 2
            plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
            plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver operating characteristic')
            plt.legend(loc="lower right")
            plt.savefig('auc.png')
            plt.close()
        except Exception as err:
            logging.exception(err)

    try:
        features = SelectFromModel(clf,prefit=True)
        print(features)
    except Exception as err:
        logging.exception(err)

    # Attributes not available for all types of classifiers
    if hasattr(clf, 'feature_importances_'):
        print('Feature importances')
        print(clf.feature_importances_)

    if hasattr(clf, 'coef_'):
        print('coef')
        print(clf.coef_)

    return y_predict

In [None]:
# Confusion matrix

def matrix_and_pars(ytest,ypred):
    """Print precision, recall and F1 for the churn class and plot the matrix.

    Parameters
    ----------
    ytest : array-like
        True labels (0 = engaged, 1 = churned).
    ypred : array-like
        Predicted labels, same length as ``ytest``.

    The confusion matrix is laid out with true classes on the rows and
    predicted classes on the columns, so for the positive (churned) class:
    TP = cm[1][1], FP = cm[0][1], FN = cm[1][0].  The metrics used to be
    computed from cm[0][0], i.e. for the majority "engaged" class, which
    overstates performance on an imbalanced data set.
    """
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
    cm = confusion_matrix(ytest, ypred, labels=[0, 1])
    false_pos = cm[0][1]
    false_neg = cm[1][0]
    true_pos = cm[1][1]

    # Each ratio falls back to 0.0 when its denominator is empty (no predicted
    # or no actual churners) instead of dividing by zero.
    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = float(true_pos) / predicted_pos if predicted_pos else 0.0
    recall = float(true_pos) / actual_pos if actual_pos else 0.0
    if precision + recall:
        F1 = 2*precision*recall/(recall+precision)
    else:
        F1 = 0.0

    print('recall: %0.3f precision: %0.3f F1: %0.3f' %(recall,precision,F1))
    print('%d %d' %(cm[0][0],cm[0][1]))
    print('%d %d' %(cm[1][0],cm[1][1]))
    plot_confusion_matrix(cm)
    # No tick labels are set here: plot_confusion_matrix has already saved and
    # closed its figure when it returns, so pyplot calls made at this point
    # would only create a new, empty figure.
    return

In [None]:
# Distinct labels present in the test set (not used by the function below).
class_names = np.unique(y_test)
#print class_names
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, classes=None):
    """Save a row-normalised confusion matrix heat map to ``cm.png``.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Raw counts, true classes on the rows and predicted on the columns.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap for the heat map.
    classes : sequence of str, optional
        Tick labels, one per class.  When omitted the ticks are the numeric
        class positions, as before.

    The figure is written to disk and closed; nothing is returned.
    """
    cm = np.asarray(cm, dtype='float')
    row_totals = cm.sum(axis=1)[:,np.newaxis]
    # A class with no true samples has a zero row total; divide that row by 1
    # so it stays all-zero instead of turning into NaN.
    row_totals[row_totals == 0] = 1.0
    cm = cm / row_totals
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class, taken from the matrix size rather than a fixed 2.
    tick_marks = np.arange(cm.shape[0])
    if classes is None:
        plt.xticks(tick_marks)
        plt.yticks(tick_marks)
    else:
        # Labels have to be applied before savefig/close to reach the image.
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False)
    plt.savefig('cm.png')
    plt.close()
    return

In [None]:
# Train model
# Fits clf in place on the encoded training matrix; the fitted estimator is
# also the function's return value and therefore this cell's output.
train_and_evaluate(clf,X_train_enc,y_train) #X_train,y_train)

In [None]:
# Test model
#y_pred = measure_performance(X_test_enc, y_test, clf)
# y_pred receives the function's return value: the clf.predict output for the
# encoded test matrix.
y_pred = TestAndEvaluate(X_test_enc, y_test, clf)

In [None]:
# Generate confusion matrix
# Prints the summary metrics and, through plot_confusion_matrix, writes cm.png.

matrix_and_pars(y_test,y_pred)

In [None]:


# Shapes of the full data set and of each split, features before labels.
for shaped in (X, y, X_train, y_train, X_test, y_test):
    print(shaped.shape)

In [None]:
# sfm
# Use the absolute size of the first xlen model coefficients as a rough
# feature-importance measure.  coef_ only exists for some classifier types.
# NOTE(review): clf was fitted on the one-hot-encoded matrix (X_train_enc),
# whose column count/order may differ from xlist -- confirm that the first
# xlen coefficients really line up with the labels below.
importances = list(clf.coef_[0,:xlen])
print importances
print 'x len'
print len(importances)
importances = [abs(number) for number in importances]


# Bar positions, one per feature.
xlbls=range(xlen) #[0,1,2,3,4,5]
print 'x lbl length'
print len(xlbls)

#xlist=('email','call','meeting','email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count')
#xlist=('email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count')
#xlist=['n_emails','n_calls','n_meetings','frequency','recency','relationship length']
#print len(xlist)

# Human-readable bar labels; presumably one per entry of xlist, in the same
# order (both have 12 entries) -- keep the two in sync when xlist changes.
x_list = ('email frequency','call frequency','meeting frequency',\
          'email frequency (early)','call frequency (early)','meeting frequency (early)',\
          'email frequency (late)','call frequency (late)','meeting frequency (late)',\
          'mean interval', 'length of relationship','number of subscription periods')

#plt.xticks(rotation=45) #'vertical')
# The chart is written to features.png and the figure closed.
plt.figure()
plt.barh(xlbls,importances,tick_label=x_list) #,rotation=45)
plt.savefig('features.png')
plt.close()
#xlbls=[0,1,2,3,4,5,6]
#plt.bar(xlbls,importances,tick_label=('email_last','call_last','meeting_last','avg_interval','period_duration_sum','period_count','days_since_last_touch')) #,rotation=45)

#print len(importances)
#importances = pd.DataFrame(importances)
#print importances
#plt.bar([1,2,3,4,5,6],importances)
#fig, ax = plt.subplots()
#ax.bar(importances)
#ax.set_xticklabels(()

In [None]:
from sklearn.feature_selection import chi2
# Chi-squared test of every feature column of X against the target.
scores, pvalues = chi2(X, y)
# Only a cell's last expression is displayed, so the bare `scores` line used
# to be discarded.  Show both results in one table, one row per feature
# (X's columns follow xlist's order).
pd.DataFrame({'chi2': scores, 'pvalue': pvalues}, index=xlist)

In [None]:
# Recursive feature elimination with clf as the estimator, keeping 3 features.
# The target count is passed by keyword so the call does not depend on the
# positional order of RFE's parameters.
rfe = RFE(clf, n_features_to_select=3)
rfe.fit(X_train,y_train)
print(rfe.support_)
print(rfe.ranking_)
# Pair each feature name with its own mask entry and rank.  This replaces the
# hard-coded range(1,10) legend, which did not match the 12 features in xlist.
for feature_name, is_selected, feature_rank in zip(xlist, rfe.support_, rfe.ranking_):
    print('%s selected=%s rank=%d' % (feature_name, is_selected, feature_rank))

In [None]:
#Run on all data (for web app)

In [None]:
# Encode the complete feature matrix with the encoder fitted earlier.
X_all=enc.transform(X)

In [None]:
# Display the target vector
y

In [None]:
# Predictions for every row of X.  X also contains the rows the model was
# trained on, so these are not held-out predictions.
y_predict = clf.predict(X_all)
# One probability column per class; the next cell names column 0 'churn_no'
# and column 1 'churn_yes'.
y_predict_proba = clf.predict_proba(X_all)

In [None]:
# Build one result table: client id, scaled features, predicted probabilities,
# predicted class and actual class.
# reset_index() moves each object's index into a column named 'index'.
dfYOriginalLbl = pd.DataFrame(y_original_lbl).reset_index()
dfYOriginal = pd.DataFrame(y_original).reset_index()
dfOriginal = X_original.reset_index()

dfYPredict = pd.DataFrame(y_predict)
dfYPredict = dfYPredict.rename(columns={0:'churn_pred'})

dfYPredictProba = pd.DataFrame(y_predict_proba)
dfYPredictProba = dfYPredictProba.rename(columns={0:'churn_no'})
dfYPredictProba = dfYPredictProba.rename(columns={1:'churn_yes'})

# join() lines frames up on their index.
dfResult = dfYPredictProba.join(dfYPredict)
dfResult = dfOriginal.join(dfResult)
# NOTE(review): these merges pass no `on=` key, so pandas matches on whatever
# column names the two frames share -- presumably just 'index'; confirm.
dfResult = pd.merge(dfYOriginalLbl,dfResult)
dfResult = pd.merge(dfResult,dfYOriginal)
# Column 0 comes from dfYOriginal (the actual labels).
dfResult = dfResult.rename(columns={0:'churn_actual'})
dfResult = dfResult.drop('index',axis=1)
dfResult.head()

In [None]:
# Persist the result table.  The context manager flushes and closes the file
# as soon as the dump finishes, instead of leaving the handle open.
# NOTE: a later cell writes a different dfResult to this same results.p path.
with open("results.p", "wb") as results_file:
    pickle.dump(dfResult, results_file)

In [None]:
#dfYOriginalLbl

In [None]:
# Rebuild the result table from dfFeatures instead of the scaled feature frame.
# NOTE(review): no `on=` key is given, so the merge matches on the column
# names both frames share -- presumably 'client_id'; confirm.
dfResult = pd.merge(dfYOriginalLbl,dfFeatures)
# NOTE(review): the predictions are joined on the merged frame's index; this
# assumes the merged rows are still in prediction order -- verify.
dfResult = dfResult.join(dfYPredictProba)
dfResult = dfResult.join(dfYPredict)
# NOTE(review): none of the frames combined above appears to contribute a
# column named 0, so this rename probably changes nothing here -- confirm.
dfResult = dfResult.rename(columns={0:'churn_actual'})
dfResult = dfResult.drop('index',axis=1)
dfResult = dfResult.dropna()
# Displays the whole frame as the cell output.
dfResult

In [None]:
# Persist the final result table (this overwrites any results.p written by an
# earlier cell).  The context manager flushes and closes the file as soon as
# the dump finishes, instead of leaving the handle open.
with open("results.p", "wb") as results_file:
    pickle.dump(dfResult, results_file)

In [None]:
# Column / dtype / non-null summary of the final result table
dfResult.info()

In [None]:
# Export the final result table as a tab-separated file
dfResult.to_csv('results.tsv',sep='\t')