# Set-Up Environment

In [63]:
import pandas as pd
import seaborn as sns
import sklearn as skl
import numpy as np
import matplotlib as mpl

# Load in the Dataset

In [64]:
data = pd.read_csv('Churn Model Data.csv')
print(data.shape)
data.head()

(44027, 27)


Unnamed: 0,ID,churn,one_day_hours,one_day_dau,one_day_boi,seven_day_hours,seven_day_dau,seven_day_boi,thirty_day_hours,thirty_day_dau,...,housewife,hub_plus_tenure2,tenure_registration,indicia_group,onedayhours^2,sevendayhours^2,thirtydayhours^2,hoursmostpop^2,hubplus^2,tenure^2
0,000061da-2686-4954-9ed4-29fa8a767d39,0,4.963265,2,4,9.388741,8,4,25.979758,18,...,0,11,21.0,162.0,24.633999,88.148449,674.947814,3.154298,121,441
1,0002995a-c233-4bd0-9ac3-322c561774a8,0,0.96,1,1,1.766035,2,1,6.074976,5,...,0,1,43.0,199.0,0.9216,3.118879,36.905335,0.9216,1,1849
2,0004a2f0-90c6-43c5-84f5-804eab277e22,0,2.570976,2,4,6.82696,5,7,17.651861,16,...,0,9,21.0,146.0,6.609917,46.607387,311.588201,2.158252,81,441
3,0004c529-c2ae-4d47-bfde-eac95dc5b1e2,0,3.985561,2,1,3.985561,2,1,8.986164,6,...,0,1,23.0,103.0,15.884695,15.884695,80.751151,15.884695,1,529
4,0005c576-287b-4117-be33-72cebac974ea,1,0.34785,1,2,1.018686,2,3,7.029297,8,...,0,20,0.0,111.0,0.120999,1.037721,49.411012,0.107636,400,0


# Split the Data into a Train and Test Set

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.loc[:,data.columns.difference(['ID', 'churn','First_genre_id','First_programme_title'])], data['churn'], test_size=0.2, random_state=42)

In [66]:
## make sure that the training set does not include the ID and churn columns
X_train.head()

Unnamed: 0,16-34,affluent,female,hours_most_pop_show,hoursmostpop^2,housewife,hub_plus_tenure2,hubplus^2,indicia_group,one_day_boi,...,seven_day_boi,seven_day_dau,seven_day_hours,sevendayhours^2,tenure^2,tenure_registration,thirty_day_boi,thirty_day_dau,thirty_day_hours,thirtydayhours^2
35405,0,0,1,0.213278,0.045487,1,1,1,175.0,1,...,1,3,1.461611,2.136306,1,1.0,2,4,1.553952,2.414766
32421,1,1,1,1.019802,1.039996,1,1,1,112.0,1,...,1,1,1.019802,1.039996,0,0.0,3,5,1.495352,2.236079
8600,0,0,1,1.098051,1.205716,0,11,121,117.0,5,...,6,8,10.085179,101.710828,441,21.0,11,29,38.925627,1515.204411
10016,1,0,1,0.794797,0.631703,0,4,16,150.0,2,...,2,1,0.798408,0.637456,0,0.0,3,4,3.101909,9.621837
39510,0,0,1,0.671193,0.4505,0,25,625,191.0,2,...,2,1,0.673158,0.453141,0,0.0,2,3,4.385922,19.236309


# Make sure that the data is in the right format

In [67]:
X_train = X_train.apply(pd.to_numeric)
X_test = X_test.apply(pd.to_numeric)
y_train = y_train.apply(pd.to_numeric)
y_test = y_test.apply(pd.to_numeric)

# Use Feature Union, Pipeline and GridSearch to find the best Model

In [68]:
# Create a pipeline that extracts features from the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

X = X_train.iloc[:,0:8]
Y = y_train

# create feature union
features = []
features.append(('pca', PCA(n_components=8))) #calculate x principal components
features.append(('select_best', SelectKBest(k='all'))) #find the best y principal components
feature_union = FeatureUnion(features)

# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression())) #add a logistic regression model to the pipeline
model = Pipeline(estimators)

# set up gridsearch to find the best parameter values specified in the ranges below
parameters = {'feature_union__pca__n_components':[2, 8], 'feature_union__select_best__k':[2, 8]}
clf = GridSearchCV(model, parameters)
model2 = clf.fit(X, Y)

sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_feature_union__pca__n_components',
 'param_feature_union__select_best__k',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [69]:
## error messages - with genre and programme included: cannot convert string to float
##                  after genre and programme removed: input contains NaN, infinity or value too large for float64
##                  putting in a few columns at a time works but the predicted vector is all 0.

# Use accuracy_score to calculate the model accuracy

In [70]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, model2.predict(X_test.iloc[:,0:8]))
print(acc)

0.678060413355


In [71]:
model2.predict(X_test.iloc[:,0:8])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [72]:
np.count_nonzero(model2.predict(X_test.iloc[:,0:8]))

0