# set-up environment

In [2]:
import pandas as pd
import seaborn as sns
import sklearn as skl
import numpy as np
import matplotlib as mpl

# generate customer data

In [3]:
# Seed NumPy's global RNG once, before the first draw, so that every synthetic
# column generated below (and therefore the fitted model and its accuracy) is
# reproducible under Restart Kernel -> Run All.
np.random.seed(42)

# create age data: 10,000 customer ages drawn from a Poisson distribution with mean 30
age = np.random.poisson(30, 10000)

In [4]:
age

array([22, 28, 34, ..., 30, 40, 25])

In [5]:
# Gender flag: one Bernoulli(0.5) draw (0 or 1) for each of the 10,000 customers.
gender = np.random.binomial(n=1, p=0.5, size=10000)

In [6]:
gender

array([0, 0, 1, ..., 0, 0, 0])

In [7]:
# Hubplus tenure: 10,000 draws from a negative binomial distribution (n=12, p=0.5).
hubplus_tenure = np.random.negative_binomial(n=12, p=0.5, size=10000)

In [8]:
# Hours watched per day: 10,000 draws from a negative binomial distribution (n=2, p=0.5).
one_day_hours = np.random.negative_binomial(n=2, p=0.5, size=10000)

In [9]:
hubplus_tenure

array([16, 17,  6, ...,  6, 15,  4])

In [10]:
one_day_hours

array([2, 5, 1, ..., 1, 2, 1])

In [11]:
# Hours watched over 7 days: 10,000 draws from a negative binomial distribution (n=10, p=0.5).
seven_day_hours = np.random.negative_binomial(n=10, p=0.5, size=10000)

In [12]:
seven_day_hours

array([15, 10, 12, ..., 11,  7,  7])

In [13]:
# Hours watched over 30 days: 10,000 draws from a negative binomial distribution (n=30, p=0.5).
thirty_day_hours = np.random.negative_binomial(n=30, p=0.5, size=10000)

In [14]:
thirty_day_hours

array([24, 31, 35, ..., 21, 39, 41])

In [15]:
# Affluent flag: one Bernoulli(0.1) draw per customer, so roughly 10% of the entries are 1.
affluent = np.random.binomial(n=1, p=0.1, size=10000)

In [16]:
affluent

array([0, 0, 1, ..., 0, 0, 0])

In [17]:
# Sanity check: tally how often each distinct value occurs in the affluent vector
# and display the result as a {value: count} mapping.
unique, counts = np.unique(affluent, return_counts=True)
{value: count for value, count in zip(unique, counts)}

{0: 8982, 1: 1018}

In [18]:
# Indicia group: a uniformly random integer in [0, 200) for each of the 10,000 customers.
indicia_group = np.random.randint(low=0, high=200, size=10000)

In [19]:
indicia_group

array([ 83,   3, 140, ..., 132,  69, 151])

In [20]:
# total_viewing_time is each customer's age plus Gaussian noise (mean 0,
# standard deviation 10). All noise terms are drawn in one vectorized call,
# sized from `age` itself instead of a hard-coded row count, which replaces
# 10,000 single-element RNG calls inside a Python loop.
total_viewing_time = age + np.random.normal(0, 10, len(age))

In [21]:
# Linear predictor (log-odds) used to simulate churn: a weighted sum of
# customer attributes with an intercept of -25.
lin_pred = (
    0.8 * age
    + 5.6 * gender
    + 3 * affluent
    + 0.21 * seven_day_hours
    + 0.15 * hubplus_tenure
    - 25
)

In [22]:
lin_pred

array([ -1.85,   2.05,  14.22, ...,   2.21,  10.72,  -2.93])

In [23]:
# `math` is no longer needed by this cell; the import is retained in case
# other cells rely on it.
import math

# Simulate churn: pass the linear predictor through the logistic (sigmoid)
# function to get a per-customer churn probability, then draw one Bernoulli
# outcome per customer. The whole computation is vectorized, so it is sized by
# `lin_pred` rather than a hard-coded row count, and np.exp saturates to inf
# (probability 0) where math.exp would raise OverflowError.
churn_probability = 1 / (1 + np.exp(-lin_pred))
churn = np.random.binomial(1, churn_probability)

In [24]:
# Assemble every simulated column into a single DataFrame, one row per customer.
customer_data = pd.DataFrame(
    {
        "age": age,
        "gender": gender,
        "hubplus_tenure": hubplus_tenure,
        "one_day_hours": one_day_hours,
        "seven_day_hours": seven_day_hours,
        "thirty_day_hours": thirty_day_hours,
        "affluent": affluent,
        "indicia_group": indicia_group,
        "churn": churn,
        "total_viewing_time": total_viewing_time,
    }
)

# combine all of the customer data into a dataframe

In [25]:
customer_data

Unnamed: 0,affluent,age,churn,gender,hubplus_tenure,indicia_group,one_day_hours,seven_day_hours,thirty_day_hours,total_viewing_time
0,0,22,1,0,16,83,2,15,24,38.982798
1,0,28,1,0,17,3,5,10,31,7.602521
2,1,34,1,1,6,140,1,12,35,49.776215
3,0,27,1,1,11,167,2,14,24,43.137320
4,0,19,0,1,6,13,0,15,31,9.540763
5,0,28,1,0,15,118,2,14,28,29.025661
6,0,27,1,1,12,32,2,13,34,28.055735
7,0,35,1,1,12,193,1,14,22,40.827695
8,0,32,1,0,17,5,6,12,32,45.205519
9,0,30,1,1,13,51,1,7,19,36.667317


In [26]:
churn[1]

1

# split the dataframe into training and test sets

In [27]:
# Hold out 20% of the customers as a test set. `churn` is the target; every
# other column is a predictor. random_state pins the shuffle used for the split.
from sklearn.model_selection import train_test_split

feature_columns = [
    'affluent',
    'age',
    'gender',
    'hubplus_tenure',
    'indicia_group',
    'one_day_hours',
    'seven_day_hours',
    'thirty_day_hours',
    'total_viewing_time',
]
X_train, X_test, y_train, y_test = train_test_split(
    customer_data.loc[:, feature_columns],
    customer_data['churn'],
    test_size=0.2,
    random_state=42,
)

# use feature union, pipeline and gridsearch to find the best features and model

In [28]:
# Build a feature-extraction + logistic-regression pipeline and tune it with a grid search.
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

X, Y = X_train, y_train

# Feature union: both transformers are applied to the same input columns and
# their outputs are concatenated side by side. SelectKBest therefore picks the
# k best ORIGINAL features; it does not select among the principal components.
features = [
    ('pca', PCA(n_components=8)),          # project onto principal components
    ('select_best', SelectKBest(k=4)),     # keep the k highest-scoring input features
]
feature_union = FeatureUnion(features)

# Pipeline: the combined features feed a logistic regression classifier.
estimators = [
    ('feature_union', feature_union),
    ('logistic', LogisticRegression()),
]
model = Pipeline(estimators)

# Grid search over the number of principal components and the number of
# selected features; the initial values set above are overridden by each
# grid candidate.
parameters = {
    'feature_union__pca__n_components': [6, 8],
    'feature_union__select_best__k': [2, 5],
}
clf = GridSearchCV(model, parameters)
model2 = clf.fit(X, Y)  # fit() returns the fitted search object, so model2 is clf

# Show which result fields the grid search recorded.
sorted(clf.cv_results_)

# evaluate pipeline
#seed = 7
#kfold = KFold(n_splits=10, random_state=seed)
#results = cross_val_score(model, X, Y, cv=kfold)
#print(results.mean())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_feature_union__pca__n_components',
 'param_feature_union__select_best__k',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

# use the model created above to predict the outcome of the test set

In [29]:
model2.predict(X_test)

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [30]:
# Element-wise difference between the predicted and the actual churn labels on
# the test set; a non-zero entry marks a prediction that differs from the label.
diff = model2.predict(X_test)-y_test

In [31]:
np.count_nonzero(diff)

131

# use the accuracy_score measure to calculate the accuracy of the model

In [32]:
from sklearn.metrics import accuracy_score

# Fraction of test-set customers whose predicted churn label matches the actual label.
test_predictions = model2.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=test_predictions)

In [33]:
acc

0.9345