In [1]:
#1 imports
from pathlib import Path

#file paths
ROOT = ROOT = Path.cwd().parent
RAW_DIR = ROOT/"data"/"raw"
PROCESSED_DIR = ROOT/"data"/"processed"

import sys, os
import pandas as pd
import numpy as np
import re

sys.path.insert(0, os.path.abspath("..")) #preprocess at root
from preprocess import load_csv, save_csv, split_data

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from IPython.display import display

print("Imports loaded")

Imports loaded


In [2]:
#2.1 load data
df = load_csv(RAW_DIR, "data03.csv")

print("Data")
display(df.head().T) #transpose rows/cols to make viewing easier
print("\nDataTypes:")
display(df.dtypes)

Loaded data03.csv: length: 7043 

Data


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No



DataTypes:


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [3]:
#2.2 convert data types
total_charges = pd.to_numeric(df.TotalCharges, errors='coerce')
display(df[total_charges.isnull()][['customerID', 'TotalCharges']])

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [4]:
#2.3 clean data
df.columns = df.columns.str.lower().str.replace(' ', '_') #lowercase, replace spaces in col names

string_columns = list(df.dtypes[df.dtypes == 'object'].index) #keep only string cols
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

df.churn = (df.churn == 'yes').astype(int) #binary classification of churn

df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [5]:
#2.4 split data
#shuffle and split 20% test, 80% train
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

#train/validation split
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

print("df train")
display(df_train.head())
print("df val")
display(df_val.head())
print("df test")
display(df_test.head())

#save churn as target
y_train = df_train.churn.values
y_val = df_val.churn.values

#delete churn col - don't use for training
del df_train['churn']
del df_val['churn']

df train


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
2935,9435-jmlsx,male,0,yes,no,71,yes,no,dsl,yes,...,yes,yes,yes,yes,two_year,yes,bank_transfer_(automatic),86.1,6045.9,0
3639,0512-flfdw,female,1,yes,no,60,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),100.5,6029.0,0
2356,3450-wxoat,male,0,no,no,46,yes,no,dsl,no,...,no,no,no,no,month-to-month,yes,credit_card_(automatic),45.2,2065.15,0
6660,1447-giqmr,male,0,yes,no,1,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,69.15,69.15,1
755,6683-vlctz,male,1,no,no,20,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,98.55,1842.8,1


df val


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5805,1697-nvvgy,male,1,yes,no,19,yes,no,dsl,yes,...,yes,no,yes,no,month-to-month,no,bank_transfer_(automatic),66.4,1286.05,0
3687,6849-oyamu,male,0,yes,yes,19,yes,no,fiber_optic,no,...,no,yes,yes,yes,one_year,no,bank_transfer_(automatic),100.0,1888.65,1
6900,7921-lmdfq,male,1,no,no,51,yes,no,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,bank_transfer_(automatic),94.0,4905.75,0
1644,3331-hqdtw,female,0,no,no,34,yes,yes,fiber_optic,yes,...,yes,no,yes,yes,month-to-month,yes,mailed_check,109.8,3587.25,1
6975,1904-wajaa,female,0,yes,yes,24,yes,yes,dsl,yes,...,yes,no,no,no,two_year,no,electronic_check,64.35,1558.65,0


df test


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3381,8879-zkjof,female,0,no,no,41,yes,no,dsl,yes,...,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),79.85,3320.75,0
6180,0201-mibol,female,1,no,no,66,yes,yes,fiber_optic,yes,...,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),102.4,6471.85,0
4829,1600-dilpe,female,0,no,no,12,yes,no,dsl,no,...,no,no,no,no,month-to-month,yes,bank_transfer_(automatic),45.0,524.35,0
3737,8601-qacrs,female,0,no,no,5,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,50.6,249.95,1
4249,7919-zodzz,female,0,yes,yes,10,yes,no,dsl,no,...,yes,no,no,yes,one_year,yes,mailed_check,65.9,660.05,0


In [6]:
#3.1 EDA
#check for missing data
print("Columns missing data:")
display(df_train_full.isnull().sum())

#proportions of users who churned and who didnt
print("\nChurn split:")
display(df_train_full.churn.value_counts())

#calculate churn rate
global_mean = df_train_full.churn.mean()
print("\nChurn probability:")
display(round(global_mean, 3))

Columns missing data:


customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64


Churn split:


churn
0    4113
1    1521
Name: count, dtype: int64


Churn probability:


np.float64(0.27)

In [7]:
#3.2 #variable lists
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

print("Unique values for each variable:")
display(df_train_full[categorical].nunique())

Unique values for each variable:


gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [8]:
#3.3 feature importance analysis - which variables will be useful
#3.3.1 churn rates
#male vs female
print("male vs female churn rate")
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()
print("gender == female: ", round(female_mean, 3))
male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()
print("gender == male: ", round(male_mean, 3))

#parner vs non
print("\nPartner vs non churn rate")
partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
print("partner == yes: ", round(partner_yes, 3))
partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()
print("partner == no: ", round(partner_no, 3))



male vs female churn rate
gender == female:  0.277
gender == male:  0.263

Partner vs non churn rate
partner == yes:  0.205
partner == no:  0.33


In [9]:
#r3.3.2 risk ratio: group rate/global ratio
#less than 1 - lower risk, greater than 1 higher risk
global_mean = df_train_full.churn.mean()

#by gender
df_group = df_train_full.groupby(by='gender').churn.agg(['mean']) #ave churn
df_group['diff'] = df_group['mean'] - global_mean #difference betweeen group and global
df_group['risk'] = df_group['mean'] / global_mean #risk of churning

print("\nRisk by gender")
display(df_group)

#categorical variables
print("\nRisks for categories")
for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean

    print(f"{col} risk")
    display(df_group)


Risk by gender


Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498



Risks for categories
gender risk


Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


seniorcitizen risk


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


partner risk


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


dependents risk


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


phoneservice risk


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


multiplelines risk


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


internetservice risk


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


onlinesecurity risk


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


onlinebackup risk


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


deviceprotection risk


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


techsupport risk


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


streamingtv risk


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


streamingmovies risk


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


contract risk


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


paperlessbilling risk


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


paymentmethod risk


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [10]:
#3.3.3 Mutual info - how much we learn about one variable if we learn the value of another
#dependency: high: variables dependent, useful for prediction; low: independent, not useful
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.churn)

df_mi = df_train_full[categorical].apply(calculate_mi)         #cumpute MI between series and churn
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI') #sort result
print("Ordered MI: serivies vs chrun")
display(df_mi)

Ordered MI: serivies vs chrun


Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [11]:
#3.3.4 Correlation coefficient - measure dependency a numerical feature and binary target
#positive: proportional; zero: no relation; negative: inversely proportional
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [12]:
#4 Feature engineering - One-hot encoding: active values denoted 1, inactive 0
train_dict = df_train[categorical + numerical].to_dict(orient='records') #convert df to dict
print("First element of dictionary:")
display(train_dict[0])

#create numerical encodings
dv = DictVectorizer(sparse=False)  #create simple NumPy array
dv.fit(train_dict)                 #apply one-hot to categorical, leave if numerical
X_train = dv.transform(train_dict) #convert dictionary to matrix
print("\nFirst row of matrix:", X_train.shape)
display(X_train[0])                #one-hot + numerical 

print("\nNames of columns:")
display(dv.get_feature_names_out())

First element of dictionary:


{'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 71,
 'monthlycharges': 86.1,
 'totalcharges': 6045.9}


First row of matrix: (3774, 45)


array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])


Names of columns:


array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [13]:
#5 Model Training
#5.1 Logistic regression: binary classification - 1: churned, 0: stayed; output is probability of churn
model = LogisticRegression(solver='liblinear', random_state=1) #SciKit model, fix shuffle
model.fit(X_train, y_train)                                    #train model

#validation
val_dict = df_val[categorical + numerical].to_dict(orient='records') #convert to dict
X_val = dv.transform(val_dict) #convert to  matrix containing validation datatset featrues

y_pred = model.predict_proba(X_val)
print("Probability of customer statying vs churning:")
display(y_pred)

y_pred = model.predict_proba(X_val)[:, 1] #only need probability of churning

churn = y_pred >= 0.5 #predictions array: churn true if greater than 50% chance

acc = (y_val == churn).mean() #model accuracy - check if target == pred
print("\nModel accuracy:", acc)

Probability of customer statying vs churning:


array([[0.76508893, 0.23491107],
       [0.7311339 , 0.2688661 ],
       [0.6805482 , 0.3194518 ],
       ...,
       [0.94274725, 0.05725275],
       [0.38476961, 0.61523039],
       [0.93872737, 0.06127263]], shape=(1860, 2))


Model accuracy: 0.8016129032258065


In [14]:
#6.1 Model Interpretation
print("Bias =", model.intercept_[0]) #baseline prediction

print("\nfeature | weight:") 
display(dict(zip(dv.get_feature_names_out(), map(float, model.coef_[0].round(3)))))

Bias = -0.12198863589816404

feature | weight:


{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


In [15]:
#6.2 train model on smaller feature set
#build small featrue set
small_subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = df_train[small_subset].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)

print("Small features:")
display(dv_small.get_feature_names_out())

#train model
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

print("Small bias =", model_small.intercept_[0])

print("\nfeature | weight:") 
display(dict(zip(dv_small.get_feature_names_out(), map(float, model_small.coef_[0].round(3)))))

#validation
val_dict_small = df_val[small_subset].to_dict(orient='records')
X_val_small = dv.transform(val_dict)

y_pred_small = model.predict_proba(X_val)
print("\nSmall probability of customer statying vs churning:")
display(y_pred_small)

y_pred_small = model.predict_proba(X_val_small)[:, 1]

churn = y_pred >= 0.5 #predictions array: churn true if greater than 50% chance

acc = (y_val == churn).mean()
print("\nSmall model accuracy:", acc)

Small features:


array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'tenure', 'totalcharges'], dtype=object)

Small bias = -0.577229912199359

feature | weight:


{'contract=month-to-month': 0.866,
 'contract=one_year': -0.327,
 'contract=two_year': -1.117,
 'tenure': -0.094,
 'totalcharges': 0.001}


Small probability of customer statying vs churning:


array([[0.76508893, 0.23491107],
       [0.7311339 , 0.2688661 ],
       [0.6805482 , 0.3194518 ],
       ...,
       [0.94274725, 0.05725275],
       [0.38476961, 0.61523039],
       [0.93872737, 0.06127263]], shape=(1860, 2))


Small model accuracy: 0.8016129032258065


In [16]:
#7 use model
#extract customer
customer = df.loc[df.customerid == '8879-zkjof'].iloc[0][categorical + numerical].to_dict()
print("Customer 8879-zkjof':")
display(customer)

#predict if customer will churn
X_test = dv.transform([customer])
print("\nCustomer encoding:")
display(X_test)

print("\nProbability of customer churning = ", model.predict_proba(X_test)[0, 1])

#try on ranomd customer
other_customer = df.sample(1, random_state=1)[categorical + numerical].iloc[0].to_dict()
print("\n\nAnother customer:")
display(other_customer)

X_test = dv.transform([other_customer])
print("\nProbability of other customer churning = ", model.predict_proba(X_test)[0, 1])

Customer 8879-zkjof':


{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 41,
 'monthlycharges': 79.85,
 'totalcharges': 3320.75}


Customer encoding:


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.98500e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 4.10000e+01, 3.32075e+03]])


Probability of customer churning =  0.07332239057816664


Another customer:


{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 41,
 'monthlycharges': 79.85,
 'totalcharges': 3320.75}


Probability of other customer churning =  0.07332239057816664
