In [301]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [302]:
data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [303]:
!wget $data -O data.csv 

--2023-05-07 16:50:52--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘data.csv’


2023-05-07 16:50:52 (11.6 MB/s) - ‘data.csv’ saved [977501/977501]



In [304]:
pd.set_option(
'display.max_columns', None)

In [305]:
df = pd.read_csv('data.csv')

In [306]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [307]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [308]:
df.columns = df.columns.str.lower().str.replace(" ", "_")

In [309]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [310]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [311]:
categorical_columns

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'totalcharges',
 'churn']

In [312]:
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(" ", "_")

In [313]:
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,yes,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,no,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,yes,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,no,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [314]:
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors="coerce")

In [315]:
df.totalcharges.isnull().sum()

11

In [316]:
df[df.totalcharges.isnull()]['totalcharges']

488    NaN
753    NaN
936    NaN
1082   NaN
1340   NaN
3331   NaN
3826   NaN
4380   NaN
5218   NaN
6670   NaN
6754   NaN
Name: totalcharges, dtype: float64

In [317]:
df.totalcharges = df.totalcharges.fillna(0)

In [318]:
df.totalcharges.isnull().sum()

0

In [319]:
df.churn = (df.churn=='yes').astype(int)

In [320]:
df.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

In [321]:
# Data split train, validation and test

In [322]:
from sklearn.model_selection import train_test_split

In [323]:
df_full_train, df_test = train_test_split(df, test_size=0.2)

df_train, df_val = train_test_split(df_full_train, test_size = 0.25)

In [324]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [325]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [326]:
y_train = df_train.churn.values
y_val= df_val.churn.values
y_test = df_test.churn.values

In [327]:
del df_train["churn"]
del df_test["churn"]
del df_val["churn"]

In [328]:
# EDA

In [329]:
df_full_train = df_full_train.reset_index(drop = True)

In [330]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [331]:
numerical = ['tenure','monthlycharges','totalcharges']
categorical = [
    'gender',
    'seniorcitizen', 
    'partner', 
    'dependents',
    'phoneservice', 
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup', 
    'deviceprotection', 
    'techsupport',
    'streamingtv', 
    'streamingmovies', 
    'contract', 
    'paperlessbilling',
    'paymentmethod'
]

In [332]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [333]:
# Churn rate

In [334]:
global_churn = df_full_train.churn.mean()

In [335]:
df_full_train.churn.value_counts(normalize = True)

0    0.733227
1    0.266773
Name: churn, dtype: float64

## Churn rate

In [336]:

churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_male = df_full_train[df_full_train.gender =='male'].churn.mean()
print(churn_male, churn_female)

0.26230661040787623 0.27132616487455197


In [337]:
df_full_train.partner.value_counts()

no     2936
yes    2698
Name: partner, dtype: int64

In [338]:
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
print(churn_partner, churn_no_partner)

0.19866567828020756 0.3293596730245232


## Risk Ratio

In [339]:
churn_partner/global_churn

0.744698889840778

In [340]:
churn_no_partner/global_churn

1.2346057204392304

In [341]:
churn_male/global_churn

0.9832571144630569

In [342]:
churn_female/global_churn

1.017066941386045

In [343]:
from IPython.display import display

In [344]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean']/global_churn
    display(df_group)
    print()
    print()

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.271326,2790,0.004553,1.017067
male,0.262307,2844,-0.004467,0.983257




seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.236915,4719,-0.029859,0.888075
1,0.420765,915,0.153992,1.577239




partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.32936,2936,0.062587,1.234606
yes,0.198666,2698,-0.068107,0.744699




dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.314819,3961,0.048046,1.180102
yes,0.153019,1673,-0.113755,0.57359




phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.259887,531,-0.006886,0.974187
yes,0.26749,5103,0.000717,1.002686




multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.249815,2702,-0.016958,0.936432
no_phone_service,0.259887,531,-0.006886,0.974187
yes,0.28738,2401,0.020607,1.077246




internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.195607,1912,-0.071166,0.733232
fiber_optic,0.416934,2492,0.150161,1.562879
no,0.073171,1230,-0.193602,0.274281




onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.423271,2776,0.156498,1.586632
no_internet_service,0.073171,1230,-0.193602,0.274281
yes,0.146192,1628,-0.120582,0.548




onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.40228,2456,0.135507,1.507948
no_internet_service,0.073171,1230,-0.193602,0.274281
yes,0.218172,1948,-0.048601,0.81782




deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.388532,2494,0.121759,1.456415
no_internet_service,0.073171,1230,-0.193602,0.274281
yes,0.232461,1910,-0.034312,0.87138




techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.414932,2786,0.148159,1.555373
no_internet_service,0.073171,1230,-0.193602,0.274281
yes,0.158838,1618,-0.107935,0.595405




streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.331548,2241,0.064775,1.24281
no_internet_service,0.073171,1230,-0.193602,0.274281
yes,0.309755,2163,0.042982,1.161117




streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.334081,2230,0.067308,1.252303
no_internet_service,0.073171,1230,-0.193602,0.274281
yes,0.307268,2174,0.040495,1.151794




contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.428295,3096,0.161521,1.605463
one_year,0.113882,1203,-0.152891,0.426887
two_year,0.029963,1335,-0.236811,0.112315




paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.161206,2289,-0.105567,0.60428
yes,0.339013,3345,0.07224,1.270793




paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.17081,1247,-0.095963,0.640282
credit_card_(automatic),0.153344,1226,-0.113429,0.574811
electronic_check,0.455319,1880,0.188546,1.706765
mailed_check,0.192037,1281,-0.074736,0.719853






## Mutual information

In [345]:
from sklearn.metrics import mutual_info_score

mutual_info_score(df_full_train.churn, df_full_train.contract)

0.09719123743715483

In [346]:
def mutual_info_churn_score(series):
    return mutual_info_score(df_full_train.churn, series)

In [347]:
df_full_train[categorical].apply(mutual_info_churn_score).sort_values(ascending=False)

contract            0.097191
onlinesecurity      0.066950
techsupport         0.061596
internetservice     0.054685
onlinebackup        0.047715
paymentmethod       0.044416
deviceprotection    0.043309
streamingmovies     0.032688
streamingtv         0.032579
paperlessbilling    0.020379
dependents          0.015023
partner             0.011032
seniorcitizen       0.010919
multiplelines       0.000824
gender              0.000052
phoneservice        0.000013
dtype: float64

In [348]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.352374
monthlycharges    0.196097
totalcharges     -0.196629
dtype: float64

In [349]:
df_full_train[numerical].corrwith(df_full_train.churn).abs()

tenure            0.352374
monthlycharges    0.196097
totalcharges      0.196629
dtype: float64

## One hot encoding

In [350]:
from sklearn.feature_extraction import DictVectorizer


In [351]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [352]:
dv = DictVectorizer()


In [400]:
X_train = dv.fit_transform(train_dicts)

X_train

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.80000e+01, 1.77995e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.50000e+01, 1.42640e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        6.00000e+01, 3.02740e+03],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        6.90000e+01, 3.80440e+03],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.50000e+01, 1.74855e+03],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.60000e+01, 1.29540e+03]])

In [354]:

val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [355]:
X_val = dv.transform(val_dicts)

# Training with logistic regression

In [356]:
from sklearn.linear_model import LogisticRegression

In [357]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [358]:
model.coef_[0].round(3)

array([ 0.3  , -0.199, -0.224,  0.089, -0.212,  0.061, -0.095, -0.089,
       -0.024, -0.099, -0.292,  0.264, -0.095,  0.002, -0.242,  0.055,
        0.064,  0.092, -0.095, -0.12 ,  0.235, -0.095, -0.263, -0.321,
        0.198, -0.059, -0.064, -0.063, -0.106,  0.262, -0.216,  0.055,
       -0.178,  0.273, -0.103, -0.095,  0.075, -0.079, -0.095,  0.051,
        0.275, -0.095, -0.302, -0.069,  0.   ])

In [359]:
model.intercept_[0]

-0.12336671862190063

In [360]:
y_pred = model.predict_proba(X_val)[:,1]

In [361]:
churn_decision = (y_pred >= 0.5)

In [362]:
churn_decision

array([False, False,  True, ..., False, False,  True])

In [363]:
df_val[churn_decision].customerid

2       6599-sfqve
12      0439-ifyun
13      8808-eleho
15      0529-onker
19      1195-oiyej
           ...    
1392    4445-zjnmu
1395    3158-moerk
1398    5271-dbysj
1404    8714-ctzjw
1408    3999-qgrjh
Name: customerid, Length: 296, dtype: object

In [364]:
churn_decision.astype(int)

array([0, 0, 1, ..., 0, 0, 1])

In [365]:
(y_val == churn_decision).mean()

0.794180269694819

In [366]:
df_pred = pd.DataFrame()

df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val

In [367]:
df_pred['correct'] = (df_pred.prediction == df_pred.actual)

In [368]:
df_pred.correct.mean()

0.794180269694819

## Model interpretation

In [369]:
dv.get_feature_names()



['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

In [370]:
dict(zip(dv.get_feature_names(), model.coef_[0]))

{'contract=month-to-month': 0.29997256869083005,
 'contract=one_year': -0.19850411260770393,
 'contract=two_year': -0.2242955980389297,
 'dependents=no': 0.08918175983883296,
 'dependents=yes': -0.21200890180601822,
 'deviceprotection=no': 0.061089431346052686,
 'deviceprotection=no_internet_service': -0.095333495864987,
 'deviceprotection=yes': -0.08858307741508799,
 'gender=female': -0.023607194725614088,
 'gender=male': -0.09921994722236556,
 'internetservice=dsl': -0.2918378709391961,
 'internetservice=fiber_optic': 0.26434422486304915,
 'internetservice=no': -0.095333495864987,
 'monthlycharges': 0.0017544786537899077,
 'multiplelines=no': -0.2419288191936326,
 'multiplelines=no_phone_service': 0.05534175899607861,
 'multiplelines=yes': 0.06375991822061612,
 'onlinebackup=no': 0.09239387197726096,
 'onlinebackup=no_internet_service': -0.095333495864987,
 'onlinebackup=yes': -0.11988751804631324,
 'onlinesecurity=no': 0.23532212505785402,
 'onlinesecurity=no_internet_service': -0.0

In [371]:
small = ['contract', 'tenure', 'monthlycharges']

In [372]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [373]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)


In [374]:
dv_small.get_feature_names()



['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'monthlycharges',
 'tenure']

In [375]:
X_train_small = dv_small.transform(dicts_train_small)

In [376]:
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

In [377]:
model_small.intercept_[0]

-2.7595255616483074

In [378]:
model_small.coef_[0].round(3)

array([ 1.064, -0.125, -0.937,  0.03 , -0.035])

In [379]:
dict(zip(dv_small.get_feature_names(),model_small.coef_[0].round(3) ))



{'contract=month-to-month': 1.064,
 'contract=one_year': -0.125,
 'contract=two_year': -0.937,
 'monthlycharges': 0.03,
 'tenure': -0.035}

## Using the model

In [381]:
dicts_full_train= df_full_train[categorical + numerical].to_dict(orient='records')

In [383]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)
y_full_train = df_full_train.churn.values

In [384]:
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [386]:
dicts_test = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(dicts_test)
y_pred = model.predict_proba(X_test)[:, 1]

In [387]:
churn_decision = (y_pred >= 0.5)

In [388]:
(churn_decision == y_test).mean()

0.8026969481902059

In [390]:
customer = dicts_test[10]
customer

{'gender': 'female',
 'seniorcitizen': 1,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'credit_card_(automatic)',
 'tenure': 43,
 'monthlycharges': 108.15,
 'totalcharges': 4600.7}

In [398]:
X_customer = dv.transform(customer)

In [399]:
model.predict_proba(X_customer)[0][1]

0.24173263271812367

## The customer above is not expected to churn