In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/Telco-Customer-Churn.csv")
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [3]:
df.drop("customerID", axis = 1, inplace=True)

In [4]:
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
## Total charges column contains non-numeric data

df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.TotalCharges = df.TotalCharges.fillna(0)

In [6]:
# lowercasing column names and string column values

df.columns = df.columns.str.lower().str.replace(" ", "_")

string_columns = list(df.dtypes[df.dtypes == "object"].index)
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(" ", "_")


In [7]:
df.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

In [8]:
df.churn.value_counts()

no     5174
yes    1869
Name: churn, dtype: int64

In [9]:
# Converting churn categories to 0 and 1
df.churn = (df.churn == "yes").astype(int)

In [10]:
df.churn.value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [11]:
df.seniorcitizen = df.seniorcitizen.astype("object")

In [12]:
# Train test and val split
from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

y_train = df_train.churn.values
y_val = df_val.churn.values

del df_train["churn"]
del df_val["churn"]

In [13]:
# EDA

df_train.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
dtype: int64

In [14]:
# distribution of the target variable

df_train_full.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [15]:
# churn rate

df_train_full.churn.value_counts()[1]/len(df_train_full.churn)

0.26996805111821087

In [16]:
df_train_full.churn.mean()

0.26996805111821087

In [17]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
'phoneservice', 'multiplelines', 'internetservice',
'onlinesecurity', 'onlinebackup', 'deviceprotection',
'techsupport', 'streamingtv', 'streamingmovies',
'contract', 'paperlessbilling', 'paymentmethod']

numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [18]:
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### 3.14 Feature Importance

In [19]:
# Gender
df_train_full.groupby("gender")["churn"].mean().round(3)

gender
female    0.277
male      0.263
Name: churn, dtype: float64

In [20]:
# Partner
df_train_full.groupby("partner")["churn"].mean().round(3)

partner
no     0.330
yes    0.205
Name: churn, dtype: float64

### Risk Ratio

In [21]:
df_train_full.churn.mean()

0.26996805111821087

In [22]:
# Female risk rate
27.7/26.99

1.026306039273805

In [23]:
df_train_full.groupby("gender")['churn'].agg(["mean"])

Unnamed: 0_level_0,mean
gender,Unnamed: 1_level_1
female,0.276824
male,0.263214


In [24]:
global_mean = df_train_full.churn.mean()

for col in categorical:
    df_group = df_train_full.groupby(col)['churn'].agg(["mean"])
    df_group["diff"] = df_group["mean"] - global_mean
    df_group["rate"] = df_group["mean"] / global_mean
    print(df_group)
    

            mean      diff      rate
gender                              
female  0.276824  0.006856  1.025396
male    0.263214 -0.006755  0.974980
                   mean      diff      rate
seniorcitizen                              
0              0.242270 -0.027698  0.897403
1              0.413377  0.143409  1.531208
             mean      diff      rate
partner                              
no       0.329809  0.059841  1.221659
yes      0.205033 -0.064935  0.759472
                mean      diff      rate
dependents                              
no          0.313760  0.043792  1.162212
yes         0.165666 -0.104302  0.613651
                  mean      diff      rate
phoneservice                              
no            0.241316 -0.028652  0.893870
yes           0.273049  0.003081  1.011412
                      mean      diff      rate
multiplelines                                 
no                0.257407 -0.012561  0.953474
no_phone_service  0.241316 -0.028652  0.893870


### Mutual Information

In [25]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    return mutual_info_score(series, df_train_full.churn)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name="MI")
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [26]:
# Correlation
df_train_full[numerical].corrwith(df_train_full['churn'])

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

### 3.2 Feature engineering

In [27]:
df_train[categorical + numerical]

Unnamed: 0,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,tenure,monthlycharges,totalcharges
2935,male,0,yes,no,yes,no,dsl,yes,yes,yes,yes,yes,yes,two_year,yes,bank_transfer_(automatic),71,86.10,6045.90
3639,female,1,yes,no,yes,yes,fiber_optic,no,no,yes,no,yes,yes,one_year,yes,credit_card_(automatic),60,100.50,6029.00
2356,male,0,no,no,yes,no,dsl,no,no,no,no,no,no,month-to-month,yes,credit_card_(automatic),46,45.20,2065.15
6660,male,0,yes,no,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,1,69.15,69.15
755,male,1,no,no,yes,yes,fiber_optic,no,no,yes,no,yes,yes,month-to-month,yes,electronic_check,20,98.55,1842.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,male,0,yes,no,yes,no,fiber_optic,yes,yes,yes,yes,yes,yes,two_year,yes,credit_card_(automatic),63,110.10,6705.70
6273,female,1,no,no,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,2,70.65,142.35
3790,male,0,no,no,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,one_year,no,credit_card_(automatic),2,19.30,28.30
5712,female,0,no,no,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,month-to-month,yes,bank_transfer_(automatic),23,20.30,470.60


In [28]:
train_dict = df_train[categorical + numerical].to_dict(orient = "records")
train_dict

[{'gender': 'male',
  'seniorcitizen': 0,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'yes',
  'deviceprotection': 'yes',
  'techsupport': 'yes',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'two_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'bank_transfer_(automatic)',
  'tenure': 71,
  'monthlycharges': 86.1,
  'totalcharges': 6045.9},
 {'gender': 'female',
  'seniorcitizen': 1,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'yes',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'no',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'one_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'credit_card_(automatic)',
  'tenure': 60,
  'monthlycharges': 100.5,
  'totalcharges': 6029.0},
 {'gender':

In [29]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse = False)
dv.fit(train_dict)

x_train = dv.transform(train_dict)
x_train[0]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])

In [30]:
dv.get_feature_names()



['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

### Machine learning for classification

In [31]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver = "liblinear", random_state=1)
model.fit(x_train , y_train)

In [32]:
val_dict = df_val[categorical + numerical].to_dict(orient = "records")
x_val = dv.transform(val_dict)

In [33]:
y_pred = model.predict(x_val)

In [34]:
y_pred = model.predict_proba(x_val)[:, 1]
y_pred

array([0.23491043, 0.26886416, 0.31945136, ..., 0.05725221, 0.61523005,
       0.0612727 ])

In [35]:
churn = y_pred >= 0.5

(y_val == churn).mean()

0.8016129032258065

In [36]:
dict(zip(dv.get_feature_names(), model.coef_[0].round(3)))



{'contract=month-to-month': 0.563,
 'contract=one_year': -0.086,
 'contract=two_year': -0.599,
 'dependents=no': -0.03,
 'dependents=yes': -0.092,
 'deviceprotection=no': 0.1,
 'deviceprotection=no_internet_service': -0.116,
 'deviceprotection=yes': -0.106,
 'gender=female': -0.027,
 'gender=male': -0.095,
 'internetservice=dsl': -0.323,
 'internetservice=fiber_optic': 0.317,
 'internetservice=no': -0.116,
 'monthlycharges': 0.001,
 'multiplelines=no': -0.168,
 'multiplelines=no_phone_service': 0.127,
 'multiplelines=yes': -0.081,
 'onlinebackup=no': 0.136,
 'onlinebackup=no_internet_service': -0.116,
 'onlinebackup=yes': -0.142,
 'onlinesecurity=no': 0.258,
 'onlinesecurity=no_internet_service': -0.116,
 'onlinesecurity=yes': -0.264,
 'paperlessbilling=no': -0.213,
 'paperlessbilling=yes': 0.091,
 'partner=no': -0.048,
 'partner=yes': -0.074,
 'paymentmethod=bank_transfer_(automatic)': -0.027,
 'paymentmethod=credit_card_(automatic)': -0.136,
 'paymentmethod=electronic_check': 0.175,


# Model deployment

In [49]:
customer = {
'gender': 'female',
'seniorcitizen': 0,
'partner': 'no',
'dependents': 'no',
'tenure': 41,
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'dsl',
'onlinesecurity': 'yes',
'onlinebackup': 'no',
'deviceprotection': 'yes',
'techsupport': 'yes',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'one_year',
'paperlessbilling': 'yes',
'paymentmethod': 'bank_transfer_(automatic)',
'monthlycharges': 79.85,
'totalcharges': 3320.75,
}

In [50]:
dv.transform([customer])

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.98500e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 4.10000e+01, 3.32075e+03]])

In [51]:
def predict_single(customer, dv, model):
    X = dv.transform([customer])
    y_pred = model.predict_proba(X)[:, 1]
    return y_pred[0]

In [52]:
predict_single(customer, dv, model)

0.07332292583970206

### Saving a pickle file

In [53]:
import pickle

# Saving model in binary file
with open('churn-model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [54]:
# Opening model 

with open('churn-model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [55]:
customer = {
'gender': 'female',
'seniorcitizen': 0,
'partner': 'no',
'dependents': 'no',
'tenure': 41,
'phoneservice': 'yes',
'multiplelines': 'no',
'internetservice': 'dsl',
'onlinesecurity': 'yes',
'onlinebackup': 'no',
'deviceprotection': 'yes',
'techsupport': 'yes',
'streamingtv': 'yes',
'streamingmovies': 'yes',
'contract': 'one_year',
'paperlessbilling': 'yes',
'paymentmethod': 'bank_transfer_(automatic)',
'monthlycharges': 79.85,
'totalcharges': 3320.75
}

In [56]:
prediction = predict_single(customer, dv, model)

print("prediction: %.3f" % prediction)

if prediction >= 0.5:
    print('verdict: Churn')
else:
    print('verdict: Not churn')

prediction: 0.073
verdict: Not churn
