In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('A2.csv')

In [3]:
len(df)

8000

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,Churn,Geography,CreditScore,Surname,EstimatedSalary,MonthlyCharges,customerID,Dependents,PaymentMethod,Charge
0,0,Female,0.0,Yes,72.0,Yes,No,Fiber optic,Yes,Yes,...,No,Germany,602.0,Klein,45038.29,108.3,6519-ZHPXP,Yes,Bank transfer (automatic),108.3
1,1,Male,0.0,Yes,30.0,Yes,Yes,DSL,No,No,...,No,Germany,776.0,Lung,89893.6,49.9,1958-RNRKS,No,Electronic check,49.9
2,2,Male,0.0,Yes,49.0,No,No phone service,DSL,Yes,No,...,No,Germany,709.0,Lucciano,11.58,39.2,7989-AWGEH,Yes,Electronic check,39.2
3,3,Female,1.0,Yes,25.0,Yes,Yes,DSL,Yes,No,...,No,France,788.0,O'Brien,116978.19,69.5,6575-SUVOI,No,Credit card (automatic),69.5
4,4,Female,1.0,Yes,72.0,Yes,Yes,Fiber optic,Yes,Yes,...,No,Germany,733.0,Medvedeva,177994.81,114.65,1779-PWPMG,No,Bank transfer (automatic),114.65


In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
gender,Female,Male,Male,Female,Female
SeniorCitizen,0.0,0.0,0.0,1.0,1.0
Partner,Yes,Yes,Yes,Yes,Yes
tenure,72.0,30.0,49.0,25.0,72.0
PhoneService,Yes,Yes,No,Yes,Yes
MultipleLines,No,Yes,No phone service,Yes,Yes
InternetService,Fiber optic,DSL,DSL,DSL,Fiber optic
OnlineSecurity,Yes,No,Yes,Yes,Yes
OnlineBackup,Yes,No,No,No,Yes


In [6]:
df.dtypes

Unnamed: 0            int64
gender               object
SeniorCitizen       float64
Partner              object
tenure              float64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
TotalCharges         object
Churn                object
Geography            object
CreditScore         float64
Surname              object
EstimatedSalary     float64
MonthlyCharges      float64
customerID           object
Dependents           object
PaymentMethod        object
Charge              float64
dtype: object

In [7]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [8]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [9]:
df.churn = (df.churn == 'yes').astype(int)

In [10]:
df.head().T

Unnamed: 0,0,1,2,3,4
unnamed:_0,0,1,2,3,4
gender,female,male,male,female,female
seniorcitizen,0.0,0.0,0.0,1.0,1.0
partner,yes,yes,yes,yes,yes
tenure,72.0,30.0,49.0,25.0,72.0
phoneservice,yes,yes,no,yes,yes
multiplelines,no,yes,no_phone_service,yes,yes
internetservice,fiber_optic,dsl,dsl,dsl,fiber_optic
onlinesecurity,yes,no,yes,yes,yes
onlinebackup,yes,no,no,no,yes


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)
y_train = df_train.churn.values
y_val = df_val.churn.values

del df_train['churn']
del df_val['churn']

In [13]:
df_train_full.isnull().sum()

unnamed:_0           0
gender              48
seniorcitizen       36
partner             36
tenure              55
phoneservice        42
multiplelines       55
internetservice     44
onlinesecurity      46
onlinebackup        32
deviceprotection    34
techsupport         38
streamingtv         39
streamingmovies     44
contract            42
paperlessbilling    37
totalcharges         0
churn                0
geography           42
creditscore         52
surname             46
estimatedsalary     41
monthlycharges      23
customerid           0
dependents           0
paymentmethod        0
charge               0
dtype: int64

In [14]:
df_train_full.churn.value_counts()

0    4739
1    1661
Name: churn, dtype: int64

In [15]:
global_mean = df_train_full.churn.mean()
round(global_mean, 3)

0.26

In [16]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [17]:
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [18]:
female_mean = df_train_full[df_train_full.gender == 'female'].churn.mean()
print('gender == female:', round(female_mean, 3))

male_mean = df_train_full[df_train_full.gender == 'male'].churn.mean()
print('gender == male:  ', round(male_mean, 3))

gender == female: 0.269
gender == male:   0.251


In [20]:
female_mean / global_mean

1.037797853676713

In [21]:
male_mean / global_mean

0.9683750038202138

In [22]:
partner_yes = df_train_full[df_train_full.partner == 'yes'].churn.mean()
print('partner == yes:', round(partner_yes, 3))

partner_no = df_train_full[df_train_full.partner == 'no'].churn.mean()
print('partner == no :', round(partner_no, 3))

partner == yes: 0.194
partner == no : 0.321


In [23]:
partner_yes / global_mean

0.7481384844614314

In [24]:
partner_no / global_mean

1.2385385700218805

In [25]:
df_group = df_train_full.groupby(by='gender').churn.agg(['mean'])
df_group['diff'] = df_group['mean'] - global_mean
df_group['risk'] = df_group['mean'] / global_mean
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.269341,0.00981,1.037798
male,0.251324,-0.008208,0.968375


In [26]:
from IPython.display import display

In [27]:
global_mean = df_train_full.churn.mean()
global_mean

0.25953125

In [28]:
for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.269341,0.00981,1.037798
male,0.251324,-0.008208,0.968375


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.232571,-0.02696,0.89612
1.0,0.401751,0.14222,1.547987


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.321439,0.061908,1.238539
yes,0.194165,-0.065366,0.748138


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.305499,0.045968,1.17712
yes,0.154242,-0.10529,0.594309


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.238636,-0.020895,0.91949
yes,0.261755,0.002224,1.00857


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.245837,-0.013694,0.947236
no_phone_service,0.241042,-0.018489,0.92876
yes,0.279985,0.020454,1.07881


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.187813,-0.071719,0.723661
fiber_optic,0.410137,0.150605,1.580298
no,0.070545,-0.188986,0.271819


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.41092,0.151388,1.583314
no_internet_service,0.069666,-0.189865,0.268431
yes,0.143709,-0.115822,0.553726


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.39118,0.131648,1.507254
no_internet_service,0.069515,-0.190016,0.267848
yes,0.211101,-0.04843,0.813393


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.384864,0.125332,1.482919
no_internet_service,0.069192,-0.19034,0.266602
yes,0.219955,-0.039577,0.847507


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.415052,0.15552,1.599236
no_internet_service,0.070188,-0.189343,0.270442
yes,0.141189,-0.118342,0.544015


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.326078,0.066547,1.256413
no_internet_service,0.070443,-0.189088,0.271424
yes,0.298331,0.0388,1.1495


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.330645,0.071114,1.274009
no_internet_service,0.070494,-0.189037,0.271621
yes,0.2924,0.032869,1.126647


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.421098,0.161567,1.622533
one_year,0.11053,-0.149001,0.425884
two_year,0.025974,-0.233557,0.100081


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.161839,-0.097693,0.62358
yes,0.327239,0.067708,1.260885


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.17,-0.089531,0.655027
credit_card_(automatic),0.148432,-0.111099,0.571924
electronic_check,0.444871,0.185339,1.714131
mailed_check,0.190669,-0.068862,0.734668


In [31]:
from sklearn.metrics import mutual_info_score


In [32]:
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.churn)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
display(df_mi.tail())

ValueError: Input contains NaN

In [33]:
df_train_full[numerical].corrwith(df_train_full.churn).to_frame('correlation')

Unnamed: 0,correlation
tenure,-0.349116
monthlycharges,0.190697
totalcharges,-0.1965


In [34]:
df_train_full.groupby(by='churn')[numerical].mean()

Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37.60481,61.439589,2550.07816
1,17.986027,74.530827,1528.268423


In [35]:
from sklearn.feature_extraction import DictVectorizer

In [36]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [37]:
train_dict[0]

{'gender': 'female',
 'seniorcitizen': 0.0,
 'partner': 'no',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'no',
 'onlinesecurity': 'no_internet_service',
 'onlinebackup': 'no_internet_service',
 'deviceprotection': 'no_internet_service',
 'techsupport': 'no_internet_service',
 'streamingtv': 'no_internet_service',
 'streamingmovies': 'no_internet_service',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'tenure': 1.0,
 'monthlycharges': 19.9,
 'totalcharges': 19.9}

In [44]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_train.shape

(4288, 58)

In [39]:
dv.get_feature_names()



['contract',
 'contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender',
 'gender=female',
 'gender=male',
 'internetservice',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice',
 'phoneservice=no',
 'phones

In [40]:
from sklearn.linear_model import LogisticRegression

In [45]:
df_train = df_train.dropna()

In [46]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [47]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3770 entries, 3361 to 1533
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   unnamed:_0        3770 non-null   int64  
 1   gender            3770 non-null   object 
 2   seniorcitizen     3770 non-null   float64
 3   partner           3770 non-null   object 
 4   tenure            3770 non-null   float64
 5   phoneservice      3770 non-null   object 
 6   multiplelines     3770 non-null   object 
 7   internetservice   3770 non-null   object 
 8   onlinesecurity    3770 non-null   object 
 9   onlinebackup      3770 non-null   object 
 10  deviceprotection  3770 non-null   object 
 11  techsupport       3770 non-null   object 
 12  streamingtv       3770 non-null   object 
 13  streamingmovies   3770 non-null   object 
 14  contract          3770 non-null   object 
 15  paperlessbilling  3770 non-null   object 
 16  totalcharges      3770 non-null   float