Chapter 03 - Binary classification
==================================

In [71]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer

### 0. Functions

In [2]:
def normalize_columns(df):
    """Return a frame whose column labels are lower-cased with spaces as underscores.

    The input frame is left untouched: the normalised labels are attached to a
    new DataFrame object, so re-running a cell that calls this helper never
    changes a frame that an earlier cell already displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        Same data as ``df`` with normalised column labels.
    """
    # regex=False: the space is a literal character, not a pattern.
    normalized_labels = df.columns.str.lower().str.replace(' ', '_', regex=False)
    return df.set_axis(normalized_labels, axis='columns')

def normalize_values(df):
    """Return a copy of ``df`` with every object-dtype column lower-cased.

    For each column whose dtype is ``object`` the values are lower-cased and
    spaces are replaced by underscores. Columns of any other dtype are passed
    through unchanged. The work is done on a copy, so the caller's frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise.

    Returns
    -------
    pandas.DataFrame
        A new frame with normalised string values.
    """
    normalized = df.copy()
    str_columns = list(normalized.dtypes[normalized.dtypes == 'object'].index)
    for col in str_columns:
        # regex=False: the space is a literal character, not a pattern.
        normalized[col] = (
            normalized[col].str.lower().str.replace(' ', '_', regex=False)
        )
    return normalized


### 1. Data load

In [3]:
# Load the Telco churn CSV, normalise headers and string values, then:
#   * churn        -> 1 where the normalised value is 'yes', otherwise 0
#   * totalcharges -> numeric; entries that cannot be parsed become NaN
#                     (errors='coerce') and are then filled with 0
df_telco = (
    pd.read_csv('data/telco_churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    .pipe(normalize_columns)
    .pipe(normalize_values)
    .assign(
        churn=lambda frame: (frame['churn'] == 'yes').astype(int),
        totalcharges=lambda frame: pd.to_numeric(
            frame['totalcharges'], errors='coerce'
        ).fillna(0),
    )
)

df_telco.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


In [4]:
# Show the dtype of every column after loading and cleaning.
df_telco.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

### 2. Training

In [5]:
# Hold out 20% of the rows as the test set; the remaining 80% is the full
# training set. random_state is fixed so the split is the same on every run.
df_train_full, df_test = train_test_split(df_telco, test_size=0.2, random_state=1)

In [6]:
# Split the full training set again: 33% of it becomes the validation set,
# the rest is the data actually used for fitting.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Pull the target out of each feature frame: pop() returns the 'churn'
# column and removes it from the frame in the same step.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [7]:
### 3.1.3 Exploratory data analysis
# Count the missing values in each column of the full training set.
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [8]:
# Target value distribution: number of rows per churn label (0 / 1)
# in the full training set.
df_train_full['churn'].value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [9]:
# churn is encoded as 0/1, so its mean is the share of churned customers
# in the full training set.
train_mean = df_train_full['churn'].mean()
train_mean

0.26996805111821087

In [10]:
# Feature groups used by the analysis and encoding cells.
# Columns treated as categorical.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Columns treated as numerical.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [11]:
# Number of distinct values in each categorical column.
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [12]:
# Churn rate among rows whose gender is 'female'.
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean()
female_mean

0.27682403433476394

In [13]:
# Churn rate among rows whose gender is 'male'.
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean()
male_mean

0.2632135306553911

In [14]:
# Make categorical columns categories
# for category in categorical:
#     df_train_full[category] = pd.Categorical(df_train_full[category])


# diff_points = .20
    
# # Loop through categories
# for category in categorical:
#     for cat_value in df_train_full[category].cat.categories:
#         cat_mean = df_train_full[df_train_full[category] == cat_value]['churn'].mean()
#         if cat_mean / train_mean > 1.2:
#             print(f"{category} {cat_value} {cat_mean:.2f}")

# Compare category with churn mean

# Print comparison if outside of n% difference

In [80]:
# Overall churn rate of the full training set, used as the baseline below.
churn_global_mean = df_train_full['churn'].mean()
print(f'{churn_global_mean=:.5f}')

# Reshape to one row per (customer, categorical variable), then summarise
# each (variable, value) pair:
#   group_size        - number of rows in the group
#   churn_mean        - churn rate inside the group
#   churn_global_diff - group rate minus the overall rate
#   churn_risk        - group rate divided by the overall rate
# Only churn_mean and churn_risk are rounded (to 3 decimals).
df_churn_analysis = (
    df_train_full
    .melt(id_vars=['customerid', 'churn'], value_vars=categorical)
    .groupby(['variable', 'value'])
    .agg(group_size=('churn', 'count'), churn_mean=('churn', 'mean'))
    .assign(
        churn_global_diff=lambda stats: stats['churn_mean'] - churn_global_mean,
        churn_risk=lambda stats: stats['churn_mean'] / churn_global_mean,
    )
    .round({'churn_mean': 3, 'churn_risk': 3})
    # Optional follow-up steps, currently disabled:
    # .reset_index()
    # .sort_values(by=['churn_risk'], ascending=False)
)
df_churn_analysis

churn_global_mean=0.26997


Unnamed: 0_level_0,Unnamed: 1_level_0,group_size,churn_mean,churn_global_diff,churn_risk
variable,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
contract,month-to-month,3104,0.432,0.161733,1.599
contract,one_year,1186,0.121,-0.149395,0.447
contract,two_year,1344,0.028,-0.241694,0.105
dependents,no,3968,0.314,0.043792,1.162
dependents,yes,1666,0.166,-0.104302,0.614
deviceprotection,no,2473,0.396,0.125907,1.466
deviceprotection,no_internet_service,1221,0.078,-0.192163,0.288
deviceprotection,yes,1940,0.23,-0.039556,0.853
gender,female,2796,0.277,0.006856,1.025
gender,male,2838,0.263,-0.006755,0.975


In [66]:
# Mutual information between each categorical feature and churn
def calculate_mi(series):
    """Return the mutual information score of ``series`` against churn.

    The churn labels are read from the notebook-level ``df_train_full``
    frame, so ``series`` is expected to be one of its columns.
    """
    churn_labels = df_train_full['churn']
    return mutual_info_score(series, churn_labels)

# Score every categorical column and list the results from highest to lowest.
df_mi = (
    df_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame('MI')
)
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [None]:
# Correlation of each numerical feature with the churn label
(
    df_train_full[numerical]
    .corrwith(df_train_full['churn'])
    # Name the Series first so reset_index() yields a 'correlation' column.
    .rename('correlation')
    .reset_index()
)

Unnamed: 0,index,correlation
0,tenure,-0.351885
1,monthlycharges,0.196805
2,totalcharges,-0.196353


In [76]:
# Dictionary vectorizer
# Turn every training row into a {column: value} dict.
train_dict = df_train[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training rows; fit() returns the fitted
# vectorizer itself, and sparse=False makes transform() return a dense array.
dv = DictVectorizer(sparse=False).fit(train_dict)

# Convert the list of dicts into the feature matrix
X_train = dv.transform(train_dict)

# Peek at the first encoded row.
X_train[0]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 8.6100e+01, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 7.1000e+01, 6.0459e+03])

In [78]:
# Names of the columns of the encoded feature matrix, in matrix order.
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',