Chapter 03 - Binary classification
==================================

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

### 0. Functions

In [2]:
def normalize_columns(df):
    """Return a copy of ``df`` with lower-cased, underscore-separated column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched, so the function
        is safe to call repeatedly and to use inside ``.pipe`` chains.
    """
    # Work on a copy so the caller's frame is not renamed as a side effect.
    normalized = df.copy()
    # regex=False: the space is a literal character, not a pattern.
    normalized.columns = normalized.columns.str.lower().str.replace(' ', '_', regex=False)
    return normalized

def normalize_values(df):
    """Return a copy of ``df`` with text columns lower-cased and spaces turned into underscores.

    A column counts as text when its dtype is ``object`` or pandas' dedicated
    string dtype; every other column is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalize.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not overwritten as a side effect.
    normalized = df.copy()
    text_columns = [
        col
        for col, dtype in normalized.dtypes.items()
        # Checking only for 'object' would silently skip columns that use
        # the dedicated string dtype.
        if dtype == 'object' or isinstance(dtype, pd.StringDtype)
    ]
    for col in text_columns:
        # regex=False: the space is a literal character, not a pattern.
        normalized[col] = normalized[col].str.lower().str.replace(' ', '_', regex=False)
    return normalized


### 1. Data load

In [3]:
# Read the raw Telco churn CSV, then normalize the column names and the
# string values one step at a time.
df_telco = pd.read_csv('data/telco_churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df_telco = normalize_columns(df_telco)
df_telco = normalize_values(df_telco)

df_telco.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [4]:
# Inspect the dtype pandas inferred for each column.
df_telco.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [5]:
# Try to parse totalcharges as numbers; entries that cannot be parsed become NaN.
total_charges = pd.to_numeric(df_telco['totalcharges'], errors='coerce')

# Show the rows whose totalcharges failed to parse, using a single .loc
# lookup (row mask + column list) instead of chained indexing.
df_telco.loc[total_charges.isnull(), ['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [6]:
# Convert totalcharges to numbers: unparseable entries become NaN under
# errors='coerce' and are then replaced with 0.
df_telco['totalcharges'] = (
    pd.to_numeric(df_telco['totalcharges'], errors='coerce')
    .fillna(0)
)

In [7]:
# Encode the target as integers: 1 where churn == 'yes', 0 otherwise.
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# comparing it to 'yes' again would silently reset every label to 0.
if not pd.api.types.is_numeric_dtype(df_telco['churn']):
    df_telco['churn'] = (df_telco['churn'] == 'yes').astype(int)

In [8]:
# Hold out 20% of the rows as the test set (df_test); the remaining rows
# stay in df_train_full. random_state is fixed so the split is repeatable.
df_train_full, df_test = train_test_split(df_telco, test_size=0.2, random_state=1)

In [9]:
# Carve the validation set (33% of df_train_full) out of the full training data.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Detach the target from each feature frame: pop() removes the 'churn'
# column and returns it, so its values can be kept as a NumPy array.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [10]:
### 3.1.3 Exploratory data analysis
# Count the missing values in each column of the full training set.
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [11]:
# Target value distribution: number of rows per churn label (0 / 1)
# in the full training set.
df_train_full['churn'].value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [None]:
# Categorical columns

