In [1]:
# MACHINE LEARNING FOR CLASSIFICATION

In [2]:
# DATA PREPARATION

# import library

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
# Load the raw Telco churn export; the path is relative to the notebook's
# working directory. Preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# The frame is too wide to show in full, so flip the preview:
# one line per column, one column per sample row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
# Column names exactly as loaded (mixed case), before any normalisation
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
# Make every column name lowercase and swap spaces for underscores,
# then display the resulting index to confirm the rename.
normalized_names = df.columns.str.lower().str.replace(' ', '_')
df.columns = normalized_names
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [7]:
# Apply the same normalisation to the cell values: every object-dtype
# (string) column is lowercased and its spaces become underscores.
# Note that a value consisting of a single space turns into '_'.
strings = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for column in strings:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [8]:
# Re-check the first rows after normalising names and string values
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [9]:
# seniorcitizen is stored as int64 (0/1) while the other yes/no fields are strings;
# totalcharges was read as object even though it holds amounts, so it has to be
# converted to float before it can be used as a numeric feature
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [10]:
# A strict conversion fails because at least one totalcharges entry is not a
# number. The error is the point of this cell, so it is caught and printed
# rather than raised; that way "Restart & Run All" does not stop here.
try:
    pd.to_numeric(df.totalcharges)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Unable to parse string "_" at position 488

In [11]:
df.totalcharges[488] # the entry at the position reported by the error is '_', not a number;
# the earlier cleanup replaced every space in the string columns with '_', so this was
# presumably a blank (single-space) TotalCharges in the raw file, i.e. a missing value

'_'

In [12]:
# Parse totalcharges leniently: with errors='coerce' any value that cannot be
# parsed as a number becomes NaN instead of raising. The result is kept in a
# separate Series (tc); df itself is not modified here.
tc = pd.to_numeric(df['totalcharges'], errors='coerce')

In [13]:
# Count the entries that could not be parsed (they are NaN in tc);
# the matching rows are listed in the next cell
tc.isna().sum()

11

In [14]:
# Rows whose totalcharges failed to parse, selected in a single .loc call
# (row mask + column list) instead of chained indexing
df.loc[tc.isnull(), ['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [15]:
# Store totalcharges as a real numeric column.
# The column still holds the original strings (including '_' placeholders), so
# calling fillna(0) on it directly changes nothing: there are no NaN values to
# fill and the dtype stays object. Parse first (unparseable entries -> NaN),
# then replace those NaN with 0 and write the result back to the frame.
# Re-running this cell is safe: an already numeric column passes through unchanged.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

In [16]:
# Peek at the target column: at this point it still holds 'yes'/'no' strings
df['churn'].head()

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [17]:
# Encode the target as integers for modelling: 'yes' -> 1, anything else -> 0.
# NOTE: running this cell a second time would compare the integers against
# 'yes' and turn every value into 0.
is_churned = df['churn'].eq('yes')
df['churn'] = is_churned.astype(int)

In [18]:
# Transposed preview of the frame after the cleaning steps
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [19]:
# SETTING UP VALIDATION FRAMEWORK
# using scikit-learn
from sklearn.model_selection import train_test_split

In [30]:
# Hold out 20% of the rows as the final test set; the fixed random_state
# makes the split reproducible between runs.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [33]:
# Split the remaining rows into training and validation parts.
# 25% of the 80% left after the test split is 20% of the full dataset,
# so validation and test end up the same size.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [34]:
# Row counts of the train / validation / test splits, in that order
tuple(len(split) for split in (df_train, df_val, df_test))

(4225, 1409, 1409)

In [35]:
# The splits keep the shuffled row labels of the original frame; replace
# them with a fresh 0..n-1 index and discard the old labels.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [36]:
# Pull the churn target out of each split as a NumPy array
y_train, y_val, y_test = (
    split['churn'].values for split in (df_train, df_val, df_test)
)

In [37]:
# Remove the target column from every feature frame (in place) so it
# cannot be used as an input feature by mistake.
for _split in (df_train, df_val, df_test):
    del _split['churn']

In [39]:
# Transposed preview of the training features (churn column removed)
df_train.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,8015-ihcgw,1960-uycnn,9250-wypll,6786-obwqr,1328-euzhc
gender,female,male,female,female,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,yes,yes
dependents,yes,no,no,yes,no
tenure,72,10,5,5,18
phoneservice,yes,yes,yes,yes,yes
multiplelines,yes,yes,yes,no,no
internetservice,fiber_optic,fiber_optic,fiber_optic,fiber_optic,no
onlinesecurity,yes,no,no,no,no_internet_service


In [40]:
# Transposed preview of the validation features (churn column removed)
df_val.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,5846-neqvz,3645-deygf,3590-tcxtb,8433-wxgna,2654-vbvpb
gender,male,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,yes,yes,no,no,no
tenure,71,1,1,2,1
phoneservice,yes,yes,yes,yes,yes
multiplelines,no,no,no,no,no
internetservice,dsl,no,no,fiber_optic,no
onlinesecurity,yes,no_internet_service,no_internet_service,yes,no_internet_service


In [41]:
# Transposed preview of the test features (churn column removed)
df_test.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,8879-zkjof,0201-mibol,1600-dilpe,8601-qacrs,7919-zodzz
gender,female,female,female,female,female
seniorcitizen,0,1,0,0,0
partner,no,no,no,no,yes
dependents,no,no,no,no,yes
tenure,41,66,12,5,10
phoneservice,yes,yes,yes,yes,yes
multiplelines,no,yes,no,yes,no
internetservice,dsl,fiber_optic,dsl,dsl,dsl
onlinesecurity,yes,yes,no,no,no


In [42]:
# Target array for the training split (0/1 churn flags)
y_train

array([0, 0, 1, ..., 1, 0, 1])

In [44]:
# Target array for the validation split (0/1 churn flags)
y_val

array([0, 0, 0, ..., 0, 1, 1])

In [43]:
# Target array for the test split (0/1 churn flags)
y_test

array([0, 0, 0, ..., 0, 0, 1])