In [11]:
import pandas as pd
import numpy as np

import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

In [2]:
# Read the Telco customer-churn CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Flip the preview so each column becomes a row; this way no column is
# hidden behind the "..." of the wide display
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


The data has the following columns:

- CustomerID -- the ID of the customer
- Gender -- male/female
- Senior Citizen -- whether the customer is a senior citizen (0/1)
- Partner -- whether they live with a partner (yes/no)
- Dependents -- whether they have dependents (yes/no)
- Tenure -- number of months since the start of the contract
- Phone service -- whether they have phone services (yes/no)
- Multiple lines -- whether they have multiple phone lines (yes/no/no phone service)
- Internet service -- the type of internet service (DSL/fiber optic/no)
- Online security -- if online security is enabled (yes/no/no internet)
- Online backup -- if online backup service is enabled (yes/no/no internet)
- Device protection -- if the device protection service is enabled (yes/no/no internet)
- Tech support -- if the customer has tech support (yes/no/no internet)
- Streaming TV -- if the TV streaming service is enabled (yes/no/no internet)
- Streaming movies -- if the movie streaming service is enabled (yes/no/no internet)
- Contract -- the type of contract (monthly/yearly/two years)
- Paperless billing -- if the billing is paperless (yes/no)
- Payment method -- payment method (electronic check, mailed check, bank transfer, credit card)
- Monthly charges -- the amount charged monthly (numeric)
- Total charges -- the total amount charged (numeric)
- Churn -- if the client has canceled the contract (yes/no)

In [4]:
# Inspect the dtype pandas inferred for every column; a numeric-looking
# column that shows up as `object` was read as text and needs converting
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# Parse TotalCharges as numbers. errors='coerce' turns every value that
# cannot be parsed into NaN instead of raising.
total_charegs = pd.to_numeric(df['TotalCharges'], errors='coerce')

# List the rows that failed to parse, which confirms the column contains
# non-numeric entries
df.loc[total_charegs.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [7]:
# Set the missing values to 0.
# TotalCharges is still stored as text (object dtype) and its blank entries are
# strings, not NaN, so calling fillna(0) directly on the column changes nothing.
# Convert to numeric first (unparsable entries become NaN), then fill with 0 and
# write the numeric result back to the DataFrame.
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce').fillna(0)

In [9]:
# Normalise the column names: lower-case, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Collect the text (object-dtype) columns and normalise their values the same way
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for col in string_columns:
    df[col] = (
        df[col]
        .str.lower()
        .str.replace(' ', '_')
    )

In [10]:
# Encode the target as an integer flag: 1 where churn is 'yes', 0 otherwise
df['churn'] = df['churn'].eq('yes').astype(int)

In [13]:
# Hold out 20% of the rows as the test set. random_state fixes the shuffle
# that happens before the split, so the result is the same on every run.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the remaining rows again: 33% of them become the validation set
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Take the target out of each frame as a NumPy array; pop() returns the
# churn column and removes it from the frame in the same step
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values