In [33]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Libraries used below may emit warnings; ignore every warning category for
# the rest of the session. Note that this also hides deprecation notices.
import warnings
warnings.simplefilter("ignore")

# Import dataset

In [34]:
# Read the dataset from the CSV file and display how many rows it contains.
df = pd.read_csv(filepath_or_buffer='ch3_dataset.csv')
df.shape[0]

7043

# Beginning examination of the data

In [35]:
# Preview the first five rows to see the columns and some sample values.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [36]:
# Normalize the column names: lowercase them and replace spaces with
# underscores (a literal replacement, not a regex), then display the result.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
)
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [37]:
# Show the dtype pandas assigned to each column, to spot columns that were
# read with an unexpected type.
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [38]:
# From the dtypes above: seniorcitizen is stored as an integer (int64), but it should be an object (it is a category, not a quantity).
# Also from above: totalcharges is stored as an object, but it should be numeric (float64).
# I tried to fix this before but did not know how to convert the data.

# Converting data types

In [39]:
# Parse totalcharges as numbers. Entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

In [42]:
# Lowercase the *values* of every text column and replace spaces with
# underscores, e.g. 'Fiber optic' becomes 'fiber_optic'.
# The column names are not touched here: they were already normalized in the
# earlier cell that assigns df.columns, so repeating that step is unnecessary.

# Names of the columns stored with the generic object dtype (the text columns).
string_columns = list(df.select_dtypes(include='object').columns)

for col in string_columns:
    # regex=False: the space is a literal character, not a pattern.
    df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

In [40]:
# Encode the churn target as integers: 1 for "yes", 0 for anything else.
#
# The comparison is case-insensitive, so the result is the same whether or not
# the string values have already been lowercased (this cell's execution count
# shows it was once run before the lowercasing cell).
#
# The dtype guard makes the cell safe to re-run: comparing an already-encoded
# 0/1 column with 'yes' would silently reset every label to 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = df['churn'].str.lower().eq('yes').astype(int)

In [43]:
# Show the first five rows transposed (one column per record), which is
# easier to read for a frame with this many columns.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


# Set up validation framework

In [44]:
from sklearn.model_selection import train_test_split

In [46]:
# First split: hold out 20% of the rows as the test set. The remaining 80%
# (df_train_full) is divided again into training and validation sets later.
# random_state is fixed so the split is reproducible.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [53]:
# Second split: take 25% of the full training set as the validation set and
# keep the rest for training. random_state is fixed for reproducibility.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=11,
)

In [54]:
# Row counts of the three datasets, in the order (train, validation, test).
tuple(map(len, (df_train, df_val, df_test)))

(4225, 1409, 1409)

In [55]:
# Give each split a fresh 0..n-1 index; drop=True discards the old row labels
# instead of keeping them as a new column.
df_train, df_val, df_test = [
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
]

In [56]:
# Separate the target from the features: pop() removes the churn column from
# each frame in place and returns it, and .values turns it into a NumPy array.
# After this cell the three frames no longer contain 'churn', so the cell
# cannot be re-run without first rebuilding the splits.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

# Exploratory Data Analysis