In [27]:
#imports
from pathlib import Path

#file paths
# Project root is taken to be the parent of the current working directory.
ROOT = Path.cwd().parent
# Raw and processed data folders under <root>/data.
RAW_DIR = ROOT/"data"/"raw"
PROCESSED_DIR = ROOT/"data"/"processed"

import sys, os
import pandas as pd
import numpy as np

sys.path.insert(0, os.path.abspath("..")) #preprocess at root
from preprocess import load_csv, save_csv

from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
print("Imports loaded")

Imports loaded


In [28]:
#1.1 load data
# The data was cleaned in previous assignments, so the cleaned file is loaded directly.
# Cleaned data contains:
#     NaN filled with 0
#     regex normalization (spaces, lowercase, etc.)
#     churn converted to binary (1=churn, 0=not)

df_clean = load_csv(PROCESSED_DIR, "data03_cleaned.csv")

# Re-convert totalcharges to numeric in case pandas inferred the dtype incorrectly from the csv.
# errors='coerce' turns unparseable entries into NaN instead of raising.
total_charges = pd.to_numeric(df_clean["totalcharges"], errors='coerce')
# Store the converted column back on the frame (otherwise the conversion has no effect on
# df_clean) and keep the "NaN filled with 0" convention for any value that was coerced.
df_clean["totalcharges"] = total_charges.fillna(0)

print("Data")
display(df_clean.head().T) #transpose rows/cols to make viewing easier
print("\nDataTypes:")
display(df_clean.dtypes)

Loaded data03_cleaned.csv: length: 7043 

Data


Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no



DataTypes:


customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [29]:
#1.2 separate the features 'X' from the target 'y' (churn=1 or not-churn=0). KMeans only applied to features
# Feature frame: everything except the row identifier and the target column.
df_features = df_clean.drop(columns=['customerid', 'churn'])

# Target vector as a plain array.
y = df_clean.churn.values

# Transposed preview so each feature is shown on its own row.
print("Features", df_features.head().transpose())

Features                                  0             1               2  \
gender                      female          male            male   
seniorcitizen                    0             0               0   
partner                        yes            no              no   
dependents                      no            no              no   
tenure                           1            34               2   
phoneservice                    no           yes             yes   
multiplelines     no_phone_service            no              no   
internetservice                dsl           dsl             dsl   
onlinesecurity                  no           yes             yes   
onlinebackup                   yes            no             yes   
deviceprotection                no           yes              no   
techsupport                     no            no              no   
streamingtv                     no            no              no   
streamingmovies                 no     

In [30]:
#1.3 list the object-dtype (categorical) feature columns
print("Object Types")
print(df_features.dtypes)

# Keep only the columns whose dtype is object and collect their names in a list.
cat_cols = list(df_features.select_dtypes(include='object').columns)
print("\nObject Columns:", cat_cols)

Object Types
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
dtype: object

Object Columns: ['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


In [31]:
#1.4 split data
# Shuffle, then hold out 20% of the rows for test; the remaining 80% is split again below.
X_train_full, X_test = train_test_split(df_clean, test_size=0.2, random_state=1)

# Train/validation split: 33% of the remaining rows go to validation.
# The frame split here must be the one produced on the line above (X_train_full);
# no other "train full" frame is defined in this notebook.
X_train, X_val = train_test_split(X_train_full, test_size=0.33, random_state=11)

# Every row of the cleaned data must land in exactly one of the three splits.
assert len(X_train) + len(X_val) + len(X_test) == len(df_clean)

print("df train")
display(X_train.head())
print("\ndf val")
display(X_val.head())
print("\ndf test")
display(X_test.head())

#confirm shapes
print("\nInitial Shapes:")
print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)

#save churn as target (for all three splits, so the test set can be scored later)
y_train = X_train["churn"].values
y_val = X_val["churn"].values
y_test = X_test["churn"].values

#remove the target and the row identifier - don't use for training
# .drop() returns new frames instead of deleting columns in place on the split results.
non_feature_cols = ["customerid", "churn"]
X_train = X_train.drop(columns=non_feature_cols)
X_val = X_val.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)

#confirm shapes after drop
print("\nFinal Shapes:")
print("Train:", X_train.shape)
print("Val:  ", X_val.shape)
print("Test: ", X_test.shape)

df train


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
2935,9435-jmlsx,male,0,yes,no,71,yes,no,dsl,yes,...,yes,yes,yes,yes,two_year,yes,bank_transfer_(automatic),86.1,6045.9,0
3639,0512-flfdw,female,1,yes,no,60,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),100.5,6029.0,0
2356,3450-wxoat,male,0,no,no,46,yes,no,dsl,no,...,no,no,no,no,month-to-month,yes,credit_card_(automatic),45.2,2065.15,0
6660,1447-giqmr,male,0,yes,no,1,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,69.15,69.15,1
755,6683-vlctz,male,1,no,no,20,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,98.55,1842.8,1



df val


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5805,1697-nvvgy,male,1,yes,no,19,yes,no,dsl,yes,...,yes,no,yes,no,month-to-month,no,bank_transfer_(automatic),66.4,1286.05,0
3687,6849-oyamu,male,0,yes,yes,19,yes,no,fiber_optic,no,...,no,yes,yes,yes,one_year,no,bank_transfer_(automatic),100.0,1888.65,1
6900,7921-lmdfq,male,1,no,no,51,yes,no,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,bank_transfer_(automatic),94.0,4905.75,0
1644,3331-hqdtw,female,0,no,no,34,yes,yes,fiber_optic,yes,...,yes,no,yes,yes,month-to-month,yes,mailed_check,109.8,3587.25,1
6975,1904-wajaa,female,0,yes,yes,24,yes,yes,dsl,yes,...,yes,no,no,no,two_year,no,electronic_check,64.35,1558.65,0



df test


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3381,8879-zkjof,female,0,no,no,41,yes,no,dsl,yes,...,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),79.85,3320.75,0
6180,0201-mibol,female,1,no,no,66,yes,yes,fiber_optic,yes,...,no,no,yes,yes,two_year,yes,bank_transfer_(automatic),102.4,6471.85,0
4829,1600-dilpe,female,0,no,no,12,yes,no,dsl,no,...,no,no,no,no,month-to-month,yes,bank_transfer_(automatic),45.0,524.35,0
3737,8601-qacrs,female,0,no,no,5,yes,yes,dsl,no,...,no,no,no,no,month-to-month,yes,mailed_check,50.6,249.95,1
4249,7919-zodzz,female,0,yes,yes,10,yes,no,dsl,no,...,yes,no,no,yes,one_year,yes,mailed_check,65.9,660.05,0



Initial Shapes:
Train: (3774, 21)
Val:   (1860, 21)
Test:  (1409, 21)

Final Shapes:
Train: (3774, 19)
Val:   (1860, 19)
Test:  (1409, 19)


In [32]:
#2.1 label-encode categorical columns
# Object-dtype columns that still need encoding. After this cell has run once they
# hold integer codes, so a re-run finds an empty list here.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(cat_cols)

# Dictionary storing one fitted label encoder per column.
# Only start a fresh dictionary when there is something to encode (or none exists yet);
# otherwise a re-run of this cell would throw away the encoders fitted the first time.
if cat_cols or "encoders" not in globals():
    encoders = {}

for col in cat_cols:
    enc = LabelEncoder() #new encoder for this column

    # Learn the categories from the training split only.
    enc.fit(X_train[col])

    # Apply the same category -> integer mapping to every split.
    # NOTE: LabelEncoder.transform raises ValueError for a category that was not
    # present in the training split.
    X_train[col] = enc.transform(X_train[col])
    X_val[col]   = enc.transform(X_val[col])
    X_test[col]  = enc.transform(X_test[col])

    encoders[col] = enc #save encoding

['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


In [35]:
#2.2 show the category -> integer mapping learned for each encoded column
for col in encoders:
    enc = encoders[col]
    print(f"\nColumn: {col}")

    # Run the known categories back through the encoder to get their integer codes.
    codes = enc.transform(enc.classes_)
    mapping = {category: code for category, code in zip(enc.classes_, codes)}
    print(mapping)


Column: gender
{'female': np.int64(0), 'male': np.int64(1)}

Column: partner
{'no': np.int64(0), 'yes': np.int64(1)}

Column: dependents
{'no': np.int64(0), 'yes': np.int64(1)}

Column: phoneservice
{'no': np.int64(0), 'yes': np.int64(1)}

Column: multiplelines
{'no': np.int64(0), 'no_phone_service': np.int64(1), 'yes': np.int64(2)}

Column: internetservice
{'dsl': np.int64(0), 'fiber_optic': np.int64(1), 'no': np.int64(2)}

Column: onlinesecurity
{'no': np.int64(0), 'no_internet_service': np.int64(1), 'yes': np.int64(2)}

Column: onlinebackup
{'no': np.int64(0), 'no_internet_service': np.int64(1), 'yes': np.int64(2)}

Column: deviceprotection
{'no': np.int64(0), 'no_internet_service': np.int64(1), 'yes': np.int64(2)}

Column: techsupport
{'no': np.int64(0), 'no_internet_service': np.int64(1), 'yes': np.int64(2)}

Column: streamingtv
{'no': np.int64(0), 'no_internet_service': np.int64(1), 'yes': np.int64(2)}

Column: streamingmovies
{'no': np.int64(0), 'no_internet_service': np.int64(

In [None]:
#3.1 tree
# Decision tree classifier; random_state is fixed so repeated runs build the same tree.
# DecisionTreeClassifier comes from sklearn.tree and must be imported in the imports cell.
dt = DecisionTreeClassifier(random_state=1)

# Fit on the training features and the churn target saved in step 1.4.
dt.fit(X_train, y_train)