In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from scipy import stats

import sklearn.model_selection
import sklearn.metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, recall_score, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay, f1_score

from xgboost import XGBClassifier

import wrangle as wr
import explore as ex
import model as m

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.options.display.max_columns = None

In [2]:
df = wr.get_telco_data()

In [3]:
df

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.60,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.90,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.90,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.00,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.90,267.4,Yes,Month-to-month,Fiber optic,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2,1,2,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,No,No,Yes,No,No,No,55.15,742.9,No,One year,DSL,Mailed check
7039,1,2,1,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,No,No,No,No,Yes,Yes,85.10,1873.7,Yes,Month-to-month,Fiber optic,Electronic check
7040,2,1,1,9992-UJOEL,Male,0,No,No,2,Yes,No,No,Yes,No,No,No,No,Yes,50.30,92.75,No,Month-to-month,DSL,Mailed check
7041,2,1,3,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,No,Yes,Yes,No,Yes,No,67.85,4627.65,No,Two year,DSL,Mailed check


In [4]:
df['payment_type'] = df['payment_type'].replace('Electronic check', '0')
df['payment_type'] = df['payment_type'].replace('Mailed check', '0')
df['payment_type'] = df['payment_type'].replace('Bank transfer (automatic)', '1')
df['payment_type'] = df['payment_type'].replace('Credit card (automatic)', '1')

In [5]:
# commented out so wont error if notebook is run from top
#df['payment_type'] = df['payment_type'].replace('manual', '0')
#df['payment_type'] = df['payment_type'].replace('auto', '1')

In [6]:
df['gender'] = df['gender'].replace('Male', 1)
df['gender'] = df['gender'].replace('Female', 0)

In [7]:
df = df.replace('Yes', 1)
df = df.replace('No', 0)

In [8]:
df['churn'] = df['churn'].astype(int)

In [9]:
df['payment_type'] = df['payment_type'].astype(int)

In [10]:
df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])

In [11]:
df['tenure']

0        9
1        9
2        4
3       13
4        3
        ..
7038    13
7039    22
7040     2
7041    67
7042    63
Name: tenure, Length: 7043, dtype: int64

In [12]:
df = df[df['total_charges'] != ' ']

In [13]:
df['total_charges'] = df['total_charges'].astype(float)

In [27]:
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,0,1,1,9,1,No,No,Yes,No,Yes,Yes,No,1,65.6,593.3,0,One year,DSL,0
1,1,0,0,0,9,1,Yes,No,No,No,No,No,Yes,0,59.9,542.4,0,Month-to-month,DSL,0
2,1,0,0,0,4,1,No,No,No,Yes,No,No,No,1,73.9,280.85,1,Month-to-month,Fiber optic,0
3,1,1,1,0,13,1,No,No,Yes,Yes,No,Yes,Yes,1,98.0,1237.85,1,Month-to-month,Fiber optic,0
4,0,1,1,0,3,1,No,No,No,No,Yes,Yes,No,1,83.9,267.4,1,Month-to-month,Fiber optic,0


In [15]:
df.dtypes

gender                     int64
senior_citizen             int64
partner                    int64
dependents                 int64
tenure                     int64
phone_service              int64
multiple_lines            object
online_security           object
online_backup             object
device_protection         object
tech_support              object
streaming_tv              object
streaming_movies          object
paperless_billing          int64
monthly_charges          float64
total_charges            float64
churn                      int64
contract_type             object
internet_service_type     object
payment_type               int64
dtype: object

In [16]:
for col in df.columns:
    if df[col].dtypes == 'object':
        df[col] = df[col].replace(1, 'Yes')
        df[col] = df[col].replace(0, 'No')

In [24]:
num_cols = []
cat_cols = []

for col in df.columns:
    
    if df[col].dtypes == 'object':
        cat_cols.append([col])
    else:
        num_cols.append([col])

In [25]:
num_cols

[['gender'],
 ['senior_citizen'],
 ['partner'],
 ['dependents'],
 ['tenure'],
 ['phone_service'],
 ['paperless_billing'],
 ['monthly_charges'],
 ['total_charges'],
 ['churn'],
 ['payment_type']]

In [26]:
cat_cols

[['multiple_lines'],
 ['online_security'],
 ['online_backup'],
 ['device_protection'],
 ['tech_support'],
 ['streaming_tv'],
 ['streaming_movies'],
 ['contract_type'],
 ['internet_service_type']]

In [33]:
def prep_telco_data():
    '''
    
    '''
    # Collects the dataframe
    df = wr.get_telco_data()
    
    # Creating empty lists for numerical and categorical columns
    cat_cols = []
    num_cols = []
    
    # Changing all of the yes's and no's to numerical values 
    df = df.replace('Yes', 1)
    df = df.replace('No', 0)
    
    # For if you want to payment types to be categorical
    # df['payment_type'] = df['payment_type'].replace('Electronic check', 'manual')
    # df['payment_type'] = df['payment_type'].replace('Mailed check', 'manual')
    # df['payment_type'] = df['payment_type'].replace('Bank transfer (automatic)', 'auto')
    # df['payment_type'] = df['payment_type'].replace('Credit card (automatic)', 'auto')
    
    # Setting all of the payments to 0 for manual payments and 1 for automatic payments
    df['payment_type'] = df['payment_type'].replace('Electronic check', '0')
    df['payment_type'] = df['payment_type'].replace('Mailed check', '0')
    df['payment_type'] = df['payment_type'].replace('Bank transfer (automatic)', '1')
    df['payment_type'] = df['payment_type'].replace('Credit card (automatic)', '1')
    # Changing the gender columns to numerical values
    df['gender'] = df['gender'].replace('Male', 1)
    df['gender'] = df['gender'].replace('Female', 0)
    # Making the column to a numerical dtype
    df['payment_type'] = df['payment_type'].astype(int)
    # Setting the churn column to a numerical
    df['churn'] = df['churn'].astype(int)
        
    # Dropping unneeded columns
    df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])
    # Removing the rows where the total charges were blank
    # tenure was at 0, so I can only imagine customer canceled account before being charged
    df = df[df['total_charges'] != ' ']
    # Converting dtype to float
    df['total_charges'] = df['total_charges'].astype(float)
    
    # Changing back the 1s and 0s in the categorical columns to yes's and no's
    for col in df.columns:
        if df[col].dtypes == 'object':
            df[col] = df[col].replace(1, 'Yes')
            df[col] = df[col].replace(0, 'No')

    # Loop to get numericl and categorical columns
    for col in df.columns:
    
        if df[col].dtypes == 'object':
            cat_cols.append(col)
        else:
            num_cols.append(col)
            
    return df, cat_cols, num_cols
    

In [48]:
df, cat_cols, num_cols = prep_telco_data()

In [49]:
cat_cols

['multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'contract_type',
 'internet_service_type']

In [50]:
dummies = pd.get_dummies(df[cat_cols], drop_first=True)
df = df.drop(columns=cat_cols)
df = pd.concat([df, dummies], axis=1)
df

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,paperless_billing,monthly_charges,total_charges,churn,payment_type,multiple_lines_No phone service,multiple_lines_Yes,online_security_No internet service,online_security_Yes,online_backup_No internet service,online_backup_Yes,device_protection_No internet service,device_protection_Yes,tech_support_No internet service,tech_support_Yes,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None
0,0,0,1,1,9,1,1,65.60,593.30,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0
1,1,0,0,0,9,1,0,59.90,542.40,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,4,1,1,73.90,280.85,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,1,1,1,0,13,1,1,98.00,1237.85,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0
4,0,1,1,0,3,1,1,83.90,267.40,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,0,0,13,1,0,55.15,742.90,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0
7039,1,0,1,0,22,1,1,85.10,1873.70,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
7040,1,0,0,0,2,1,1,50.30,92.75,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
7041,1,0,1,1,67,1,0,67.85,4627.65,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0


In [52]:
def create_dummies(df, cols):
    '''
    This function will, quite simply, create dummy variables for a dataframe, remove the columns used to create them, 
    and then concat the dummies back onto the dataframe
    '''
    # This will create the dummy variables from our categorical columns list
    dummies = pd.get_dummies(df[cols], drop_first=True)
    # This will drop the original categorical columns from the df
    df = df.drop(columns=cols)
    # This will concatenate the dummies onto the current df
    df = pd.concat([df, dummies], axis=1)
            
    return df