In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from scipy import stats

import sklearn.model_selection
import sklearn.metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, recall_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from xgboost import XGBClassifier

import wrangle as wr
import explore as ex
import model as m

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.options.display.max_columns = None

In [3]:
df = wr.get_telco_data()

In [4]:
df

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,3,1,3,2162-FRZAA,Male,0,Yes,Yes,63,No,No phone service,No,Yes,Yes,Yes,No,No,No,39.35,2395.05,No,Two year,DSL,Bank transfer (automatic)
1,4,1,3,2160-GPFXD,Male,0,Yes,Yes,54,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,65.65,3566.7,No,Two year,DSL,Credit card (automatic)
2,2,1,2,2157-MXBJS,Male,0,Yes,No,13,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,75.30,989.45,Yes,One year,DSL,Mailed check
3,4,1,1,2155-AMQRX,Female,0,No,No,28,Yes,Yes,No,No,No,Yes,No,No,Yes,54.90,1505.15,No,Month-to-month,DSL,Credit card (automatic)
4,3,1,2,2150-WLKUW,Female,0,Yes,No,40,Yes,Yes,No,Yes,No,No,Yes,No,No,63.90,2635,No,One year,DSL,Bank transfer (automatic)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2,3,1,8942-DBMHZ,Male,0,No,No,12,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.45,255.35,No,Month-to-month,,Mailed check
7039,2,3,3,8922-NPKBJ,Male,0,Yes,Yes,42,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.35,867.3,No,Two year,,Mailed check
7040,4,3,3,8917-SZTTJ,Male,0,Yes,Yes,60,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.65,1161.75,No,Two year,,Credit card (automatic)
7041,2,3,3,8917-FAEMR,Female,0,No,No,37,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.85,784.25,No,Two year,,Mailed check


In [5]:
dfdf['churn']

0        No
1        No
2       Yes
3        No
4        No
       ... 
7038     No
7039     No
7040     No
7041     No
7042     No
Name: churn, Length: 7043, dtype: object

# Cleaning the data and creating a function for further use

In [4]:
df['payment_type'] = df['payment_type'].replace('Electronic check', '0')
df['payment_type'] = df['payment_type'].replace('Mailed check', '0')
df['payment_type'] = df['payment_type'].replace('Bank transfer (automatic)', '1')
df['payment_type'] = df['payment_type'].replace('Credit card (automatic)', '1')

In [5]:
# commented out so wont error if notebook is run from top
#df['payment_type'] = df['payment_type'].replace('manual', '0')
#df['payment_type'] = df['payment_type'].replace('auto', '1')

In [6]:
df['gender'] = df['gender'].replace('Male', 1)
df['gender'] = df['gender'].replace('Female', 0)

In [7]:
df = df.replace('Yes', 1)
df = df.replace('No', 0)

In [8]:
df['churn'] = df['churn'].astype(int)

In [9]:
df['payment_type'] = df['payment_type'].astype(int)

In [10]:
df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])

In [11]:
df['tenure']

0        9
1        9
2        4
3       13
4        3
        ..
7038    13
7039    22
7040     2
7041    67
7042    63
Name: tenure, Length: 7043, dtype: int64

In [12]:
df = df[df['total_charges'] != ' ']

In [13]:
df['total_charges'] = df['total_charges'].astype(float)

In [27]:
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,0,1,1,9,1,No,No,Yes,No,Yes,Yes,No,1,65.6,593.3,0,One year,DSL,0
1,1,0,0,0,9,1,Yes,No,No,No,No,No,Yes,0,59.9,542.4,0,Month-to-month,DSL,0
2,1,0,0,0,4,1,No,No,No,Yes,No,No,No,1,73.9,280.85,1,Month-to-month,Fiber optic,0
3,1,1,1,0,13,1,No,No,Yes,Yes,No,Yes,Yes,1,98.0,1237.85,1,Month-to-month,Fiber optic,0
4,0,1,1,0,3,1,No,No,No,No,Yes,Yes,No,1,83.9,267.4,1,Month-to-month,Fiber optic,0


In [15]:
df.dtypes

gender                     int64
senior_citizen             int64
partner                    int64
dependents                 int64
tenure                     int64
phone_service              int64
multiple_lines            object
online_security           object
online_backup             object
device_protection         object
tech_support              object
streaming_tv              object
streaming_movies          object
paperless_billing          int64
monthly_charges          float64
total_charges            float64
churn                      int64
contract_type             object
internet_service_type     object
payment_type               int64
dtype: object

In [16]:
for col in df.columns:
    if df[col].dtypes == 'object':
        df[col] = df[col].replace(1, 'Yes')
        df[col] = df[col].replace(0, 'No')

In [24]:
num_cols = []
cat_cols = []

for col in df.columns:
    
    if df[col].dtypes == 'object':
        cat_cols.append([col])
    else:
        num_cols.append([col])

In [25]:
num_cols

[['gender'],
 ['senior_citizen'],
 ['partner'],
 ['dependents'],
 ['tenure'],
 ['phone_service'],
 ['paperless_billing'],
 ['monthly_charges'],
 ['total_charges'],
 ['churn'],
 ['payment_type']]

In [26]:
cat_cols

[['multiple_lines'],
 ['online_security'],
 ['online_backup'],
 ['device_protection'],
 ['tech_support'],
 ['streaming_tv'],
 ['streaming_movies'],
 ['contract_type'],
 ['internet_service_type']]

In [2]:
def prep_telco_data():
    '''
    
    '''
    # Collects the dataframe
    df = wr.get_telco_data()
    
    # Creating empty lists for numerical and categorical columns
    cat_cols = []
    num_cols = []
    
    # Changing all of the yes's and no's to numerical values 
    df = df.replace('Yes', 1)
    df = df.replace('No', 0)
    
    # For if you want to payment types to be categorical
    # df['payment_type'] = df['payment_type'].replace('Electronic check', 'manual')
    # df['payment_type'] = df['payment_type'].replace('Mailed check', 'manual')
    # df['payment_type'] = df['payment_type'].replace('Bank transfer (automatic)', 'auto')
    # df['payment_type'] = df['payment_type'].replace('Credit card (automatic)', 'auto')
    
    # Setting all of the payments to 0 for manual payments and 1 for automatic payments
    df['payment_type'] = df['payment_type'].replace('Electronic check', '0')
    df['payment_type'] = df['payment_type'].replace('Mailed check', '0')
    df['payment_type'] = df['payment_type'].replace('Bank transfer (automatic)', '1')
    df['payment_type'] = df['payment_type'].replace('Credit card (automatic)', '1')
    # Changing the gender columns to numerical values
    df['gender'] = df['gender'].replace('Male', 1)
    df['gender'] = df['gender'].replace('Female', 0)
    # Making the column to a numerical dtype
    df['payment_type'] = df['payment_type'].astype(int)
    # Setting the churn column to a numerical
    df['churn'] = df['churn'].astype(int)
        
    # Dropping unneeded columns
    df = df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'])
    # Removing the rows where the total charges were blank
    # tenure was at 0, so I can only imagine customer canceled account before being charged
    df = df[df['total_charges'] != ' ']
    # Converting dtype to float
    df['total_charges'] = df['total_charges'].astype(float)
    
    # Changing back the 1s and 0s in the categorical columns to yes's and no's
    for col in df.columns:
        if df[col].dtypes == 'object':
            df[col] = df[col].replace(1, 'Yes')
            df[col] = df[col].replace(0, 'No')

    # Loop to get numericl and categorical columns
    for col in df.columns:
    
        if df[col].dtypes == 'object':
            cat_cols.append(col)
        else:
            num_cols.append(col)
            
    return df, cat_cols, num_cols
    

In [2]:
df, cat_cols, num_cols = wr.prep_telco_data()

In [3]:
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,1,No,Yes,Yes,63,No,No phone service,No,Yes,Yes,Yes,No,No,No,39.35,2395.05,0,Two year,DSL,auto
1,1,No,Yes,Yes,54,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,65.65,3566.7,0,Two year,DSL,auto
2,1,No,Yes,No,13,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes,75.3,989.45,1,One year,DSL,manual
3,0,No,No,No,28,Yes,Yes,No,No,No,Yes,No,No,Yes,54.9,1505.15,0,Month-to-month,DSL,auto
4,0,No,Yes,No,40,Yes,Yes,No,Yes,No,No,Yes,No,No,63.9,2635.0,0,One year,DSL,auto


In [8]:
#df['senior_citizen'] = df['senior_citizen'].replace(0, 'No')
#df['senior_citizen'] = df['senior_citizen'].replace(1, 'Yes')

In [4]:
cat_cols

['senior_citizen',
 'partner',
 'dependents',
 'phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'contract_type',
 'internet_service_type',
 'payment_type']

In [5]:
num_cols

['tenure', 'monthly_charges', 'total_charges']

# Creating a function to create dummies

In [5]:
dummies = pd.get_dummies(df[cat_cols], drop_first=True)
df = df.drop(columns=cat_cols)
df = pd.concat([df, dummies], axis=1)
df

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,paperless_billing,monthly_charges,total_charges,churn,payment_type,multiple_lines_No phone service,multiple_lines_Yes,online_security_No internet service,online_security_Yes,online_backup_No internet service,online_backup_Yes,device_protection_No internet service,device_protection_Yes,tech_support_No internet service,tech_support_Yes,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None
0,1,0,1,1,63,0,0,39.35,2395.05,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0
1,1,0,1,1,54,1,1,65.65,3566.70,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0
2,1,0,1,0,13,1,1,75.30,989.45,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0
3,0,0,0,0,28,1,1,54.90,1505.15,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,40,1,0,63.90,2635.00,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,0,0,12,1,0,20.45,255.35,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1
7039,1,0,1,1,42,1,0,19.35,867.30,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1
7040,1,0,1,1,60,1,0,19.65,1161.75,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1
7041,0,0,0,0,37,1,0,19.85,784.25,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1


In [52]:
def create_dummies(df, cols):
    '''
    This function will, quite simply, create dummy variables for a dataframe, remove the columns used to create them, 
    and then concat the dummies back onto the dataframe
    '''
    # This will create the dummy variables from our categorical columns list
    dummies = pd.get_dummies(df[cols], drop_first=True)
    # This will drop the original categorical columns from the df
    df = df.drop(columns=cols)
    # This will concatenate the dummies onto the current df
    df = pd.concat([df, dummies], axis=1)
            
    return df

# Function to make a train/test or a train/validate/test split

In [7]:
 def df_splits(df, col, val='Yes', strat='Yes', seed=42):
    '''
    This function takes in a dataframe and a taget column, as well as several optional arguments. You can decide if you want a validate set and if you want to stratify. 
    By leaving those two variables alone you will stratify on the target column and get a validate subset. If you change them to anything when calling the function,
    you will not get a validate or stratification. There is also an argument for the seed, which is set to 42 by default.
    '''
    # If val is left alone at 'Yes', the function will run this loop and return a validate subset.
    if val == 'Yes':
        # If strat is left alone at 'Yes', the function will stratify by the column named when calling the function.
        if strat == 'Yes':
            # Train, validate, and test subsets created.
            train, val_test = train_test_split(df, train_size=.6, random_state=seed, stratify=df[col])
            validate, test = train_test_split(val_test, train_size=.6, random_state=seed, stratify=val_test[col])
            # Printing the shapes of each subset 
            print(train.shape, validate.shape, test.shape)
            return train, validate, test
        # If the strat argument is changed at all, it will default to not doing it and not stratify when splitting.
        else:
            #Splitting the data into train, validate, and test.
            train, val_test = train_test_split(df, train_size=.6, random_state=seed)
            validate, test = train_test_split(val_test, train_size=.6, random_state=seed)
            # Again, printing the subset
            print(train.shape, validate.shape, test.shape)
            return train, validate, test
    # This part of the loop is for if you changed the val argument to something other than 'Yes', which will make it not create a validate subset. 
    else:
        # If strat is left at 'Yes', the function will stratify on the column named when calling the function
        if strat == 'Yes':
            # Splitting the data into train and test subsets
            train, test = train_test_split(df, train_size=.8, random_state=seed, stratify=df[col])
            # Printing the shapes
            print(train.shape, test.shape)
            return train, test

        else:
            # Splitting the data into train and test
            train, test = train_test_split(df, train_size=.8, random_state=seed)
            # Printing the shapes
            print(train.shape, test.shape)
            return train, test

In [8]:
train, validate, test = df_splits(df, 'churn')

(4219, 20) (1687, 20) (1126, 20)


In [23]:
train, test = df_splits(df, 'churn', val='No')

(5625, 29) (1407, 29)


In [24]:
train, validate, test = df_splits(df, 'churn', val='Yes', strat='No')

(4219, 29) (1687, 29) (1126, 29)


In [27]:
train, test = df_splits(df, 'churn', val='meow', strat='hi')

(5625, 29) (1407, 29)


# Function(s) to create x/y subsets

In [9]:
def Xy_validate(validate, target):
    '''
    This function will take in a validate df and target column from that df and create X and y subsets.
    '''  
    # Creating the validate subsets 
    X_val = validate.drop(columns=target)
    y_val = validate[target]
 
    # Resetting indexes
    X_val = X_val.reset_index(drop= True)
    y_val = y_val.reset_index(drop= True)
        
    return X_val, y_val


In [10]:
def Xy_train_test(train, test, target):
    '''
    This function will take in a train and test df as well as the target column for each and create X and y subsets.
    '''
    # creating train and test (x and y) subsets
    X_train = train.drop(columns= target)
    y_train = train[target]
        
    # creating train and test (x and y) subsets
    X_test = test.drop(columns= target)
    y_test = test[target]
    
    # Resetting indexes
    X_train = X_train.reset_index(drop= True)
    y_train = y_train.reset_index(drop= True)
    X_test = X_test.reset_index(drop= True)
    y_test = y_test.reset_index(drop= True)
        
    return X_train, y_train, X_test, y_test

In [11]:
def subsets(train, test, validate, target):
    '''
    This function will call the two functions to create validate subsets and the train/test subsets in one 
    to streamline the creation of all three subsets at once.
    '''
    X_val, y_val = Xy_validate(validate, target)
    
    X_train, y_train, X_test, y_test = Xy_train_test(train, test, target)
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [12]:
X_train, y_train, X_val, y_val, X_test, y_test = subsets(train, test, validate, 'churn')

# Function(s) to scale data

In [5]:
cat_cols, num_cols

(['multiple_lines',
  'online_security',
  'online_backup',
  'device_protection',
  'tech_support',
  'streaming_tv',
  'streaming_movies',
  'contract_type',
  'internet_service_type'],
 ['gender',
  'senior_citizen',
  'partner',
  'dependents',
  'tenure',
  'phone_service',
  'paperless_billing',
  'monthly_charges',
  'total_charges',
  'churn',
  'payment_type'])

In [6]:
X_train

NameError: name 'X_train' is not defined

In [28]:
num_cols.remove('churn')

num_cols

['gender',
 'senior_citizen',
 'partner',
 'dependents',
 'tenure',
 'phone_service',
 'paperless_billing',
 'monthly_charges',
 'total_charges',
 'payment_type']

In [29]:
X_train_nums = X_train[num_cols]

In [30]:
X_train_nums

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,paperless_billing,monthly_charges,total_charges,payment_type
0,1,0,0,0,69,1,1,110.05,7430.75,1
1,0,0,1,0,71,1,1,25.95,1801.90,1
2,0,0,1,1,15,1,0,76.50,1155.60,1
3,0,0,1,1,5,1,1,24.30,132.25,0
4,0,1,0,0,4,1,1,88.85,372.45,0
...,...,...,...,...,...,...,...,...,...,...
4214,1,0,1,1,17,0,0,44.45,792.15,1
4215,0,0,1,1,7,1,1,69.00,506.90,0
4216,1,0,0,0,2,1,1,80.20,181.10,0
4217,1,0,1,0,71,1,1,82.55,5832.65,0


In [None]:
def scale_data(num_cols, X_train, X_test, scale):
    '''
    
    '''
    # Subsetting our numerical columns
    X_train_nums = X_train[num_cols]
    # Can take any of these: StandardScaler, RobustScaler, MinMaxScaler
    scaler = scale()
    # We only use the.fit method with our training data
    # Fitting the data
    scaler.fit(X_train_nums)
    # Transforming the data
    X_train_scaled = scaler.transform(X_train[num_cols])
    X_test_Scaled = scaler.transform(X_test[num_cols])
    # Creating dataframes
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=[num_cols])
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=[num_cols])
    # 

In [None]:
def scaling(X_train, X_test):
    
    '''
    Input: A X_train and X_test dataframe.
    Output: X_train_scaled, X_test_scaled dataframe, where the train and test are scaled.
    
    This function takes X_train and X_test and creates a scaler object with X_train(fit). It then transforms specific numeric columns on the dataframes, adds ['neg','neutral','pos','compound'] columns from their respective X_train and X_test.
    
    '''
    
    
    # create a subset of numerical column
    xtrainnums = X_train[['review_count','number_of_ratings','length','rating']]
    
    number_list = ['review_count','number_of_ratings','length','rating']

    # Note that we only call .fit with the training data
    scaler = sklearn.preprocessing.StandardScaler()
    
    # fit training data to scaler, not transforming here
    scaler.fit(xtrainnums)
    
    # transform the numerical values that we want based on the trained fit scaler
    X_train_scaled = scaler.transform(X_train[number_list])
    X_test_scaled = scaler.transform(X_test[number_list])
    
    # create a dataframe
    X_train_scaled = pd.DataFrame(X_train_scaled, columns= [number_list])
    X_test_scaled = pd.DataFrame(X_test_scaled, columns= [number_list])
    
    
    # add the 'neg','neutral','pos','compound' from x_train to the scaled data. reset
    X_train_scaled[['neg','neutral','pos','compound']] = X_train[['neg','neutral','pos','compound']].reset_index(drop = True)
    X_test_scaled[['neg','neutral','pos','compound']] = X_test[['neg','neutral','pos','compound']].reset_index(drop = True)

    # create a list of the dummies 
    dummies = X_train.columns.tolist()[11:]
    
    # add dummies to dataframe
    X_train_scaled = pd.concat([X_train_scaled, X_train[dummies]],axis = 1 )
    X_test_scaled = pd.concat([X_test_scaled, X_test[dummies]],axis = 1 )
    
    return X_train_scaled, X_test_scaled