In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import env
import wrangle as w
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [37]:
# Acquire the Telco churn data through the project-local `wrangle` module (imported as `w`).
df = w.wrangle_telco_data()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender                    7043 non-null   object 
 1   senior_citizen            7043 non-null   int64  
 2   partner                   7043 non-null   object 
 3   dependents                7043 non-null   object 
 4   tenure                    7043 non-null   int64  
 5   phone_service             7043 non-null   object 
 6   multiple_lines            7043 non-null   object 
 7   internet_service_type_id  7043 non-null   int64  
 8   tech_support              7043 non-null   object 
 9   contract_type_id          7043 non-null   int64  
 10  payment_type_id           7043 non-null   int64  
 11  monthly_charges           7043 non-null   float64
 12  total_charges             7032 non-null   float64
 13  churn                     7043 non-null   object 
dtypes: float

In [4]:
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service_type_id,tech_support,contract_type_id,payment_type_id,monthly_charges,total_charges,churn
0,Female,0,Yes,Yes,9,Yes,No,1,Yes,2,2,65.6,593.3,No
1,Male,0,No,No,9,Yes,Yes,1,No,1,2,59.9,542.4,No
2,Male,0,No,No,4,Yes,No,2,No,1,1,73.9,280.85,Yes
3,Male,1,Yes,No,13,Yes,No,2,No,1,1,98.0,1237.85,Yes
4,Female,1,Yes,No,3,Yes,No,2,Yes,1,2,83.9,267.4,Yes


In [54]:
# Two-column view (senior_citizen, churn) of the wrangled frame.
subset1 = df[['senior_citizen', 'churn']]

In [57]:
# Contingency table: rows are senior_citizen values, columns are churn values.
pd.crosstab(subset1.senior_citizen, subset1.churn)

churn,No,Yes
senior_citizen,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4508,1393
1,666,476


In [58]:
# Two-column view pairing payment_type_id with churn.
subset2 = df[['payment_type_id', 'churn']]

In [59]:
# Display the subset; pandas truncates the rendered rows.
subset2

Unnamed: 0,payment_type_id,churn
0,2,No
1,2,No
2,1,Yes
3,1,Yes
4,2,Yes
...,...,...
7038,2,No
7039,1,Yes
7040,2,No
7041,2,No


In [None]:
# TODO(review): stray column-name list (was not valid Python), kept as a note:
# online_security, online_backup, device_protection, streaming_tv, streaming_movies, paperless_billing

In [7]:
# Read telco_churn.csv (path relative to the working directory) into a separate frame.
tdf = pd.read_csv('telco_churn.csv')

In [9]:
tdf.shape

(7043, 22)

Let's use a chi-square test to see whether churn and senior-citizen status are related.

I will use a confidence level of 95%.
The resulting alpha is .05.

Ho: 'churn' and 'senior_citizen' are independent of one another.
Ha: 'churn' and 'senior_citizen' are related.

In [None]:
$H_0$ (Null Hypothesis): class is independent of cylinders.

$H_a$ (Alternative Hypothesis): class is dependent on cylinders.

In [38]:
# Count rows by whether total_charges is missing (True) or present (False).
# NOTE(review): the output below lists only False (7032 rows), while the earlier
# df.info() output reports 7043 rows with 11 missing total_charges -- the two
# outputs appear to come from different executions; re-run top to bottom to confirm.
df.total_charges.isna().value_counts()

False    7032
Name: total_charges, dtype: int64

In [11]:
# Drop every row containing a missing value; the result is kept in a new frame.
df1 = df.dropna()

In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender                    7032 non-null   object 
 1   senior_citizen            7032 non-null   int64  
 2   partner                   7032 non-null   object 
 3   dependents                7032 non-null   object 
 4   tenure                    7032 non-null   int64  
 5   phone_service             7032 non-null   object 
 6   multiple_lines            7032 non-null   object 
 7   internet_service_type_id  7032 non-null   int64  
 8   tech_support              7032 non-null   object 
 9   contract_type_id          7032 non-null   int64  
 10  payment_type_id           7032 non-null   int64  
 11  monthly_charges           7032 non-null   float64
 12  total_charges             7032 non-null   float64
 13  churn                     7032 non-null   object 
dtypes: float

In [39]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    Split a dataframe into train, validate and test sets, stratified on the
    target column.

    Test is 20% of the original dataset, validate is .30*.80 = 24% of the
    original dataset, and train is .70*.80 = 56% of the original dataset.

    Parameters
    ----------
    df : the dataframe to split.
    target : name of the target column; its values are used for
        stratification in both splits.
    seed : integer passed as `random_state` to both splits so the result is
        reproducible. (Earlier versions ignored this argument and always
        used 828; pass seed=828 to reproduce those splits.)

    Returns
    -------
    train, validate, test dataframes, in that order.
    '''
    # First carve off the 20% hold-out test set.
    train_validate, test = train_test_split(df, test_size=0.2,
                                            random_state=seed,
                                            stratify=df[target])
    # Then split the remaining 80% into 70% train / 30% validate.
    train, validate = train_test_split(train_validate, test_size=0.3,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test


In [40]:
# Stratified three-way split on the churn target.
train, validate, test = train_validate_test_split(df, target='churn', seed=123)

# For each split, X holds every column except the target and y is the churn Series.
X_train, y_train = train.drop(columns='churn'), train['churn']
X_validate, y_validate = validate.drop(columns='churn'), validate['churn']
X_test, y_test = test.drop(columns='churn'), test['churn']


In [46]:
len(y_train)

3937

In [41]:
# Restrict every split to the target plus the three modelling features.
keep_cols = ['churn', 'monthly_charges',
             'senior_citizen', 'tech_support']
train = train[keep_cols]
validate = validate[keep_cols]
test = test[keep_cols]


def _split_features_target(split):
    '''
    Separate one split into predictors (X) and target (y), both with a fresh
    0..n-1 index, applying the same encoding to every split.

    - tech_support: 1 where the value is 'Yes', 0 for anything else.
    - senior_citizen: left untouched. It is already an int64 0/1 flag (see the
      df.info() / df.head() output), and the previous `== 'Yes'` mapping
      turned every value into 0.
    - churn: relabelled to "churn" / "no-churn".
    '''
    features = split.drop(columns='churn').reset_index(drop=True)
    features['tech_support'] = features['tech_support'].eq('Yes').astype(int)

    labels = split[['churn']].reset_index(drop=True)
    labels['churn'] = labels['churn'].eq('Yes').map({True: "churn", False: "no-churn"})
    return features, labels


# Encode all three splits identically so validate/test match what the model
# is trained on (previously only train_X was encoded).
train_X, train_y = _split_features_target(train)
validate_X, validate_y = _split_features_target(validate)
test_X, test_y = _split_features_target(test)


In [47]:
len(y_train)

3937