In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score, f1_score
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from sklearn.linear_model import LogisticRegression

In [4]:
pd.set_option("display.max_columns", 25)

In [6]:
data = pd.read_csv("../data/input_data/telco_customer_churn_1.csv")

In [7]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
data.isnull().count()

customerID          6000
gender              6000
SeniorCitizen       6000
Partner             6000
Dependents          6000
tenure              6000
PhoneService        6000
MultipleLines       6000
InternetService     6000
OnlineSecurity      6000
OnlineBackup        6000
DeviceProtection    6000
TechSupport         6000
StreamingTV         6000
StreamingMovies     6000
Contract            6000
PaperlessBilling    6000
PaymentMethod       6000
MonthlyCharges      6000
TotalCharges        6000
Churn               6000
dtype: int64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        6000 non-null   object 
 1   gender            6000 non-null   object 
 2   SeniorCitizen     6000 non-null   int64  
 3   Partner           6000 non-null   object 
 4   Dependents        6000 non-null   object 
 5   tenure            6000 non-null   int64  
 6   PhoneService      6000 non-null   object 
 7   MultipleLines     6000 non-null   object 
 8   InternetService   6000 non-null   object 
 9   OnlineSecurity    6000 non-null   object 
 10  OnlineBackup      6000 non-null   object 
 11  DeviceProtection  6000 non-null   object 
 12  TechSupport       6000 non-null   object 
 13  StreamingTV       6000 non-null   object 
 14  StreamingMovies   6000 non-null   object 
 15  Contract          6000 non-null   object 
 16  PaperlessBilling  6000 non-null   object 


In [24]:
data['TotalCharges'].isnull().values.any()

False

In [28]:
nan_rows = data[data['TotalCharges'].isnull()]

In [30]:
data.isnull().T.any().T.sum()

0

In [31]:
data.isnull().any().any()

False

In [34]:
data.isna().values.any()

False

## Train - Test Split.

In [42]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['customerID', 'Churn'], axis=1),
                                data['Churn'], test_size=0.2, random_state=0)

## Categorical and Numerical Features

In [68]:
cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'object']
num_vars = [var for var in X_train.columns if var not in cat_vars]

In [67]:
cat_vars

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges']

In [69]:
num_vars

['SeniorCitizen', 'tenure', 'MonthlyCharges']

## Categorical features

In [70]:
cat_vars_onehot = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
cat_vars_ordinal_arbitrary = ['MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport','StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod']

ordinal_encoder_arbitrary = OrdinalEncoder(encoding_method='arbitrary', variables=cat_vars_ordinal_arbitrary)
ordinal_encoder_arbitrary.fit(X_train, y_train)

onehot_encoder = OneHotEncoder(variables=cat_vars_onehot)
onehot_encoder.fit(X_train)

In [72]:
X_train = ordinal_encoder_arbitrary.transform(X_train)
X_test =  ordinal_encoder_arbitrary.transform(X_test)



In [73]:
X_train = onehot_encoder.transform(X_train)
X_test  = onehot_encoder.transform(X_test)

## Numerical Features

In [74]:
yeo_transformer = YeoJohnsonTransformer(variables=['TotalCharges'])

X_train = yeo_transformer.fit_transform(X_train)
X_test = yeo_transformer.transform(X_test)

TypeError: Some of the variables are not numerical. Please cast them as numerical before using this transformer.