In [1]:
import seaborn as sns
sns.set()

In [2]:
import pandas as pd

df_telco = pd.read_csv('Telco-Customer-Churn.csv')

df_telco.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,no,Yes,No,71,Yes,Yes,Fiber optic,Yes,No,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),104.1,7412.25,0.0
1,Female,no,Yes,Yes,43,Yes,Yes,Fiber optic,No,No,Yes,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),92.55,4039.0,0.0
2,Female,yes,No,No,2,Yes,Yes,Fiber optic,No,No,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,93.85,170.85,1.0
3,Male,yes,No,No,29,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),101.45,2948.6,0.0
4,Female,no,Yes,No,15,Yes,No,Fiber optic,No,No,Yes,No,Yes,No,Month-to-month,Yes,Electronic check,84.3,1308.4,1.0


In [3]:
df_telco.shape

(5635, 20)

In [4]:
df_telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5635 entries, 0 to 5634
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5635 non-null   object 
 1   SeniorCitizen     5635 non-null   object 
 2   Partner           5635 non-null   object 
 3   Dependents        5635 non-null   object 
 4   tenure            5635 non-null   int64  
 5   PhoneService      5635 non-null   object 
 6   MultipleLines     5635 non-null   object 
 7   InternetService   5635 non-null   object 
 8   OnlineSecurity    5635 non-null   object 
 9   OnlineBackup      5635 non-null   object 
 10  DeviceProtection  5635 non-null   object 
 11  TechSupport       5635 non-null   object 
 12  StreamingTV       5635 non-null   object 
 13  StreamingMovies   5635 non-null   object 
 14  Contract          5635 non-null   object 
 15  PaperlessBilling  5635 non-null   object 
 16  PaymentMethod     5635 non-null   object 


### Data Cleaning

In [5]:
numerical_features = ['MonthlyCharges', 'tenure', 'TotalCharges']

# first drop rows with missing values from df_telco
df_telco1= df_telco.dropna(subset=['TotalCharges'])
# then set X to have only numerical features and y to be the churn column
X = df_telco1[numerical_features]
y = df_telco1['Churn']

In [6]:
X.shape

(5629, 3)

In [7]:
y.shape

(5629,)

### Logistic Model Pipeline (Numerical features only)

Use the numerical columns to predict whether a customer churns or not

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

lr_pipe = Pipeline([('scaler', StandardScaler()),
                    ('lr', LogisticRegression())])
lr_numeric = lr_pipe.fit(X, y)

In [9]:
lr_numeric.predict

<function sklearn.pipeline.Pipeline.predict(self, X, **predict_params)>

Show coefficients

In [10]:
coefs = lr_pipe.named_steps['lr'].coef_

In [11]:
coefs

array([[ 0.92209216, -1.59295954,  0.28720095]])

### Logistic Model Pipeline (Categorical features only)

In [12]:
columns_to_drop = ['MonthlyCharges', 'tenure', 'TotalCharges']

# drop numeric fatures and drop rows with missing values from df_telco
df_telco2 = df_telco.drop(columns=columns_to_drop)
df_telco2 = df_telco2.dropna()
# set X to have only categorical features and y to be the churn column
X = df_telco2.drop(columns=['Churn'])
y = df_telco2['Churn']

In [13]:
X.shape

(5635, 16)

In [14]:
y.shape

(5635,)

### OneHotEncoder to transform categorical features

In [15]:
from sklearn.preprocessing import OneHotEncoder

lr_pipe1 = Pipeline([('ohe', OneHotEncoder()),
                    ('lr', LogisticRegression())])
lr_categorical = lr_pipe1.fit(X, y)

In [16]:
lr_categorical.predict

<function sklearn.pipeline.Pipeline.predict(self, X, **predict_params)>

### Logistic Model Pipeline (ALL features)

In [17]:
# first drop rows with missing values from df_telco
df_telco3 = df_telco.dropna()

# then set X to have all columns except for churn and set y to be the churn column
X = df_telco3.drop(columns=['Churn'])
y = df_telco3['Churn']

In [18]:
list(df_telco3.columns)

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

Only encode categorical columns. Use passthrough to skip all numerical columns. 

In [19]:
from sklearn.compose import ColumnTransformer

transformer_name = 'ohe_on_all_categorical_features'
transformer = OneHotEncoder(sparse=False)
columns_to_encode = ['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

ohe_final = ColumnTransformer([
    (transformer_name, transformer, columns_to_encode)], 
    remainder='passthrough')

In [21]:
X_transformed = ohe_final.fit_transform(X)

In [22]:
X_transformed.shape

(5629, 46)

In [23]:
lr_pipe2 = Pipeline([('ohe', ohe_final),
                    ('lr', LogisticRegression())])

In [28]:
lr_final = lr_pipe2.fit(X, y)

In [29]:
lr_final.predict

<function sklearn.pipeline.Pipeline.predict(self, X, **predict_params)>