In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

%matplotlib inline

In [2]:
# Load the churn dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer="Customer-Churn.csv")
# Preview the first five rows to sanity-check the parse.
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Print the frame's schema: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [5]:
# Summary statistics for the numeric columns. A `count` below the total row
# count indicates missing values in that column.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


In [6]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
# Mean imputation for the missing TotalCharges values
# ("mean" is SimpleImputer's default strategy, spelled out here for clarity).
# NOTE(review): the imputer is fitted on the whole frame; if a train/test split
# is introduced later, fit it on the training portion only.
imputer = SimpleImputer(strategy="mean")

# SimpleImputer expects 2-D input, hence the double-bracket column selection.
# Learn the column mean, then fill the gaps; the result is kept in new_TC.
new_TC = imputer.fit(df[["TotalCharges"]]).transform(df[["TotalCharges"]])

# Overwrite the original column with the imputed values.
df["TotalCharges"] = new_TC

# Verify that no missing values remain in the column.
df["TotalCharges"].isna().sum()

0

In [9]:
# Pick out four categorical columns into their own frame.
cat_var = df.loc[:, ["StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod"]]
cat_var.head()

Unnamed: 0,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,No,Month-to-month,Yes,Electronic check
1,No,One year,No,Mailed check
2,No,Month-to-month,Yes,Mailed check
3,No,One year,No,Bank transfer (automatic)
4,No,Month-to-month,Yes,Electronic check


In [10]:
# One-hot encoder that drops the first category of every feature, so a feature
# with k categories produces k-1 indicator columns.
encoder = OneHotEncoder(drop="first")
# Learn the categories from cat_var, then encode the same frame.
encoded_var = encoder.fit(cat_var).transform(cat_var)

In [11]:
# Convert the encoded matrix to a dense array and fetch the generated labels
# once, then reuse them for the printouts and the DataFrame below.
encoded_dense = encoded_var.toarray()
encoded_cols = encoder.get_feature_names_out()

# Raw encoded matrix
print("Encoded matrix \n", encoded_dense, "\n")

# Labels generated by the encoder
print("Encoded column names \n", encoded_cols, "\n")

# Encoded features as a labelled DataFrame (rendered as the cell's output)
print("Encoded categorical dataframe \n")
pd.DataFrame(data=encoded_dense, columns=encoded_cols)


Encoded matrix 
 [[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 0.]] 

Encoded column names 
 ['StreamingMovies_No internet service' 'StreamingMovies_Yes'
 'Contract_One year' 'Contract_Two year' 'PaperlessBilling_Yes'
 'PaymentMethod_Credit card (automatic)' 'PaymentMethod_Electronic check'
 'PaymentMethod_Mailed check'] 

Encoded categorical dataframe 



Unnamed: 0,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
7038,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
7039,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
7040,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7041,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [12]:
# Pick out the two charge columns into their own numeric frame.
num_var = df.loc[:, ["MonthlyCharges", "TotalCharges"]]
num_var.head()

Unnamed: 0,MonthlyCharges,TotalCharges
0,29.85,29.85
1,56.95,1889.5
2,53.85,108.15
3,42.3,1840.75
4,70.7,151.65


In [13]:
# StandardScaler with default settings: each column is centred on its mean and
# divided by its standard deviation.
scaler = StandardScaler()

# Learn the per-column statistics from num_var, then standardise the same frame.
scaled_var = scaler.fit(num_var).transform(num_var)

print(scaled_var)

[[-1.16032292 -0.99497138]
 [-0.25962894 -0.17387565]
 [-0.36266036 -0.96039939]
 ...
 [-1.1686319  -0.85518222]
 [ 0.32033821 -0.87277729]
 [ 1.35896134  2.01391739]]


In [14]:
# Feature matrix: every column of df except the 'Churn' label.
features = df.drop("Churn", axis="columns")
features.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [16]:
# Prepare the preprocessor using the scaler and encoder from earlier, and choose
# the columns for each transformation by dtype with make_column_selector().
#
# The numeric branch imputes (column mean) before scaling, so the preprocessor
# copes with missing numeric values by itself instead of relying on `df` having
# been patched in an earlier cell. On data without missing values the imputer
# leaves the values unchanged, so the output is the same as scaling alone.
# The transformer names 'scaler' and 'encoder' are kept unchanged.
numeric_steps = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                          ('scaler', scaler)])

preprocessor = ColumnTransformer([('scaler', numeric_steps, make_column_selector(dtype_include='number')),
                                  ('encoder', encoder, make_column_selector(dtype_include='object'))])

# Fit every branch on the feature frame and build the combined matrix:
# the numeric columns come first, followed by the one-hot indicator columns.
processed_feat = preprocessor.fit_transform(features)

print(processed_feat)
print(processed_feat.shape)


[[-0.43991649 -1.27744458 -1.16032292 ...  0.          1.
   0.        ]
 [-0.43991649  0.06632742 -0.25962894 ...  0.          0.
   1.        ]
 [-0.43991649 -1.23672422 -0.36266036 ...  0.          0.
   1.        ]
 ...
 [-0.43991649 -0.87024095 -1.1686319  ...  0.          1.
   0.        ]
 [ 2.27315869 -1.15528349  0.32033821 ...  0.          0.
   1.        ]
 [-0.43991649  1.36937906  1.35896134 ...  0.          0.
   0.        ]]
(7043, 30)
