In [65]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [66]:
df = pd.read_csv("TelcoChurnData.csv")

In [67]:
df.shape

(7043, 21)

In [68]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [69]:
df.drop('customerID', axis='columns', inplace=True)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [71]:
df.TotalCharges = df.TotalCharges.apply(lambda x: x.strip())
df.TotalCharges = pd.to_numeric(df.TotalCharges)

In [72]:
df["tenure"]

0        1
1       34
2        2
3       45
4        2
        ..
7038    24
7039    72
7040    11
7041     4
7042    66
Name: tenure, Length: 7043, dtype: int64

## EDA

In [73]:
# tenure relationship with Churn behavior
px.histogram(data_frame=df[["tenure", "Churn"]],
       x='tenure',color='Churn', barmode="group")

In [74]:
# MontlyCharges relationship with Churn behavior

px.histogram(data_frame=df,
       x='MonthlyCharges',
       color='Churn', barmode='group')

## Data cleaning

In [75]:
for column in df.columns:
  if df[column].dtypes == "object":
    print("Column {} : {}".format(column, list(df[column].unique())))

Column gender : ['Female', 'Male']
Column Partner : ['Yes', 'No']
Column Dependents : ['No', 'Yes']
Column PhoneService : ['No', 'Yes']
Column MultipleLines : ['No phone service', 'No', 'Yes']
Column InternetService : ['DSL', 'Fiber optic', 'No']
Column OnlineSecurity : ['No', 'Yes', 'No internet service']
Column OnlineBackup : ['Yes', 'No', 'No internet service']
Column DeviceProtection : ['No', 'Yes', 'No internet service']
Column TechSupport : ['No', 'Yes', 'No internet service']
Column StreamingTV : ['No', 'Yes', 'No internet service']
Column StreamingMovies : ['No', 'Yes', 'No internet service']
Column Contract : ['Month-to-month', 'One year', 'Two year']
Column PaperlessBilling : ['Yes', 'No']
Column PaymentMethod : ['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)']
Column Churn : ['No', 'Yes']


In [76]:
df.replace('No phone service', 'No', inplace=True)
df.replace('No internet service', 'No', inplace=True)

In [77]:
df.replace({'Yes':1, 'No': 0}, inplace=True)

In [78]:
df["gender"].replace({'Male':0, 'Female':1}, inplace=True)

In [79]:
df_new = pd.get_dummies(df, columns=['InternetService', 'Contract','PaymentMethod'])

In [80]:
df_new.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,InternetService_0,InternetService_DSL,InternetService_Fiber optic,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,29.85,29.85,0,0,1,0,1,0,0,0,0,1,0
1,0,0,0,0,34,1,0,1,0,1,0,0,0,0,56.95,1889.5,0,0,1,0,0,1,0,0,0,0,1
2,0,0,0,0,2,1,0,1,1,0,0,0,0,1,53.85,108.15,1,0,1,0,1,0,0,0,0,0,1
3,0,0,0,0,45,0,0,1,0,1,1,0,0,0,42.3,1840.75,0,0,1,0,0,1,0,1,0,0,0
4,1,0,0,0,2,1,0,0,0,0,0,0,0,1,70.7,151.65,1,0,0,1,1,0,0,0,0,1,0


In [83]:
# train-test split
target = 'Churn'
features = list(df_new.columns)
features.remove(target)



In [85]:
X_train, y_train, X_test, y_test = train_test_split(df_new[features], df_new[target], test_size=0.2, stratify=df_new[target])

In [86]:
X_train.shape

(5634, 26)

In [88]:
#Scaling training data
scaler_dict = dict()

for cols in ["tenure", "MonthlyCharges", "TotalCharges"]:
  scaler = MinMaxScaler()
  X_train[cols] = scaler.fit_transform(X_train[cols])
  scaler_dict[cols] = scaler

SyntaxError: ignored