In [None]:
# Let's predict MonthlyCharge!
# Based on: Children, Age, Tenure, Bandwidth_GB_Year, Port_modem, Tablet, Phone, Multiple, OnlineSecurity,
# OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, PaperlessBilling

In [18]:
# read data into a DataFrame using pandas
import pandas as pd

# Title: pandas-dev/pandas
# Author: The pandas development team
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7741580

# Load the cleaned churn data and keep only the target (MonthlyCharge, listed
# first) and the candidate predictors.
# The explicit .copy() makes df an independent frame: later cells modify df in
# place, and doing that on a column selection of the frame returned by
# read_csv is what triggers pandas' SettingWithCopyWarning.
df = pd.read_csv('source_output/churn_clean.csv', header='infer')
df = df[['MonthlyCharge',
         'Children',
         'Age',
         'Tenure',
         'Bandwidth_GB_Year',
         'Port_modem',
         'Tablet',
         'Phone',
         'Multiple',
         'OnlineSecurity',
         'OnlineBackup',
         'DeviceProtection',
         'TechSupport',
         'StreamingTV',
         'StreamingMovies',
         'PaperlessBilling']].copy()

df.head(5)

Unnamed: 0,MonthlyCharge,Children,Age,Tenure,Bandwidth_GB_Year,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
0,172.455519,0,68,6.795513,904.53611,Yes,Yes,Yes,No,Yes,Yes,No,No,No,Yes,Yes
1,242.632554,1,27,1.156681,800.982766,No,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes
2,159.947583,4,50,15.754144,2054.706961,Yes,No,Yes,Yes,No,No,No,No,No,Yes,Yes
3,119.95684,1,48,17.087227,2164.579412,No,No,Yes,No,Yes,No,No,No,Yes,No,Yes
4,149.948316,0,83,1.670972,271.493436,Yes,No,No,No,No,No,No,Yes,Yes,No,No


In [19]:
# summarize each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MonthlyCharge      10000 non-null  float64
 1   Children           10000 non-null  int64  
 2   Age                10000 non-null  int64  
 3   Tenure             10000 non-null  float64
 4   Bandwidth_GB_Year  10000 non-null  float64
 5   Port_modem         10000 non-null  object 
 6   Tablet             10000 non-null  object 
 7   Phone              10000 non-null  object 
 8   Multiple           10000 non-null  object 
 9   OnlineSecurity     10000 non-null  object 
 10  OnlineBackup       10000 non-null  object 
 11  DeviceProtection   10000 non-null  object 
 12  TechSupport        10000 non-null  object 
 13  StreamingTV        10000 non-null  object 
 14  StreamingMovies    10000 non-null  object 
 15  PaperlessBilling   10000 non-null  object 
dtypes: float64(3), int64(2)

# Data Cleaning

## Duplicates

In [20]:
# show every row that exactly repeats an earlier row (an empty result means no duplicates)
df.loc[df.duplicated()]

Unnamed: 0,MonthlyCharge,Children,Age,Tenure,Bandwidth_GB_Year,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling


## Missing Values

In [21]:
# count the missing entries in each column
df.isna().sum()

MonthlyCharge        0
Children             0
Age                  0
Tenure               0
Bandwidth_GB_Year    0
Port_modem           0
Tablet               0
Phone                0
Multiple             0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
PaperlessBilling     0
dtype: int64

## Outliers

In [22]:
# check for outliers
# import scipy.stats to calculate z-scores

# Title: scipy/scipy: Scipy
# Author: Gommers, et al.
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7655153

from scipy import stats

# z-score every numeric column; the result keeps the name `outliers`
# because the following cell reads it
outliers = stats.zscore(df.select_dtypes(include='number'))
# per column, count the values whose |z| is 3 or more
(outliers.abs() >= 3).sum()

MonthlyCharge          0
Children             191
Age                    0
Tenure                 0
Bandwidth_GB_Year      0
dtype: int64

In [23]:
# Children is the only column with |z| >= 3; collect the index labels of those rows
Children_outliers = outliers.loc[outliers['Children'].abs() >= 3].index

In [24]:
# Replace the outlying Children values with the column median.
# Children_outliers holds index *labels*, so select with .loc, and do it in a
# single indexing step on df: the chained form df.Children.iloc[...] = ...
# assigns through an intermediate Series, which pandas does not guarantee to
# write back to df (and which never does under Copy-on-Write).
df.loc[Children_outliers, 'Children'] = df['Children'].median()

# Data Preparation

## Data Description: Categorical Variables

In [25]:
# count / unique / top / freq for every non-numeric column
df.describe(exclude=['number'])

Unnamed: 0,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
count,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,2,2,2,2,2,2,2,2,2,2,2
top,No,No,Yes,No,No,No,No,No,No,No,Yes
freq,5166,7009,9067,5392,6424,5494,5614,6250,5071,5110,5882


## Data Description: Continuous Variables

In [26]:
# summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe(include=['number'])

Unnamed: 0,MonthlyCharge,Children,Age,Tenure,Bandwidth_GB_Year
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,172.624816,1.925,53.0784,34.526188,3392.34155
std,42.943094,1.883383,20.698882,26.443063,2185.294852
min,79.97886,0.0,18.0,1.000259,155.506715
25%,139.979239,0.0,35.0,7.917694,1236.470827
50%,167.4847,1.0,53.0,35.430507,3279.536903
75%,200.734725,3.0,71.0,61.479795,5586.14137
max,290.160419,8.0,89.0,71.99928,7158.98153


# Data Transformation

In [27]:
# Binary-encode the Yes/No service flags: 'No' -> 0, 'Yes' -> 1.
# Only the non-numeric columns are touched, and the result is cast to int64
# explicitly instead of relying on DataFrame.replace silently downcasting
# object columns (deprecated in recent pandas releases).
# A label other than 'Yes'/'No' maps to NaN and makes the int64 cast raise,
# so unexpected values surface here instead of leaking into the output.
yes_no = {'No': 0, 'Yes': 1}
binary_cols = df.select_dtypes(exclude='number').columns
df[binary_cols] = df[binary_cols].apply(lambda col: col.map(yes_no)).astype('int64')

df.head(5)

Unnamed: 0,MonthlyCharge,Children,Age,Tenure,Bandwidth_GB_Year,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
0,172.455519,0,68,6.795513,904.53611,1,1,1,0,1,1,0,0,0,1,1
1,242.632554,1,27,1.156681,800.982766,0,1,1,1,1,0,0,0,1,1,1
2,159.947583,4,50,15.754144,2054.706961,1,0,1,1,0,0,0,0,0,1,1
3,119.95684,1,48,17.087227,2164.579412,0,0,1,0,1,0,0,0,1,0,1
4,149.948316,0,83,1.670972,271.493436,1,0,0,0,0,0,0,1,1,0,0


# Output

In [28]:
# Write the prepared frame to CSV. to_csv defaults to index=True, so the row
# index is written as an unnamed first column as well.
df.to_csv("source_output/churn_prepped2.csv")