In [2]:
# read data into DataFrame using pandas
import pandas as pd

# Title: pandas-dev/pandas
# Author: The pandas development team
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7741580

# columns kept for the analysis, listed in the order they should appear in df
selected_columns = ['MonthlyCharge',
                    'Bandwidth_GB_Year',
                    'Port_modem',
                    'Tablet',
                    'Phone',
                    'Multiple',
                    'OnlineSecurity',
                    'OnlineBackup',
                    'DeviceProtection',
                    'TechSupport',
                    'StreamingTV',
                    'StreamingMovies',
                    'PaperlessBilling']

# usecols makes the parser skip every other column instead of loading the whole
# file and then discarding most of it; usecols ignores the order of the list, so
# the result is indexed with the same list again to pin the column order
df = pd.read_csv('source_output/churn_clean.csv',
                 header='infer',
                 usecols=selected_columns)[selected_columns]

df.head(5)

Unnamed: 0,MonthlyCharge,Bandwidth_GB_Year,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
0,172.455519,904.53611,Yes,Yes,Yes,No,Yes,Yes,No,No,No,Yes,Yes
1,242.632554,800.982766,No,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes
2,159.947583,2054.706961,Yes,No,Yes,Yes,No,No,No,No,No,Yes,Yes
3,119.95684,2164.579412,No,No,Yes,No,Yes,No,No,No,Yes,No,Yes
4,149.948316,271.493436,Yes,No,No,No,No,No,No,Yes,Yes,No,No


In [3]:
# summarize the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MonthlyCharge      10000 non-null  float64
 1   Bandwidth_GB_Year  10000 non-null  float64
 2   Port_modem         10000 non-null  object 
 3   Tablet             10000 non-null  object 
 4   Phone              10000 non-null  object 
 5   Multiple           10000 non-null  object 
 6   OnlineSecurity     10000 non-null  object 
 7   OnlineBackup       10000 non-null  object 
 8   DeviceProtection   10000 non-null  object 
 9   TechSupport        10000 non-null  object 
 10  StreamingTV        10000 non-null  object 
 11  StreamingMovies    10000 non-null  object 
 12  PaperlessBilling   10000 non-null  object 
dtypes: float64(2), object(11)
memory usage: 1015.8+ KB


# Data Cleaning

## Duplicates

In [4]:
# check for duplication: flag rows that repeat an earlier row in every column,
# then display only the flagged rows (an empty frame means no duplicates)
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

Unnamed: 0,MonthlyCharge,Bandwidth_GB_Year,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling


## Missing Values

In [5]:
# check for missing values: count the null entries in each column
missing_mask = df.isna()
missing_mask.sum()

MonthlyCharge        0
Bandwidth_GB_Year    0
Port_modem           0
Tablet               0
Phone                0
Multiple             0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
PaperlessBilling     0
dtype: int64

## Outliers

In [6]:
# check for outliers
# import scipy.stats to calculate z-scores

# Title: scipy/scipy: Scipy
# Author: Gommers, et al.
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7655153

from scipy import stats

# z-scores are only defined for the numeric columns
numeric_columns = df.select_dtypes(include='number')
outliers = stats.zscore(numeric_columns)

# per column, count the values whose absolute z-score is 3 or more
is_outlier = outliers.abs() >= 3
is_outlier.sum()

MonthlyCharge        0
Bandwidth_GB_Year    0
dtype: int64

# Data Preparation

## Data Description: Categorical Variables

In [8]:
# describe the non-numeric columns: count, number of unique values,
# most frequent value and its frequency
categorical_columns = df.select_dtypes(exclude='number')
categorical_columns.describe()

Unnamed: 0,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
count,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,2,2,2,2,2,2,2,2,2,2,2
top,No,No,Yes,No,No,No,No,No,No,No,Yes
freq,5166,7009,9067,5392,6424,5494,5614,6250,5071,5110,5882


## Data Description: Continuous Variables

In [9]:
# describe the numeric columns: count, mean, standard deviation and quartiles
continuous_columns = df.select_dtypes(include='number')
continuous_columns.describe()

Unnamed: 0,MonthlyCharge,Bandwidth_GB_Year
count,10000.0,10000.0
mean,172.624816,3392.34155
std,42.943094,2185.294852
min,79.97886,155.506715
25%,139.979239,1236.470827
50%,167.4847,3279.536903
75%,200.734725,5586.14137
max,290.160419,7158.98153


# Data Transformation

In [11]:
# perform binary (0/1) encoding of the Yes/No columns: No -> 0, Yes -> 1
#
# Series.map + astype is used instead of DataFrame.replace(..., inplace=True):
# replace only produced integer columns through silent object -> int
# downcasting, which pandas 2.2 deprecates, whereas astype('int64') states the
# resulting dtype explicitly. Restricting the loop to non-numeric columns that
# contain nothing but 'Yes'/'No' also makes the cell safe to re-run, because
# already-encoded columns are numeric and are skipped.
yes_no_codes = {'No': 0, 'Yes': 1}

for column in df.select_dtypes(exclude='number').columns:
    # leave any text column that holds something other than Yes/No untouched
    if df[column].isin(list(yes_no_codes)).all():
        df[column] = df[column].map(yes_no_codes).astype('int64')

df.head(5)

Unnamed: 0,MonthlyCharge,Bandwidth_GB_Year,Port_modem,Tablet,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling
0,172.455519,904.53611,1,1,1,0,1,1,0,0,0,1,1
1,242.632554,800.982766,0,1,1,1,1,0,0,0,1,1,1
2,159.947583,2054.706961,1,0,1,1,0,0,0,0,0,1,1
3,119.95684,2164.579412,0,0,1,0,1,0,0,0,1,0,1
4,149.948316,271.493436,1,0,0,0,0,0,0,1,1,0,0


# Output

In [12]:
# write the prepared data to disk; index=True (the pandas default, spelled out
# here) stores the row index as an unnamed first column, so readers of this
# file need index_col=0 to get the same frame back
df.to_csv("source_output/churn_prepped2.csv", index=True)