In [None]:
# silence every warning category for the remainder of the session
# NOTE(review): a blanket filter also hides pandas warnings that point at real
# problems (e.g. chained-assignment warnings) -- consider narrowing it.
import warnings
warnings.simplefilter('ignore')

In [None]:
# load the cleaned churn CSV into a pandas DataFrame

# Title: pandas-dev/pandas
# Author: The pandas development team
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7741580

import pandas as pd

# column names are taken from the first row of the file (read_csv's default)
churn_csv_path = 'source_output/churn_clean.csv'
df = pd.read_csv(churn_csv_path)

In [None]:
# restrict the DataFrame to the variables of interest, in this column order
relevant_columns = [
    'Area', 'Children', 'Age', 'Income', 'Marital',
    'Gender', 'Email', 'Contract', 'Port_modem', 'Tablet',
    'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
    'PaymentMethod', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year', 'Churn',
]
df = df[relevant_columns]

In [None]:
# preview the first five rows
df.head(n=5)

In [None]:
# print a summary of the DataFrame: column names, non-null counts and dtypes
df.info()

# Data Cleaning

## Duplicates

In [None]:
# show every row that exactly repeats an earlier row (empty result = no duplicates)
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

## Missing Values

In [None]:
# count the missing entries in each column
df.isna().sum()

## Outliers

In [None]:
# check for outliers
# import scipy.stats to calculate z-scores

# Title: scipy/scipy: Scipy
# Author: Gommers, et al.
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7655153

from scipy import stats

# z-score each numeric column (column-wise, population standard deviation).
# stats.zscore is documented to return an array, so compute on a plain ndarray
# and re-attach the row/column labels explicitly instead of relying on a
# DataFrame passing through unchanged.
numeric_columns = df.select_dtypes(include='number')
outliers = pd.DataFrame(
    stats.zscore(numeric_columns.to_numpy(dtype=float), axis=0),
    index=numeric_columns.index,
    columns=numeric_columns.columns,
)

# per column: how many values lie three or more standard deviations from the mean
(outliers.abs() >= 3).sum()

In [None]:
# children, income, and email features contain outliers:
# collect the row labels whose absolute z-score is 3 or more, one Index per feature
Children_outliers, Income_outliers, Email_outliers = (
    outliers.index[(outliers[feature].abs() >= 3).to_numpy()]
    for feature in ('Children', 'Income', 'Email')
)

In [None]:
# replace outlier values with median values
# Write through a single df.loc[rows, column] call per feature: chained
# indexing (df.col.iloc[...] = value) may assign into a temporary copy and
# leave df untouched, and .iloc would interpret the stored index labels as
# positions. The median is taken from the column as it stands before the write.
for column, outlier_rows in (
    ('Children', Children_outliers),
    ('Income', Income_outliers),
    ('Email', Email_outliers),
):
    df.loc[outlier_rows, column] = df[column].median()

# Data Preparation

## Data Description: Categorical Variables

In [None]:
# summary statistics for the non-numeric columns
categorical_columns = df.select_dtypes(exclude='number')
categorical_columns.describe()

## Data Description: Continuous Variables

In [None]:
# summary statistics for the numeric columns
numeric_summary_input = df.select_dtypes(include='number')
numeric_summary_input.describe()

# Data Transformation

## Data Transformation: Numerical Variables

In [None]:
# transform numerical variables to categorical variables
# Each entry maps a column to (bin edges, labels); consecutive edges delimit one bin.
bin_specs = {
    'Children': ([0, 1, 3, 8],
                 ['children_1', 'children_2', 'children_3']),
    'Age': ([18, 35, 53, 71, 89],
            ['age_1', 'age_2', 'age_3', 'age_4']),
    'Income': ([340, 19200, 33100, 51700, 124000],
               ['income_1', 'income_2', 'income_3', 'income_4']),
    'Email': ([3, 10, 12, 14, 21],
              ['email_1', 'email_2', 'email_3', 'email_4']),
    'Tenure': ([1, 8, 35, 61, 71],
               ['tenure_1', 'tenure_2', 'tenure_3', 'tenure_4']),
    'MonthlyCharge': ([79, 139, 167, 200, 290],
                      ['charge_1', 'charge_2', 'charge_3', 'charge_4']),
    'Bandwidth_GB_Year': ([150, 1230, 3280, 5590, 7160],
                          ['bw_1', 'bw_2', 'bw_3', 'bw_4']),
}

# pd.cut bins are right-closed and, by default, open at the very first edge, so
# a value equal to the lowest edge (e.g. 0 children, age 18) would become NaN
# and end up with no dummy column set. include_lowest=True closes that edge.
# Values outside the outermost edges still become NaN.
# NOTE: this cell is not idempotent -- re-running it on already-binned columns fails.
for column, (edges, labels) in bin_specs.items():
    df[column] = pd.cut(df[column], bins=edges, labels=labels, include_lowest=True)

## Data Transformation: Categorical Variables

In [None]:
# perform nominal encoding
# One indicator column is created per category of each listed variable; the
# listed columns themselves are dropped by get_dummies.
nominal_columns = [
    'Area',
    'Marital',
    'Gender',
    'InternetService',
    'PaymentMethod',
    'Bandwidth_GB_Year',
    'MonthlyCharge',
    'Tenure',
    'Email',
    'Income',
    'Age',
    'Children',
    'Contract',
]

# dtype=int pins the indicators to 0/1 integers; the default dtype differs
# between pandas versions (bool in pandas 2.x), which would otherwise put
# True/False columns next to the 0/1-encoded Yes/No columns.
df = pd.get_dummies(df, columns=nominal_columns, dtype=int)

In [None]:
# perform ordinal encoding: every 'No' becomes 0 and every 'Yes' becomes 1,
# wherever those strings occur in the DataFrame
yes_no_codes = {'No': 0, 'Yes': 1}
df = df.replace(yes_no_codes)

In [None]:
# preview the first five rows
df.head(5)

## Output

In [None]:
# write the prepared DataFrame to CSV (the row index is written too, as is to_csv's default)
prepared_csv_path = "source_output/churn_prepped1.csv"
df.to_csv(prepared_csv_path)