In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned churn table and print its schema (columns, non-null counts, dtypes)
cleaned_churn_df = pd.read_csv(
    filepath_or_buffer="../data/cleaned_churn_data.csv"
)
cleaned_churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 50 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   customer_id                        7043 non-null   object 
 1   gender                             7043 non-null   object 
 2   age                                7043 non-null   int64  
 3   under_30                           7043 non-null   object 
 4   senior_citizen                     7043 non-null   object 
 5   married                            7043 non-null   object 
 6   dependents                         7043 non-null   object 
 7   number_of_dependents               7043 non-null   int64  
 8   country                            7043 non-null   object 
 9   state                              7043 non-null   object 
 10  city                               7043 non-null   object 
 11  zip_code                           7043 non-null   int64

## Converting two-valued `object` columns to binary (0/1) integers

In [3]:
# Select the `object` (string) columns
object_cols = cleaned_churn_df.select_dtypes(include='object')

# Known two-valued encodings, keyed by the exact set of labels they cover.
# Looking a mapping up by label set means a column is only encoded when
# both of its labels are actually present in the mapping.
binary_encodings = {
    frozenset({'Male', 'Female'}): {'Male': 1, 'Female': 0},
    frozenset({'Yes', 'No'}): {'Yes': 1, 'No': 0},
}

# Dictionary holding each binary column name -> its two original labels
binary_cols = {}

# Find the binary columns: exactly two distinct non-null values
for col in object_cols:
    unique_vals = cleaned_churn_df[col].dropna().unique()
    if len(unique_vals) == 2:
        binary_cols[col] = unique_vals

# Encode each binary column as 0/1. Missing values stay missing because
# `Series.map` propagates NaN.
for col, unique_vals in binary_cols.items():
    mapping = binary_encodings.get(frozenset(unique_vals))
    if mapping is None:
        # A two-valued column with other labels (e.g. 'True'/'False') would be
        # turned entirely into NaN by a blind Yes/No map, so leave it untouched
        # and report it instead of silently destroying the data.
        print(f"Skipping '{col}': unrecognised binary labels {list(unique_vals)}")
        continue
    cleaned_churn_df[col] = cleaned_churn_df[col].map(mapping)

cleaned_churn_df

Unnamed: 0,customer_id,gender,age,under_30,senior_citizen,married,dependents,number_of_dependents,country,state,...,total_extra_data_charges,total_long_distance_charges,total_revenue,satisfaction_score,customer_status,churn_label,churn_score,cltv,churn_category,churn_reason
0,8779-QRDMV,1,78,0,1,0,0,0,United States,California,...,20,0.00,59.65,3,Churned,1,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,0,74,0,1,1,1,1,United States,California,...,0,390.80,1024.10,3,Churned,1,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,1,71,0,1,0,1,3,United States,California,...,0,203.94,1910.88,2,Churned,1,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,0,78,0,1,1,1,1,United States,California,...,0,494.00,2995.07,2,Churned,1,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,0,80,0,1,1,1,1,United States,California,...,0,234.21,3102.36,2,Churned,1,67,2793,Price,Extra data charges
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,0,30,0,0,0,0,0,United States,California,...,0,1639.44,3039.53,5,Stayed,0,45,5306,Not Applicable,Don't know
7039,6840-RESVB,1,38,0,0,1,1,2,United States,California,...,0,865.20,2807.47,3,Stayed,0,59,2140,Not Applicable,Don't know
7040,2234-XADUH,0,30,0,0,1,1,2,United States,California,...,0,2135.52,9453.04,4,Stayed,0,71,5560,Not Applicable,Don't know
7041,4801-JZAZL,0,32,0,0,1,1,2,United States,California,...,0,0.00,319.21,4,Stayed,0,59,2793,Not Applicable,Don't know


## Dropping columns

In [4]:
# Columns removed before modeling.
# NOTE(review): presumably identifiers, location fields, columns derivable from
# others (e.g. under_30 / senior_citizen from age), and churn-outcome fields
# that would leak the target — confirm the rationale for each.
columns_to_drop = [
    'customer_id',
    'under_30',
    'senior_citizen',
    'dependents',
    'country',
    'city',
    'latitude',
    'longitude',
    'quarter',
    'referred_a_friend',
    'number_of_referrals',
    'offer',
    'customer_status',
    'churn_score',
    'churn_category',
    'churn_reason',
    'state',
]

# `drop` returns a new frame; `cleaned_churn_df` itself is left unchanged
newCleaned_churn_df = cleaned_churn_df.drop(columns=columns_to_drop)

In [5]:
# Check which columns remain after the drop and which are still `object` dtype
newCleaned_churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   gender                             7043 non-null   int64  
 1   age                                7043 non-null   int64  
 2   married                            7043 non-null   int64  
 3   number_of_dependents               7043 non-null   int64  
 4   zip_code                           7043 non-null   int64  
 5   population                         7043 non-null   int64  
 6   tenure_in_months                   7043 non-null   int64  
 7   phone_service                      7043 non-null   int64  
 8   avg_monthly_long_distance_charges  7043 non-null   float64
 9   multiple_lines                     7043 non-null   int64  
 10  internet_service                   7043 non-null   int64  
 11  internet_type                      7043 non-null   objec

In [6]:
# Category frequencies for `internet_type` (a multi-valued column, so it was not binary-encoded)
newCleaned_churn_df['internet_type'].value_counts()

internet_type
Fiber Optic    3035
DSL            1652
No Internet    1526
Cable           830
Name: count, dtype: int64

In [7]:
# Frequencies of the already 0/1-encoded `internet_service` flag, for comparison with `internet_type`
newCleaned_churn_df['internet_service'].value_counts()

internet_service
1    5517
0    1526
Name: count, dtype: int64

In [8]:
# Category frequencies for `contract`
newCleaned_churn_df['contract'].value_counts()

contract
Month-to-Month    3610
Two Year          1883
One Year          1550
Name: count, dtype: int64

In [9]:
# Category frequencies for `payment_method`
newCleaned_churn_df['payment_method'].value_counts()

payment_method
Bank Withdrawal    3909
Credit Card        2749
Mailed Check        385
Name: count, dtype: int64

## Creating two datasets
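1. `df_linear`: for linear models, encoded with `drop_first=True` so that the first level of each categorical column is dropped and serves as the reference category (this avoids perfectly collinear dummy columns).
2. `df_tree`: for tree-based models, encoded with `drop_first=False` so that every category keeps its own dummy column.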

In [10]:
# Multi-valued categorical columns that get one-hot encoded
categorical_cols = ["internet_type", "contract", "payment_method"]

# Build both encodings from the same source frame:
#   df_linear -> drop_first=True  (first level of each column is dropped)
#   df_tree   -> drop_first=False (every level keeps its own dummy column)
# NOTE(review): the value counts above show 1526 'No Internet' rows and 1526
# rows with internet_service == 0, so the 'No Internet' dummy is likely the
# exact complement of `internet_service` — confirm, and consider dropping one
# of the two for the linear dataset.
df_linear, df_tree = (
    pd.get_dummies(
        newCleaned_churn_df,
        columns=categorical_cols,
        drop_first=drop_reference_level,
    )
    for drop_reference_level in (True, False)
)

## Saving the datasets

In [11]:
# Write the linear-model dataset to CSV without the row index
output_path_linear = "../data/linear_data.csv"
df_linear.to_csv(path_or_buf=output_path_linear, index=False)
print(f"Dataset used for Linear modeling is saved at {output_path_linear}")

Dataset used for Linear modeling is saved at ../data/linear_data.csv


In [12]:
# Write the tree-model dataset to CSV without the row index
output_path_tree = "../data/tree_data.csv"
df_tree.to_csv(path_or_buf=output_path_tree, index=False)
print(f"Dataset used for Tree-based modeling is saved at {output_path_tree}")

Dataset used for Tree-based modeling is saved at ../data/tree_data.csv
