<a href="https://colab.research.google.com/github/rlong6767/ryan-ml-portfolio/blob/main/customer-churn/notebooks/Churn_prepocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import kagglehub

# Fetch the latest version of the Telco customer-churn dataset through
# kagglehub; `path` is the local location of the downloaded files and is
# used when reading the CSV.
DATASET_HANDLE = "blastchar/telco-customer-churn"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/telco-customer-churn


In [53]:
import pandas as pd
import os

# Build the full path to the CSV inside the downloaded dataset location,
# then load it into the working DataFrame.
csv_path = os.path.join(path, "WA_Fn-UseC_-Telco-Customer-Churn.csv")
df = pd.read_csv(csv_path)

In [54]:
# Quick structural overview of the loaded frame. Each section is a title
# line followed by the value; all but the last are followed by spacing.
SECTION_GAP = "\n"  # printing this emits the same spacing as print("\n")

# Column dtypes and non-null counts -- info() writes its report itself.
print("=== DataFrame Info ===")
df.info()
print(SECTION_GAP)

# First five rows
print("=== First 5 Rows ===", df.head(), SECTION_GAP, sep="\n")

# Dimensions as (rows, columns)
print("=== Shape ===", df.shape, SECTION_GAP, sep="\n")

# Summary statistics of the numeric columns
print("=== Descriptive Statistics ===", df.describe(), SECTION_GAP, sep="\n")

# Class balance of the target column (Churn)
print("=== Churn Value Counts ===", df['Churn'].value_counts(), sep="\n")


=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7

In [55]:
# Peek at the distinct raw values of 'TotalCharges' to spot entries that
# might cause issues; only the first 20 are displayed.
total_charges_values = df['TotalCharges'].unique()
total_charges_values[:20]


array(['29.85', '1889.5', '108.15', '1840.75', '151.65', '820.5',
       '1949.4', '301.9', '3046.05', '3487.95', '587.45', '326.8',
       '5681.1', '5036.3', '2686.05', '7895.15', '1022.95', '7382.25',
       '528.35', '1862.9'], dtype=object)

In [56]:
# Parse 'TotalCharges' as numbers. errors='coerce' turns every entry that
# cannot be parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Number of NaNs present after the conversion
df['TotalCharges'].isna().sum()


np.int64(11)

In [57]:
# Keep only the rows whose 'TotalCharges' is present; the rows with NaN
# make up less than 1% of the data, so they are simply discarded.
has_total_charges = df['TotalCharges'].notna()
df = df[has_total_charges]

In [58]:
# Encode the two-valued columns (Yes/No or Male/Female) as 0/1.
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
binary_value_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
for col in binary_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Already encoded (the cell was re-run). Mapping 0/1 through the
        # string mapping again would turn every value into NaN.
        continue
    encoded = df[col].map(binary_value_map)
    # Series.map yields NaN for anything missing from the mapping; fail
    # loudly instead of silently carrying NaNs into the feature matrix.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), col].unique()
        raise ValueError(f"Column {col!r} has values outside the binary mapping: {unexpected}")
    df[col] = encoded.astype('int64')

# For other categorical columns with multiple values, first check unique values
for col in ['MultipleLines', 'InternetService', 'OnlineSecurity',
            'OnlineBackup', 'DeviceProtection', 'TechSupport',
            'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']:
    print(f"{col} unique values: {df[col].unique()}")

MultipleLines unique values: ['No phone service' 'No' 'Yes']
InternetService unique values: ['DSL' 'Fiber optic' 'No']
OnlineSecurity unique values: ['No' 'Yes' 'No internet service']
OnlineBackup unique values: ['Yes' 'No' 'No internet service']
DeviceProtection unique values: ['No' 'Yes' 'No internet service']
TechSupport unique values: ['No' 'Yes' 'No internet service']
StreamingTV unique values: ['No' 'Yes' 'No internet service']
StreamingMovies unique values: ['No' 'Yes' 'No internet service']
Contract unique values: ['Month-to-month' 'One year' 'Two year']
PaymentMethod unique values: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


In [59]:
# Collapse 'No phone service' and 'No internet service' into plain 'No' so
# that each of these service columns ends up with only two values.
cols_to_merge = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
no_service_aliases = {'No phone service':'No', 'No internet service':'No'}

for col in cols_to_merge:
    df[col] = df[col].replace(no_service_aliases)
    # Double-check the remaining unique values right after the replacement
    print(f"{col} unique values after merge: {df[col].unique()}")


MultipleLines unique values after merge: ['No' 'Yes']
OnlineSecurity unique values after merge: ['No' 'Yes']
OnlineBackup unique values after merge: ['Yes' 'No']
DeviceProtection unique values after merge: ['No' 'Yes']
TechSupport unique values after merge: ['No' 'Yes']
StreamingTV unique values after merge: ['No' 'Yes']
StreamingMovies unique values after merge: ['No' 'Yes']


In [60]:
# One-hot encode the features that have more than two categories.
# drop_first=True omits the first level of each feature, so a feature with
# k categories yields k-1 indicator columns.
multi_category_cols = ['InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=multi_category_cols, drop_first=True)

In [37]:
# Encode the Yes/No service columns as 1/0.
binary_service_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
yes_no_map = {'Yes': 1, 'No': 0}

for col in binary_service_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Already encoded (the cell was re-run). Mapping 0/1 through the
        # Yes/No mapping again would turn every value into NaN.
        continue
    encoded = df[col].map(yes_no_map)
    # Series.map yields NaN for anything missing from the mapping; fail
    # loudly instead of silently carrying NaNs into the feature matrix.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), col].unique()
        raise ValueError(f"Column {col!r} has values outside the Yes/No mapping: {unexpected}")
    df[col] = encoded.astype('int64')

# info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   customerID                             7032 non-null   object 
 1   gender                                 7032 non-null   int64  
 2   SeniorCitizen                          7032 non-null   int64  
 3   Partner                                7032 non-null   int64  
 4   Dependents                             7032 non-null   int64  
 5   tenure                                 7032 non-null   int64  
 6   PhoneService                           7032 non-null   int64  
 7   MultipleLines                          7032 non-null   int64  
 8   OnlineSecurity                         7032 non-null   int64  
 9   OnlineBackup                           7032 non-null   int64  
 10  DeviceProtection                       7032 non-null   int64  
 11  TechSuppo

In [61]:
# Set the customer IDs aside so predictions can be matched back to
# customers later on.
customer_ids = df['customerID']

# The ID column is not used for training, so remove it from the frame.
df = df.drop(columns=['customerID'])

# Cast every boolean column to int to prevent edge-case issues with
# model training.
bool_cols = df.select_dtypes('bool').columns
df = df.astype(dict.fromkeys(bool_cols, 'int'))
