In [None]:
import pandas as pd
import numpy as np

# Read the raw Telco churn CSV into a DataFrame
df = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Show each column's dtype so wrongly-typed columns stand out
print(df.dtypes)

# 'TotalCharges' is read as object (blank entries prevent numeric parsing).
# Convert it to numeric, turning unparseable values into NaN, and drop the
# 'customerID' column in the same chain.
df = (
    df
    .assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
    .drop(columns='customerID')
)

In [None]:
# Count the missing values in every column
print(df.isna().sum())

# Fill the gaps in 'TotalCharges' with the column median, which is a safer
# choice than the mean for filling missing values
total_charges_median = df['TotalCharges'].median()
df = df.assign(TotalCharges=df['TotalCharges'].fillna(total_charges_median))

In [None]:
# Function to handle outliers using the IQR method
# Instead of deleting rows, extreme values are capped to reduce their impact

def handle_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Cap extreme values of one column using the IQR rule.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Values below
    the lower fence are raised to it and values above the upper fence are
    lowered to it; no rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to cap. It is modified in place.
    column : str
        Name of the column to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``column`` capped.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # Cap rather than drop: anything outside the fences is pulled back to
    # the nearest fence value.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df


# Cap outliers in the numerical columns, which are the ones more likely
# to contain skewed values
columns_to_cap = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in columns_to_cap:
    df = handle_outliers(df, col)

In [None]:
# Sanity check: list the columns currently in the frame
print(df.columns)

# Categorical columns that are converted to numeric form via one-hot encoding
multi_category_cols = [
    'gender',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# drop_first=True removes the first level of every encoded column, which
# avoids creating unnecessary dummy columns
df = pd.get_dummies(data=df, columns=multi_category_cols, drop_first=True)

In [None]:
# Write the cleaned frame to CSV (the index is not written out)
output_path = 'Telco_Churn_Cleaned.csv'
df.to_csv(output_path, index=False)
print("Data Preprocessing Complete!")