In [11]:
import pandas as pd

# Load dataset
df = pd.read_csv("Telco-Customer-Churn.csv")

# Quick peek
print(df.shape)
df.head()


(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [13]:
# Handle Missing Values:-
# Check for missing values
df.isnull().sum()

# For TotalCharges, some blanks exist → convert to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing TotalCharges with median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [15]:
#Drop irrelavant columns

# customerID is just an identifier, drop it
df = df.drop(columns=['customerID'])

In [17]:
# Encode Target Variable:-

# Churn column is Yes/No → convert to binary (1 = churn, 0 = no churn)
df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})

In [19]:
#Handle Categorial variables:-
# Split categorical and numeric
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(exclude=['object']).columns

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("Final shape after encoding:", df_encoded.shape)


Final shape after encoding: (7043, 6560)


In [21]:
#Feature Sclaing:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

df_encoded.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,-0.439916,-1.277445,-1.160323,-0.601023,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,-0.439916,0.066327,-0.259629,-0.601023,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,-0.439916,-1.236724,-0.36266,1.663829,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,-0.439916,0.514251,-0.746535,-0.601023,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,-0.439916,-1.236724,0.197365,1.663829,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
#Save the clean dataset
df_encoded.to_csv("Telco-Customer-Churn-Clean.csv", index=False)

print("Cleaned dataset saved as 'Telco-Customer-Churn-Clean.csv'")
print("Shape:", df_encoded.shape)


Cleaned dataset saved as 'Telco-Customer-Churn-Clean.csv'
Shape: (7043, 6560)
