In [1]:
import pandas as pd
import numpy as np

# 1. Load data (adjust filename if needed)

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 2. Quick preview & types

In [3]:
print(df.shape)
print(df.info())
print(df.head())

(7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null

# 3. Remove duplicates

In [4]:
df = df.drop_duplicates()

# 4. Handle missing and blank values

In [5]:
# 'TotalCharges' can have blank/space values, convert to NaN then to float
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print('Missing TotalCharges:', df['TotalCharges'].isna().sum())

Missing TotalCharges: 11


In [6]:
# In this dataset, NaN in TotalCharges always happen where tenure==0 (brand new customers)
df = df[df['TotalCharges'].notna()]
df['TotalCharges'] = df['TotalCharges'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TotalCharges'] = df['TotalCharges'].astype(float)



# 5. Convert integer flags to categorical (optional, for consistency)

In [7]:
df['SeniorCitizen'] = df['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

# 6. Clean categorical columns

In [8]:
# Replace "No internet service" and "No phone service" with "No" for simplicity
no_internet_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies'
]
for col in no_internet_cols:
    df[col] = df[col].replace({'No internet service': 'No'})
df['MultipleLines'] = df['MultipleLines'].replace({'No phone service': 'No'})

# 7. Encode categorical variables (One-hot encoding, but first drop customerID)

In [9]:
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn'].map({'No': 0, 'Yes': 1})

In [10]:
# List of categorical columns
cat_cols = X.select_dtypes('object').columns.tolist()
# Get dummies (drop_first=True to avoid redundant columns)
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# 8. (Optional) Remove outliers in numeric columns (tenure, MonthlyCharges, TotalCharges)

In [11]:
for col in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    upper = X[col].quantile(0.99)
    X = X[X[col] <= upper]

In [12]:
# Align y index (if rows dropped due to outliers)
y = y.loc[X.index]

# 9. Final cleaned data check

In [13]:
print(X.shape)
print(y.value_counts())
print(X.info())
print(X.head())

(6891, 23)
Churn
0    5030
1    1861
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 6891 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   tenure                                 6891 non-null   int64  
 1   MonthlyCharges                         6891 non-null   float64
 2   TotalCharges                           6891 non-null   float64
 3   gender_Male                            6891 non-null   bool   
 4   SeniorCitizen_Yes                      6891 non-null   bool   
 5   Partner_Yes                            6891 non-null   bool   
 6   Dependents_Yes                         6891 non-null   bool   
 7   PhoneService_Yes                       6891 non-null   bool   
 8   MultipleLines_Yes                      6891 non-null   bool   
 9   InternetService_Fiber optic            6891 non-null   bool   
 10  InternetServic

# 10. Save cleaned dataset (features + target)

In [14]:
cleaned_data = X.copy()
cleaned_data['Churn'] = y  # Add target column
cleaned_data.to_csv('telco_churn_cleaned.csv', index=False)