In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the raw churn export; the path is resolved relative to the kernel's
# working directory.
file_path = "Telco_Customer_Churn.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the parse.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Work on an independent copy so the raw frame `df` stays untouched by the
# cleaning steps that follow.
df_clean = df.copy(deep=True)


In [4]:
# Inspect how pandas parsed TotalCharges before any conversion is applied.
df_clean.dtypes['TotalCharges']


dtype('O')

In [5]:
# Convert TotalCharges to a numeric dtype. errors='coerce' replaces every value
# that cannot be parsed as a number with NaN instead of raising.
total_charges_numeric = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')
df_clean['TotalCharges'] = total_charges_numeric


In [6]:
# Show the current dtype of TotalCharges.
df_clean.dtypes['TotalCharges']


dtype('float64')

In [7]:
# Count the missing (NaN) entries in TotalCharges.
df_clean['TotalCharges'].isna().sum()


np.int64(11)

In [8]:
# Missing-value count for every column of the working frame.
df_clean.isna().sum()


Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [9]:
# Replace the missing TotalCharges values with 0.
# NOTE(review): 0 presumes the affected customers have not been billed yet
# (e.g. tenure == 0) — confirm against the data before relying on it.
df_clean['TotalCharges'] = df_clean['TotalCharges'].fillna(value=0)


In [10]:
# Per-column missing-value count of the working frame.
df_clean.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [11]:
# Remove the customerID column from the working frame.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it again after the column is already
# gone is a no-op rather than a KeyError.
df_clean = df_clean.drop(columns=['customerID'], errors='ignore')


In [12]:
# List the column labels currently present in the working frame.
df_clean.columns


Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [13]:
# Columns that the encoding loop maps from 'Yes'/'No' to 1/0.
binary_cols = [
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'Churn',
]


In [14]:
# Encode the Yes/No columns as 1/0.
# Series.map turns every value that is missing from the mapping into NaN, so
# running a plain map a second time (when the columns already hold 1/0) would
# wipe the data. Columns that are already numeric are skipped, which makes this
# cell safe to re-run.
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_cols:
    if pd.api.types.is_numeric_dtype(df_clean[col]):
        continue  # already encoded by an earlier run of this cell
    unexpected = set(df_clean[col].dropna().unique()) - set(yes_no_codes)
    if unexpected:
        # Fail loudly instead of silently converting unknown labels to NaN.
        raise ValueError(
            f"Unexpected values in {col!r}: {sorted(map(str, unexpected))}"
        )
    df_clean[col] = df_clean[col].map(yes_no_codes)


In [15]:
# Preview the first rows of the columns listed in binary_cols.
df_clean.loc[:, binary_cols].head()

Unnamed: 0,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,1,0,0,1,0
1,0,0,1,0,0
2,0,0,1,1,1
3,0,0,0,0,0
4,0,0,1,1,1


In [16]:
# Frequency of each distinct value in SeniorCitizen.
df_clean['SeniorCitizen'].value_counts()

Unnamed: 0_level_0,count
SeniorCitizen,Unnamed: 1_level_1
0,5901
1,1142


In [17]:
# Print the index range plus the non-null count and dtype of every column.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int64  
 3   Dependents        7043 non-null   int64  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int64  
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   int64  
 16  PaymentMethod     7043 non-null   object 


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_clean.describe()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,0.483033,0.299588,32.371149,0.903166,0.592219,64.761692,2279.734304,0.26537
std,0.368612,0.499748,0.45811,24.559481,0.295752,0.491457,30.090047,2266.79447,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,18.25,0.0,0.0
25%,0.0,0.0,0.0,9.0,1.0,0.0,35.5,398.55,0.0
50%,0.0,0.0,0.0,29.0,1.0,1.0,70.35,1394.55,0.0
75%,0.0,1.0,1.0,55.0,1.0,1.0,89.85,3786.6,1.0
max,1.0,1.0,1.0,72.0,1.0,1.0,118.75,8684.8,1.0


In [19]:
# Write the cleaned frame to CSV without the row index; the path is relative to
# the kernel's working directory.
cleaned_path = "cleaned_data.csv"
df_clean.to_csv(path_or_buf=cleaned_path, index=False)

# Echo the output location as the cell result.
cleaned_path

'cleaned_data.csv'

In [20]:
# Offer the cleaned CSV as a browser download when running inside Google Colab.
# google.colab only exists in the Colab runtime, so the import is guarded:
# on any other kernel the cell reports where the file is instead of raising
# ImportError and breaking a full re-run of the notebook.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; file is available locally at {cleaned_path!r}")
else:
    files.download(cleaned_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>