### Loading data - Colab

In [None]:
import pandas as pd
import kagglehub
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

# List files in the downloaded directory
print(os.listdir(path))

Using Colab cache for faster access to the 'telco-customer-churn' dataset.
['WA_Fn-UseC_-Telco-Customer-Churn.csv']


In [None]:
df = pd.read_csv(os.path.join(path, 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))

### Loading data - Local

In [1]:
# pip install seaborn matplotlib pandas numpy

import pandas as pd
import os
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = os.path.join('..', '..', 'data', 'raw', 'WA_Fn-UseC_-Telco-Customer-Churn.csv')

if os.path.exists(file_path):
    print(f"Loading data locally from: {file_path}")
    df = pd.read_csv(file_path)
else:
    print(f"[ISSUE]: File not found at {file_path}")
    print(f"Current Working Directory is: {os.getcwd()}")

Loading data locally from: ..\..\data\raw\WA_Fn-UseC_-Telco-Customer-Churn.csv


### Contd

In [4]:
# first 5 rows of the dataframe to check
# df.head(2)
# head alone, hides the middles columns

pd.set_option('display.max_columns', None)
display(df.head(3))

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [None]:
# Adds a background gradient to numerical columns
display(df.head(10).style.background_gradient(cmap='Blues'))

NameError: name 'df_raw' is not defined

In [5]:
df.shape

(7043, 21)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


TotalCharges, even though it's a number... seems to be a text.

In [4]:
# check how many null values we have
print(df.isnull().sum())


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [22]:
# Correct syntax: df_raw['ColumnName'].isnull()
null_rows = df[df['TotalCharges'].isnull()].index

print("Row numbers with missing TotalCharges:")
print(null_rows.tolist())

print("\nInspecting the rows:")
display(df.loc[null_rows])

Row numbers with missing TotalCharges:
[]

Inspecting the rows:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


The tenure is 0 in these columns. Meaning that the customers are new. Thus, no total charges makes sense.

In [23]:
# Replacing NaN with 0
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# Verify the fix
print("Remaining missing values:", df['TotalCharges'].isnull().sum())

Remaining missing values: 0


In [31]:
# Set professional plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# --- 1. DATA CLEANING & TYPE FIXING ---

# FORCE TotalCharges to numeric. 
# 'coerce' turns the empty strings "" into NaN (Not a Number)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check how many missing values we have now
missing_val = df.isnull().sum()
print(f"Missing values found:\n{missing_val[missing_val > 0]}")

# Fill the missing values (usually 11 rows for new customers with 0 tenure) with 0
#df_raw['TotalCharges'].fillna(0, inplace=True)

# Verify types are correct now
print("\nData Types after fix:")
#print(df_raw.dtypes[['tenure', 'MonthlyCharges', 'TotalCharges']])

Missing values found:
TotalCharges    11
dtype: int64

Data Types after fix:
