In [2]:
import pandas as pd

# Location of the raw CSV, written relative to the notebook.
# The notebook sits in 'notebooks' and the data in 'data/raw'; a plain
# relative string is used for simplicity rather than building it with 'os.path'.
file_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first 5 rows as a quick sanity check of the load
print("Initial Data Preview:")
print(df.head(n=5))

# Summarise the columns: dtypes and non-null counts
print("\nDataFrame Info:")
df.info()

Initial Data Preview:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovie

In [3]:
# List the distinct TotalCharges values to spot entries that are not numbers
print("Unique values in TotalCharges:")
print(df['TotalCharges'].unique())

# Count the entries that cannot be read as numbers (empty strings, etc.):
#   1. pd.to_numeric(..., errors="coerce") turns every unparseable value into NaN
#   2. .isna().sum() then counts those NaNs
# Nothing is written back to df here; this is a read-only check.
coerced_total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
print("\nNumber of problematic entries in TotalCharges:")
print(coerced_total_charges.isna().sum())

Unique values in TotalCharges:
['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']

Number of problematic entries in TotalCharges:
11


In [4]:
# Convert TotalCharges to numeric; values that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop only the rows where TotalCharges is NaN.
# `subset` restricts the check to this one column; a bare dropna() would also
# discard any row that has a missing value in some unrelated column.
df.dropna(subset=['TotalCharges'], inplace=True)

# Guard the invariant the following cells rely on
assert df['TotalCharges'].notna().all(), "TotalCharges still contains NaN"

# Verify the changes
print("DataFrame Info after cleaning:")
df.info()

DataFrame Info after cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling

In [5]:
# Overall ARPU (Average Revenue Per User), computed here as the mean of the
# 'MonthlyCharges' column.
overall_arpu = df['MonthlyCharges'].mean()
print(f"Overall ARPU: ${overall_arpu:.2f}")

# Encode 'Churn' as an integer flag: 1 where the value is exactly 'Yes',
# 0 for anything else. The vectorised comparison replaces a row-by-row
# Python lambda and always yields an int64 column.
df['Churn_numeric'] = df['Churn'].eq('Yes').astype('int64')

# Overall churn rate in percent: the mean of the 0/1 flag is the share of
# churned customers, scaled by 100.
overall_churn_rate = df['Churn_numeric'].mean() * 100
print(f"Overall Churn Rate: {overall_churn_rate:.2f}%")

Overall ARPU: $64.80
Overall Churn Rate: 26.58%


In [None]:
# Churn rate (in percent) per contract type: the mean of 'Churn_numeric'
# within each 'Contract' group, scaled by 100.
churn_by_contract = (
    df.groupby('Contract')['Churn_numeric']
    .mean()
    .mul(100)
)

print("Churn Rate by Contract Type:")
print(churn_by_contract)