<h1 style="color:#1f77b4; text-align:left; font-size:40px;">
    Data Preparation and Cleaning
</h1>

<h3 style="color:#555; text-align:left;">
    Dataset Understanding, Descriptive Statistics, Outliers, Correlations and Visualization
</h3>


<h2 style="color:#1f77b4; border-bottom: 3px solid #1f77b4; padding-bottom:4px;">
</h2>

In [1]:
import pandas as pd

# Location of the master CSV, relative to the notebook's working directory
RAW_PATH = "../data/interim/telco_master.csv"

# Read the file into the working frame, then show its dimensions and a preview
df = pd.read_csv(filepath_or_buffer=RAW_PATH)

print(df.shape)
df.head(5)


(7043, 56)


Unnamed: 0,demo_count,demo_gender,demo_age,demo_under_30,demo_senior_citizen,demo_married,demo_dependents,demo_number_of_dependents,loc_count,loc_country,...,st_quarter,st_satisfaction_score,st_customer_status,st_churn_label,st_churn_value,st_churn_score,st_cltv,st_churn_category,st_churn_reason,zipcode_population
0,1,Male,78,No,Yes,No,No,0,1,United States,...,Q3,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data,68701
1,1,Female,74,No,Yes,Yes,Yes,1,1,United States,...,Q3,3,Churned,Yes,1,69,5302,Competitor,Competitor made better offer,55668
2,1,Male,71,No,Yes,No,Yes,3,1,United States,...,Q3,2,Churned,Yes,1,81,3179,Competitor,Competitor made better offer,47534
3,1,Female,78,No,Yes,Yes,Yes,1,1,United States,...,Q3,2,Churned,Yes,1,88,5337,Dissatisfaction,Limited range of services,27778
4,1,Female,80,No,Yes,Yes,Yes,1,1,United States,...,Q3,2,Churned,Yes,1,67,2793,Price,Extra data charges,26265


In [2]:
# Columns judged unnecessary according to the EDA conclusions
cols_to_drop = [
    "demo_count",
    "loc_count",
    "svc_count",
    "st_count",
    "loc_country",
    "loc_state",
    "svc_quarter",
    "st_quarter",
    "st_churn_label",
    "st_churn_score",
    "loc_city",
    "loc_lat_long",
    "loc_latitude",
    "loc_longitude",
    "svc_total_charges",
    "svc_total_long_distance_charges",
    "svc_monthly_charge",
]

# Defensive: restrict the list to names that are actually present in the
# frame (order of cols_to_drop is preserved), so a re-run does not fail.
existing_to_drop = list(filter(lambda name: name in df.columns, cols_to_drop))

df = df.drop(existing_to_drop, axis="columns")

print("Dropped columns:", existing_to_drop)
print("New shape:", df.shape)


Dropped columns: ['demo_count', 'loc_count', 'svc_count', 'st_count', 'loc_country', 'loc_state', 'svc_quarter', 'st_quarter', 'st_churn_label', 'st_churn_score', 'loc_city', 'loc_lat_long', 'loc_latitude', 'loc_longitude', 'svc_total_charges', 'svc_total_long_distance_charges', 'svc_monthly_charge']
New shape: (7043, 39)


In [3]:
# Handle meaningful missing values in categoricals.
# Missing values in these columns are replaced by an explicit category label
# instead of being left as NaN. Only columns present in the frame are filled.
fill_map = {
    col: label
    for col, label in {
        "svc_offer": "No Offer",
        "svc_internet_type": "No Internet",
        "st_churn_category": "Not Churned",
        "st_churn_reason": "Not Churned",
    }.items()
    if col in df.columns
}

df = df.fillna(value=fill_map)

# Quick check: preview exactly the columns that were filled. Indexing by
# fill_map keeps this consistent with the defensive filter above, so a
# missing column cannot raise KeyError here.
df[list(fill_map)].head()


Unnamed: 0,svc_offer,svc_internet_type,st_churn_category,st_churn_reason
0,No Offer,DSL,Competitor,Competitor offered more data
1,Offer E,Fiber Optic,Competitor,Competitor made better offer
2,Offer D,Fiber Optic,Competitor,Competitor made better offer
3,Offer C,Fiber Optic,Dissatisfaction,Limited range of services
4,Offer C,Fiber Optic,Price,Extra data charges


In [5]:
# Ensure Yes/No consistency AND convert binary fields to 0/1

binary_cols = [
    "demo_under_30",
    "demo_senior_citizen",
    "demo_married",
    "demo_dependents",
    "svc_referred_a_friend",
    "svc_phone_service",
    "svc_multiple_lines",
    "svc_internet_service",
    "svc_online_security",
    "svc_online_backup",
    "svc_device_protection_plan",
    "svc_premium_tech_support",
    "svc_streaming_tv",
    "svc_streaming_movies",
    "svc_streaming_music",
    "svc_unlimited_data",
    "svc_paperless_billing",
]

for col in binary_cols:
    if col not in df.columns:
        continue
    # Already numeric (for example, this cell has been run before): leave it
    # untouched. Re-encoding would stringify 0/1 to "0"/"1", which the Yes/No
    # map below does not contain, and turn the whole column into NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Standardize spelling first (e.g. " yes" -> "Yes"), then map Yes/No -> 1/0.
    # Any value other than Yes/No (including missing values) becomes NaN.
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()
        .str.title()
        .map({"Yes": 1, "No": 0})
    )

# Quick check
print("Binary columns converted to 0/1:")
for col in binary_cols:
    if col in df.columns:
        print(col, df[col].unique())



Binary columns converted to 0/1:
demo_under_30 [0 1]
demo_senior_citizen [1 0]
demo_married [0 1]
demo_dependents [0 1]
svc_referred_a_friend [0 1]
svc_phone_service [0 1]
svc_multiple_lines [0 1]
svc_internet_service [1 0]
svc_online_security [0 1]
svc_online_backup [0 1]
svc_device_protection_plan [1 0]
svc_premium_tech_support [0 1]
svc_streaming_tv [0 1]
svc_streaming_movies [1 0]
svc_streaming_music [0 1]
svc_unlimited_data [0 1]
svc_paperless_billing [1 0]


In [6]:
# Convert demo_number_of_dependents into a binary feature:
# 1 when the count is greater than zero, 0 otherwise (missing counts give 0).
# NOTE(review): `demo_dependents` looks like it may carry the same
# information — confirm whether both columns are needed.

if "demo_number_of_dependents" in df.columns:
    # Vectorised comparison instead of a row-by-row Python lambda
    df["demo_has_dependents"] = df["demo_number_of_dependents"].gt(0).astype("int64")

    # Drop the original count column now that the indicator exists
    df = df.drop(columns=["demo_number_of_dependents"])

# Quick check
print("demo_has_dependents unique values:", df["demo_has_dependents"].unique())


demo_has_dependents unique values: [0 1]


In [None]:
# Put the parent of the working directory on sys.path so that packages living
# one level above the notebook can be imported (presumably `src` — confirm).
# `os` and `sys` are imported locally because no earlier cell imports them.
import os
import sys

if os.path.abspath("..") not in sys.path:  # avoid duplicate entries on re-run
    sys.path.append(os.path.abspath(".."))


In [None]:
# === 1. Imports ===
import os
import pandas as pd
import numpy as np
from src.utils_data import load_df, save_df

# === 2. Settings and paths ===
DATA_DIR = os.path.join("..", "data")
# Both files are located in the "interim" sub-folder of DATA_DIR
RAW_MERGED, CLEANED = (
    os.path.join(DATA_DIR, "interim", file_name)
    for file_name in ("raw_merged.csv", "cleaned_df.csv")
)

# === 3. Load data ===
# NOTE(review): this rebinds `df`; when the notebook runs top to bottom, the
# frame prepared by the earlier cells is replaced by the contents of
# RAW_MERGED — confirm this cell is still meant to be part of the pipeline.
df = pd.read_csv(filepath_or_buffer=RAW_MERGED)
print(f"Shape inicial: {df.shape}")
df.head(5)


In [None]:
# Normalize column names: trim surrounding whitespace, lowercase, remove spaces
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "", regex=False)

# Remove irrelevant or duplicated columns (only those that are present)
drop_cols = ["count"]  # arrived repeated from the merges
df = df.drop(columns=drop_cols, errors="ignore")

# Convert numeric columns; values that cannot be parsed become NaN
num_cols = ["totalcharges", "monthlycharge", "tenureinmonths", "churnscore", "cltv"]
for col in num_cols:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Fix nulls: explicit labels for the listed categoricals first, then 0 for
# every value that is still missing.
# NOTE(review): the final fillna(0) applies to all columns, including
# non-numeric ones — confirm that is intended.
df = df.fillna(
    {
        "offer": "None",
        "internettype": "Unknown",
        "churncategory": "Unknown",
        "churnreason": "Unknown",
    }
).fillna(0)


In [7]:
# Sanity check after cleaning: frame dimensions, then the ten columns with
# the highest number of missing values
print("Shape after cleaning:", df.shape)
print("\nMissing values (top 10):")
print(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
)


Shape after cleaning: (7043, 39)

Missing values (top 10):
demo_gender                0
demo_age                   0
demo_under_30              0
demo_senior_citizen        0
demo_married               0
demo_dependents            0
loc_zip_code               0
svc_referred_a_friend      0
svc_number_of_referrals    0
svc_tenure_in_months       0
dtype: int64


In [10]:
# Write the cleaned frame to the interim data folder (row index not written)
CLEAN_PATH = "../data/interim/telco_master_clean.csv"

df.to_csv(path_or_buf=CLEAN_PATH, index=False)

print(f"Cleaned dataset saved to: {CLEAN_PATH}")


Cleaned dataset saved to: ../data/interim/telco_master_clean.csv


In [9]:
# Data Type Summary + Small-Cardinality Category Check

# Step: Summarize data types (one row per column, ordered by dtype)
dtype_summary = (
    df.dtypes
    .reset_index()
    .rename(columns={'index': 'column', 0: 'dtype'})
    .sort_values('dtype')
)

print("=== Data Type Summary ===")
display(dtype_summary)


# Step: Identify categorical columns AFTER cleaning
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

print("\n=== Categorical Columns ===")
print(cat_cols)


# Step: Show the distinct values of low-cardinality categorical columns.
# The threshold is inclusive; the heading is built from the same constant so
# the printed label always matches the filter actually applied.
MAX_UNIQUE = 5

print(f"\n=== Categorical Variables with <={MAX_UNIQUE} Unique Values ===")
for col in cat_cols:
    unique_vals = df[col].unique()
    if len(unique_vals) <= MAX_UNIQUE:
        print(f"\nColumn: {col}  (unique={len(unique_vals)})")
        print(unique_vals)


=== Data Type Summary ===


Unnamed: 0,column,dtype
19,svc_device_protection_plan,int64
34,st_cltv,int64
33,st_churn_value,int64
31,st_satisfaction_score,int64
29,svc_total_extra_data_charges,int64
26,svc_paperless_billing,int64
24,svc_unlimited_data,int64
23,svc_streaming_music,int64
22,svc_streaming_movies,int64
21,svc_streaming_tv,int64



=== Categorical Columns ===
['demo_gender', 'svc_offer', 'svc_internet_type', 'svc_contract', 'svc_payment_method', 'st_customer_status', 'st_churn_category', 'st_churn_reason']

=== Categorical Variables with <5 Unique Values ===

Column: demo_gender  (unique=2)
['Male' 'Female']

Column: svc_internet_type  (unique=4)
['DSL' 'Fiber Optic' 'Cable' 'No Internet']

Column: svc_contract  (unique=3)
['Month-to-Month' 'One Year' 'Two Year']

Column: svc_payment_method  (unique=3)
['Bank Withdrawal' 'Credit Card' 'Mailed Check']

Column: st_customer_status  (unique=3)
['Churned' 'Stayed' 'Joined']


# 1: Dataset Understanding

# 2: Descriptive Statistics

# 3: Outliers Detection

# 4: Correlations

# 5: Visualization of variables and relations

<!-- MODELO: Secção principal numerada -->
<!-- 
<h2 style="background-color:#1f77b4; color:white; padding:10px; border-radius:6px;">
    X. Nome da Secção
</h2>
-->

<!-- MODELO: Secção com linha colorida -->
<!-- 
<h2 style="color:#ff7f0e; border-bottom: 3px solid #ff7f0e; padding-bottom:4px;">
    X. Nome da Secção
</h2>
-->

<!-- MODELO: Subsecção -->
<!-- 
<h3 style="color:#2ca02c; margin-top:10px;">
    X.Y Nome da Subsecção
</h3>
-->

<!-- MODELO: Caixa de Nota -->
<!-- 
<div style="border-left: 5px solid #1f77b4; padding:10px; background-color:#f5f9ff; margin:15px 0;">
    <b>Nota:</b> Texto da nota.
</div>
-->

<!-- MODELO: Secção principal numerada -->
<!-- 
<h2 style="background-color:#1f77b4; color:white; padding:10px; border-radius:6px;">
    X. Nome da Secção
</h2>
-->

<!-- MODELO: Secção com linha colorida -->
<!-- 
<h2 style="color:#ff7f0e; border-bottom: 3px solid #ff7f0e; padding-bottom:4px;">
    X. Nome da Secção
</h2>
-->

<!-- MODELO: Subsecção -->
<!-- 
<h3 style="color:#2ca02c; margin-top:10px;">
    X.Y Nome da Subsecção
</h3>
-->

<!-- MODELO: Caixa de Nota -->
<!-- 
<div style="border-left: 5px solid #1f77b4; padding:10px; background-color:#f5f9ff; margin:15px 0;">
    <b>Nota:</b> Texto da nota.
</div>
-->