Objetivo do Notebook 01
- Unificar as tabelas (se necessário) e padronizar chaves (CustomerID).
- Limpar: tipos, duplicates, missing values, strings.
- Preparar features: encoding, scaling e guardar o dataset limpo + metadados.
- Validação rápida com um baseline (LogReg) para garantir que o pipeline funciona.

In [1]:
import os
import sys

# Put the parent folder on the import path so that the `src` package
# (imported further down as `src.utils_data`) can be found from this notebook.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending the same entry to sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [4]:
# === 1. Imports ===
import os
import pandas as pd
import numpy as np
from src.utils_data import load_df, save_df

# === 2. Settings and paths ===
# Relative locations: they resolve against the kernel's working directory.
DATA_DIR = os.path.join("..", "data")
# Both CSVs live side by side in the "interim" sub-folder of DATA_DIR.
RAW_MERGED, CLEANED = (
    os.path.join(DATA_DIR, "interim", csv_name)
    for csv_name in ("raw_merged.csv", "cleaned_df.csv")
)

# === 3. Load data ===
# Read the merged raw table and report its dimensions.
df = pd.read_csv(filepath_or_buffer=RAW_MERGED)
print(f"Shape inicial: {df.shape}")
# Trailing expression: the notebook renders the first five rows as the cell output.
df.head(n=5)


Shape inicial: (7043, 51)


Unnamed: 0,customerid,count,gender,age,under30,seniorcitizen,married,dependents,numberofdependents,country,...,totallongdistancecharges,totalrevenue,satisfactionscore,customerstatus,churnlabel,churnvalue,churnscore,cltv,churncategory,churnreason
0,8779-QRDMV,1,Male,78,No,Yes,No,No,0,United States,...,0.0,59.65,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,1,Female,74,No,Yes,Yes,Yes,1,United States,...,390.8,1024.1,3,Churned,Yes,1,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,1,Male,71,No,Yes,No,Yes,3,United States,...,203.94,1910.88,2,Churned,Yes,1,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,1,Female,78,No,Yes,Yes,Yes,1,United States,...,494.0,2995.07,2,Churned,Yes,1,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,1,Female,80,No,Yes,Yes,Yes,1,United States,...,234.21,3102.36,2,Churned,Yes,1,67,2793,Price,Extra data charges


In [5]:
# Normalise column names: trim surrounding whitespace, lower-case, remove inner spaces
df.columns = (
    df.columns.str.strip()
              .str.lower()
              .str.replace(" ", "", regex=False)
)

# Drop irrelevant or duplicated columns (only those actually present).
# - "count": came in repeated from the merges
# - "unnamed:0": the header-less first column of the CSV ("Unnamed: 0" before
#   the renaming above); it looks like a saved row index, not a feature
drop_cols = ["count", "unnamed:0"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Force the known numeric columns to a numeric dtype; values that cannot be
# parsed become NaN and are filled further down
num_cols = ["totalcharges", "monthlycharge", "tenureinmonths", "churnscore", "cltv"]
for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Fill missing values.
# 1) Categorical columns that have a dedicated placeholder label.
# NOTE(review): recent pandas versions list "None" among read_csv's default NA
# markers, so the "offer" placeholder may come back as NaN when the saved CSV
# is reloaded — confirm, and reload with keep_default_na=False (or choose a
# different label) if that is the case.
df = df.fillna({
    "offer": "None",
    "internettype": "Unknown",
    "churncategory": "Unknown",
    "churnreason": "Unknown"
})
# 2) Remaining gaps in numeric columns become 0.
numeric_names = df.select_dtypes(include="number").columns
df = df.fillna({name: 0 for name in numeric_names})
# 3) Remaining gaps in non-numeric columns get the same "Unknown" label used
#    above, instead of an integer 0 that would leave text columns with mixed
#    str/int values.
df = df.fillna("Unknown")


In [6]:
# Persist the cleaned frame through the project helper imported from src.utils_data.
# NOTE(review): judging by the recorded output, save_df writes
# data/<folder>/<name>.csv and prints the absolute path itself — confirm against
# the helper; if so, the print below repeats that message with a hard-coded path.
save_df(df, "cleaned_df", folder="interim")
print("✅ Guardado dataset limpo: data/interim/cleaned_df.csv")


✅ Guardado: C:\Users\sarac\rep_EDSB\EnterpriseDataScienceBootcamp_workgroup\data\interim\cleaned_df.csv
✅ Guardado dataset limpo: data/interim/cleaned_df.csv


In [7]:
import os, json, re
import numpy as np
import pandas as pd

In [8]:
# File locations, all built directly under DATA_DIR.
DATA_DIR = os.path.join('..', 'data')
RAW_MERGED_CSV = os.path.join(DATA_DIR, 'Telco_customer_churn_merged.csv')
CLEAN_CSV = os.path.join(DATA_DIR, 'telco_churn_clean.csv')
FEATURES_META = os.path.join(DATA_DIR, 'features.json')

# NOTE(review): presumably selects between rebuilding the data from the Excel
# workbooks listed in FILES and reading RAW_MERGED_CSV — confirm in the cell
# that reads this flag.
USE_RAW_EXCELS = False

# Source workbook file name for each table, keyed by table name.
FILES = dict(
    demographics='Telco_customer_churn_demographics.xlsx',
    location='Telco_customer_churn_location.xlsx',
    population='Telco_customer_churn_population.xlsx',
    services='Telco_customer_churn_services.xlsx',
    status='Telco_customer_churn_status.xlsx',
)
