# =====================================
# 00. Data Loading
# =====================================

PURPOSE:
Load all raw datasets and check their basic structure.

1. --- Imports ---

In [1]:
import sys
from pathlib import Path
import pandas as pd

2. --- Path Configuration ---

In [2]:
# Project root, taken as the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a first-level
# sub-folder of the repository (e.g. notebooks/) — confirm.
ROOT = Path.cwd().resolve().parent

# Expose the project's src/ folder to the interpreter. The membership check
# keeps this cell idempotent: re-running it no longer stacks duplicate
# entries on sys.path.
src_path = str(ROOT / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# Project-local imports; they can only resolve once src/ is on sys.path.
from config import RAW_DIR
from utils_data import quick_overview  # optional helper

3. --- Load Data ---

In [3]:
# (table name, workbook file name) pairs, listed in loading order.
raw_workbooks = [
    ("demographics", "Telco_customer_churn_demographics.xlsx"),
    ("location", "Telco_customer_churn_location.xlsx"),
    ("population", "Telco_customer_churn_population.xlsx"),
    ("services", "Telco_customer_churn_services.xlsx"),
    ("status", "Telco_customer_churn_status.xlsx"),
]

# Path of every raw workbook inside RAW_DIR, keyed by table name.
files = {table: RAW_DIR / workbook for table, workbook in raw_workbooks}

# Read each workbook into a DataFrame and report its dimensions as we go.
dfs = {}
for name, path in files.items():
    frame = pd.read_excel(path)
    dfs[name] = frame
    n_rows, n_cols = frame.shape
    print(f"Loaded {name:12s}: {n_rows} rows × {n_cols} cols")


Loaded demographics: 7043 rows × 9 cols
Loaded location    : 7043 rows × 9 cols
Loaded population  : 1671 rows × 3 cols
Loaded services    : 7043 rows × 30 cols
Loaded status      : 7043 rows × 11 cols


4. --- Quick Overview ---

In [4]:
# Run the project's quick_overview helper on every loaded table.
# NOTE(review): quick_overview lives in utils_data; judging by the recorded
# output it prints the shape, dtypes, missing counts and first rows — confirm
# against the helper itself.
for name, df in dfs.items():
    title = f"{name} dataset"
    quick_overview(df, title)


===== demographics dataset =====
Shape: 7043 rows × 9 columns

Data types:
Customer ID             object
Count                    int64
Gender                  object
Age                      int64
Under 30                object
Senior Citizen          object
Married                 object
Dependents              object
Number of Dependents     int64
dtype: object

Missing values per column:
Customer ID             0
Count                   0
Gender                  0
Age                     0
Under 30                0
Senior Citizen          0
Married                 0
Dependents              0
Number of Dependents    0
dtype: int64

First 5 rows:


Unnamed: 0,Customer ID,Count,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents
0,8779-QRDMV,1,Male,78,No,Yes,No,No,0
1,7495-OOKFY,1,Female,74,No,Yes,Yes,Yes,1
2,1658-BYGOY,1,Male,71,No,Yes,No,Yes,3
3,4598-XLKNJ,1,Female,78,No,Yes,Yes,Yes,1
4,4846-WHAFZ,1,Female,80,No,Yes,Yes,Yes,1



===== location dataset =====
Shape: 7043 rows × 9 columns

Data types:
Customer ID     object
Count            int64
Country         object
State           object
City            object
Zip Code         int64
Lat Long        object
Latitude       float64
Longitude      float64
dtype: object

Missing values per column:
Customer ID    0
Count          0
Country        0
State          0
City           0
Zip Code       0
Lat Long       0
Latitude       0
Longitude      0
dtype: int64

First 5 rows:


Unnamed: 0,Customer ID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude
0,8779-QRDMV,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582
1,7495-OOKFY,1,United States,California,Los Angeles,90063,"34.044271, -118.185237",34.044271,-118.185237
2,1658-BYGOY,1,United States,California,Los Angeles,90065,"34.108833, -118.229715",34.108833,-118.229715
3,4598-XLKNJ,1,United States,California,Inglewood,90303,"33.936291, -118.332639",33.936291,-118.332639
4,4846-WHAFZ,1,United States,California,Whittier,90602,"33.972119, -118.020188",33.972119,-118.020188



===== population dataset =====
Shape: 1671 rows × 3 columns

Data types:
ID            int64
Zip Code      int64
Population    int64
dtype: object

Missing values per column:
ID            0
Zip Code      0
Population    0
dtype: int64

First 5 rows:


Unnamed: 0,ID,Zip Code,Population
0,1,90001,54492
1,2,90002,44586
2,3,90003,58198
3,4,90004,67852
4,5,90005,43019



===== services dataset =====
Shape: 7043 rows × 30 columns

Data types:
Customer ID                           object
Count                                  int64
Quarter                               object
Referred a Friend                     object
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                 object
Phone Service                         object
Avg Monthly Long Distance Charges    float64
Multiple Lines                        object
Internet Service                      object
Internet Type                         object
Avg Monthly GB Download                int64
Online Security                       object
Online Backup                         object
Device Protection Plan                object
Premium Tech Support                  object
Streaming TV                          object
Streaming Movies                      object
Streaming Music                       object
Unlimited Data             

Unnamed: 0,Customer ID,Count,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue
0,8779-QRDMV,1,Q3,No,0,1,,No,0.0,No,...,No,Month-to-Month,Yes,Bank Withdrawal,39.65,39.65,0.0,20,0.0,59.65
1,7495-OOKFY,1,Q3,Yes,1,8,Offer E,Yes,48.85,Yes,...,Yes,Month-to-Month,Yes,Credit Card,80.65,633.3,0.0,0,390.8,1024.1
2,1658-BYGOY,1,Q3,No,0,18,Offer D,Yes,11.33,Yes,...,Yes,Month-to-Month,Yes,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88
3,4598-XLKNJ,1,Q3,Yes,1,25,Offer C,Yes,19.76,No,...,Yes,Month-to-Month,Yes,Bank Withdrawal,98.5,2514.5,13.43,0,494.0,2995.07
4,4846-WHAFZ,1,Q3,Yes,1,37,Offer C,Yes,6.33,Yes,...,Yes,Month-to-Month,Yes,Bank Withdrawal,76.5,2868.15,0.0,0,234.21,3102.36



===== status dataset =====
Shape: 7043 rows × 11 columns

Data types:
Customer ID           object
Count                  int64
Quarter               object
Satisfaction Score     int64
Customer Status       object
Churn Label           object
Churn Value            int64
Churn Score            int64
CLTV                   int64
Churn Category        object
Churn Reason          object
dtype: object

Missing values per column:
Churn Category        5174
Churn Reason          5174
Customer ID              0
Count                    0
Quarter                  0
Satisfaction Score       0
Customer Status          0
Churn Label              0
Churn Value              0
Churn Score              0
CLTV                     0
dtype: int64

First 5 rows:


Unnamed: 0,Customer ID,Count,Quarter,Satisfaction Score,Customer Status,Churn Label,Churn Value,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,1,Q3,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,1,Q3,3,Churned,Yes,1,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,1,Q3,2,Churned,Yes,1,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,1,Q3,2,Churned,Yes,1,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,1,Q3,2,Churned,Yes,1,67,2793,Price,Extra data charges


In [6]:
# Back the "validated" claim with an actual check: every raw table must
# contain data before the notebook reports success.
empty_tables = [table for table, frame in dfs.items() if frame.empty]
assert not empty_tables, f"Raw tables loaded without any data: {empty_tables}"

print("✅ All raw Telco datasets loaded and validated. Ready for data cleaning and merging.")


✅ All raw Telco datasets loaded and validated. Ready for data cleaning and merging.


# **COMMENT (Diana): The following steps should be moved to the data-cleaning notebook to keep the project structure organized. This notebook should be used for data loading only.**

4. EDA per Table (Types, nulls, min/max, samples)

In [101]:
# Per-table EDA: structure, missing values, per-column summary and a sample.
for name, df in dfs.items():
    print(f"\n{'='*30} {name.upper()} {'='*30}")
    print(f"Shape: {df.shape[0]} lines × {df.shape[1]} columns")
    print("\n Data types and general summary:")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() only rendered a stray "None".
    df.info()

    # Missing-value count and percentage, shown only for columns with gaps.
    print("\nMissing values:")
    n_missing = df.isna().sum()
    missing = n_missing.to_frame("n_missing")
    missing["pct_missing"] = (missing["n_missing"] / len(df) * 100).round(2)
    display(missing[missing["n_missing"] > 0].sort_values("pct_missing", ascending=False))

    # Dtype, number of distinct values and number of missing values per column.
    print("\nColumn summary:")
    summary = pd.DataFrame({
        "dtype": df.dtypes,
        "n_unique": df.nunique(),
        "n_missing": n_missing,
    })

    # Numeric columns additionally get their min and max; the other columns
    # are left as NaN. The "number" selector is used instead of np.number
    # because numpy is not among the notebook's imports, so the old form
    # raised NameError on a fresh kernel.
    numeric_cols = df.select_dtypes(include="number").columns
    summary.loc[numeric_cols, "min"] = df[numeric_cols].min()
    summary.loc[numeric_cols, "max"] = df[numeric_cols].max()

    display(summary)

    # Quick look at the first few rows.
    print("\nSample:")
    display(df.head(3))

    print("-" * 80)





Shape: 7043 lines × 9 columns

 Data types and general summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Customer ID           7043 non-null   object
 1   Count                 7043 non-null   int64 
 2   Gender                7043 non-null   object
 3   Age                   7043 non-null   int64 
 4   Under 30              7043 non-null   object
 5   Senior Citizen        7043 non-null   object
 6   Married               7043 non-null   object
 7   Dependents            7043 non-null   object
 8   Number of Dependents  7043 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 495.3+ KB


None


Missing values:


Unnamed: 0,n_missing,pct_missing



Column summary:


Unnamed: 0,dtype,n_unique,n_missing,min,max
Customer ID,object,7043,0,,
Count,int64,1,0,1.0,1.0
Gender,object,2,0,,
Age,int64,62,0,19.0,80.0
Under 30,object,2,0,,
Senior Citizen,object,2,0,,
Married,object,2,0,,
Dependents,object,2,0,,
Number of Dependents,int64,10,0,0.0,9.0



Sample:


Unnamed: 0,Customer ID,Count,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents
0,8779-QRDMV,1,Male,78,No,Yes,No,No,0
1,7495-OOKFY,1,Female,74,No,Yes,Yes,Yes,1
2,1658-BYGOY,1,Male,71,No,Yes,No,Yes,3


--------------------------------------------------------------------------------

Shape: 7043 lines × 9 columns

 Data types and general summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Customer ID  7043 non-null   object 
 1   Count        7043 non-null   int64  
 2   Country      7043 non-null   object 
 3   State        7043 non-null   object 
 4   City         7043 non-null   object 
 5   Zip Code     7043 non-null   int64  
 6   Lat Long     7043 non-null   object 
 7   Latitude     7043 non-null   float64
 8   Longitude    7043 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 495.3+ KB


None


Missing values:


Unnamed: 0,n_missing,pct_missing



Column summary:


Unnamed: 0,dtype,n_unique,n_missing,min,max
Customer ID,object,7043,0,,
Count,int64,1,0,1.0,1.0
Country,object,1,0,,
State,object,1,0,,
City,object,1106,0,,
Zip Code,int64,1626,0,90001.0,96150.0
Lat Long,object,1679,0,,
Latitude,float64,1626,0,32.555828,41.962127
Longitude,float64,1625,0,-124.301372,-114.192901



Sample:


Unnamed: 0,Customer ID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude
0,8779-QRDMV,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.02381,-118.156582
1,7495-OOKFY,1,United States,California,Los Angeles,90063,"34.044271, -118.185237",34.044271,-118.185237
2,1658-BYGOY,1,United States,California,Los Angeles,90065,"34.108833, -118.229715",34.108833,-118.229715


--------------------------------------------------------------------------------

Shape: 1671 lines × 3 columns

 Data types and general summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1671 entries, 0 to 1670
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   ID          1671 non-null   int64
 1   Zip Code    1671 non-null   int64
 2   Population  1671 non-null   int64
dtypes: int64(3)
memory usage: 39.3 KB


None


Missing values:


Unnamed: 0,n_missing,pct_missing



Column summary:


Unnamed: 0,dtype,n_unique,n_missing,min,max
ID,int64,1671,0,1.0,1671.0
Zip Code,int64,1671,0,90001.0,96161.0
Population,int64,1607,0,11.0,105285.0



Sample:


Unnamed: 0,ID,Zip Code,Population
0,1,90001,54492
1,2,90002,44586
2,3,90003,58198


--------------------------------------------------------------------------------

Shape: 7043 lines × 30 columns

 Data types and general summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 30 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Count                              7043 non-null   int64  
 2   Quarter                            7043 non-null   object 
 3   Referred a Friend                  7043 non-null   object 
 4   Number of Referrals                7043 non-null   int64  
 5   Tenure in Months                   7043 non-null   int64  
 6   Offer                              3166 non-null   object 
 7   Phone Service                      7043 non-null   object 
 8   Avg Monthly Long Distance Charges  7043 non-null   float64
 9   Multiple Lines                     70

None


Missing values:


Unnamed: 0,n_missing,pct_missing
Offer,3877,55.05
Internet Type,1526,21.67



Column summary:


Unnamed: 0,dtype,n_unique,n_missing,min,max
Customer ID,object,7043,0,,
Count,int64,1,0,1.0,1.0
Quarter,object,1,0,,
Referred a Friend,object,2,0,,
Number of Referrals,int64,12,0,0.0,11.0
Tenure in Months,int64,72,0,1.0,72.0
Offer,object,5,3877,,
Phone Service,object,2,0,,
Avg Monthly Long Distance Charges,float64,3584,0,0.0,49.99
Multiple Lines,object,2,0,,



Sample:


Unnamed: 0,Customer ID,Count,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue
0,8779-QRDMV,1,Q3,No,0,1,,No,0.0,No,...,No,Month-to-Month,Yes,Bank Withdrawal,39.65,39.65,0.0,20,0.0,59.65
1,7495-OOKFY,1,Q3,Yes,1,8,Offer E,Yes,48.85,Yes,...,Yes,Month-to-Month,Yes,Credit Card,80.65,633.3,0.0,0,390.8,1024.1
2,1658-BYGOY,1,Q3,No,0,18,Offer D,Yes,11.33,Yes,...,Yes,Month-to-Month,Yes,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88


--------------------------------------------------------------------------------

Shape: 7043 lines × 11 columns

 Data types and general summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Customer ID         7043 non-null   object
 1   Count               7043 non-null   int64 
 2   Quarter             7043 non-null   object
 3   Satisfaction Score  7043 non-null   int64 
 4   Customer Status     7043 non-null   object
 5   Churn Label         7043 non-null   object
 6   Churn Value         7043 non-null   int64 
 7   Churn Score         7043 non-null   int64 
 8   CLTV                7043 non-null   int64 
 9   Churn Category      1869 non-null   object
 10  Churn Reason        1869 non-null   object
dtypes: int64(5), object(6)
memory usage: 605.4+ KB


None


Missing values:


Unnamed: 0,n_missing,pct_missing
Churn Category,5174,73.46
Churn Reason,5174,73.46



Column summary:


Unnamed: 0,dtype,n_unique,n_missing,min,max
Customer ID,object,7043,0,,
Count,int64,1,0,1.0,1.0
Quarter,object,1,0,,
Satisfaction Score,int64,5,0,1.0,5.0
Customer Status,object,3,0,,
Churn Label,object,2,0,,
Churn Value,int64,2,0,0.0,1.0
Churn Score,int64,81,0,5.0,96.0
CLTV,int64,3438,0,2003.0,6500.0
Churn Category,object,5,5174,,



Sample:


Unnamed: 0,Customer ID,Count,Quarter,Satisfaction Score,Customer Status,Churn Label,Churn Value,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,1,Q3,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,1,Q3,3,Churned,Yes,1,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,1,Q3,2,Churned,Yes,1,81,3179,Competitor,Competitor made better offer


--------------------------------------------------------------------------------


Demographics
7043 rows and 9 columns
No missing values
Age: 19 to 80 years
Number of dependents: 0 to 9
Object (Yes/No): Under 30, Senior Citizen, Married, Dependents
Object (Female/Male): Gender

Location
7043 rows and 9 columns
No missing values
Country is always United States
State is always California
City has 1106 distinct values (?) <- TODO: check whether the same city appears under different spellings

Population
1671 rows × 3 columns
No missing values
ID does not appear to be linked to the IDs in the other tables
It holds the population per zip code.

Services
7043 rows × 30 columns
Missing values in Offer (3877, 55.05%) and Internet Type (1526, 21.67%)

Status
7043 rows × 11 columns
Missing values: Churn Category and Churn Reason (5174 each, 73.46%)


5. Global summary by Table

In [102]:
print("\nGlobal summary by table:")


def _summarise_table(name, df):
    """Build one summary row for a single table.

    The row lists, in order: table name, row count, column count, total
    number of missing cells, percentage of missing cells (rounded to two
    decimals), number of numeric columns and number of non-numeric columns.
    """
    n_rows, n_cols = df.shape
    total_missing = df.isna().sum().sum()
    pct_missing = (total_missing / (n_rows * n_cols) * 100).round(2)
    n_numeric = df.select_dtypes(include="number").shape[1]
    n_categorical = df.select_dtypes(exclude="number").shape[1]
    return [name, n_rows, n_cols, total_missing, pct_missing, n_numeric, n_categorical]


# Column labels, in the same order as the rows built by the helper above.
summary_columns = [
    "table", "n_rows", "n_cols",
    "total_missing", "pct_missing",
    "num_numeric_cols", "num_categorical_cols",
]

# One row per loaded table.
global_summary = [_summarise_table(name, df) for name, df in dfs.items()]

# Tables with the highest share of missing cells come first.
global_summary_df = pd.DataFrame(global_summary, columns=summary_columns)
global_summary_df = global_summary_df.sort_values("pct_missing", ascending=False)

display(global_summary_df)



Global summary by table:


Unnamed: 0,table,n_rows,n_cols,total_missing,pct_missing,num_numeric_cols,num_categorical_cols
4,status,7043,11,10348,13.36,5,6
3,services,7043,30,5403,2.56,11,19
0,demographics,7043,9,0,0.0,3,6
1,location,7043,9,0,0.0,4,5
2,population,1671,3,0,0.0,3,0
