# =====================================
# 01. Data Cleaning & Merging
# =====================================

PURPOSE: Clean and merge the 5 raw Telco datasets into a single customer-level table (df_master) with consistent column names and a standard key (customer_id), ready for EDA and modeling.

### Dataset Summary
| Dataset | Shape (rows × columns) | Key | Description |
|---|---|---|---|
| **Demographics** | 7,043 × 9  | Customer ID | Demographic attributes (gender, age, marital status). |
| **Location**     | 7,043 × 9  | Customer ID | Geographic features including country, city, and ZIP code. |
| **Services**     | 7,043 × 30 | Customer ID | Subscribed telecom and streaming services. |
| **Status**       | 7,043 × 11 | Customer ID | Account details, tenure, churn label, and churn reason. |
| **Population**   | 1,671 × 3  | Zip Code    | ZIP-level population counts; joined later via `Zip Code` from the location table. |

**Merge logic:**
Customer-level tables (`demographics`, `location`, `services`, `status`) are merged one-to-one on `Customer ID`.  
`Population` is an auxiliary dataset used for enrichment through `Zip Code`.

## 1. Imports

In [1]:
import sys
from pathlib import Path
import pandas as pd

## 2. Path Configuration 

In [2]:
# Project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct sub-folder
# of the repository (e.g. notebooks/) — confirm if the layout changes.
ROOT = Path.cwd().resolve().parent

# Make the helper modules under <ROOT>/src importable. The membership test
# keeps sys.path free of duplicate entries when this cell is re-executed.
_src_dir = str(ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from config import RAW_DIR
from utils_data import save_df, quick_overview

## 3. Load raw tables

In [3]:
# Short table name -> raw Excel workbook it is loaded from (inside RAW_DIR).
FILES = {
    "demographics": "Telco_customer_churn_demographics.xlsx",
    "location":     "Telco_customer_churn_location.xlsx",
    "population":   "Telco_customer_churn_population.xlsx",
    "services":     "Telco_customer_churn_services.xlsx",
    "status":       "Telco_customer_churn_status.xlsx",
}

# Read every workbook into a DataFrame, keyed by the short table name.
dfs = {name: pd.read_excel(RAW_DIR / file) for name, file in FILES.items()}

# Report the dimensions of each table exactly as loaded (before any cleaning).
for name, df in dfs.items():
    print(f"{name:12s}: {df.shape[0]} rows × {df.shape[1]} columns")

# Expose each raw table under its own variable for the cells that follow.
demographics = dfs["demographics"]
location     = dfs["location"]
population   = dfs["population"]
services     = dfs["services"]
status       = dfs["status"]


demographics: 7043 rows × 9 columns
location    : 7043 rows × 9 columns
population  : 1671 rows × 3 columns
services    : 7043 rows × 30 columns
status      : 7043 rows × 11 columns


## 4. Normalise column names and key 

In [4]:
# Standard name of the customer identifier column used across all tables.
KEY = "customer_id"

def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with snake_case column labels and a standard key.

    Column labels are stripped of surrounding whitespace, every run of
    whitespace is replaced by a single underscore, and the result is
    lower-cased. A column whose cleaned label is a known spelling of the
    customer identifier is then renamed to ``KEY``. The input frame is left
    untouched.
    """
    cleaned = df.copy()

    # Clean the labels one step at a time: trim -> underscores -> lower-case.
    labels = cleaned.columns.str.strip()
    labels = labels.str.replace(r"\s+", "_", regex=True)
    cleaned.columns = labels.str.lower()

    # Collect the alternative key spellings that are actually present.
    key_aliases = {
        alias: KEY
        for alias in ("customerid", "customer_id", "customer id")
        if alias != KEY and alias in cleaned.columns
    }
    if key_aliases:
        cleaned = cleaned.rename(columns=key_aliases)
    return cleaned

# Apply the same column clean-up to all five tables in one pass.
demographics, location, population, services, status = (
    normalize(table)
    for table in (demographics, location, population, services, status)
)

# Only the customer-level tables are required to carry the key;
# `population` is deliberately left out of this check.
customer_level = [
    ("demographics", demographics),
    ("location",     location),
    ("services",     services),
    ("status",       status),
]
for name, df in customer_level:
    assert KEY in df.columns, f"{name} does not contain '{KEY}' after normalization."


## 5. Basic Cleaning and Consistency Checks 

In [5]:
# Customer-level tables that must hold exactly one row per customer.
# These are the *normalized* frames: the raw frames in `dfs` have not been
# through normalize(), so checking them for KEY would skip the clean-up.
customer_tables = {
    "demographics": demographics,
    "location":     location,
    "services":     services,
    "status":       status,
}

for name, df in customer_tables.items():
    # Check for missing keys first: the string cast below would turn NaN into
    # the literal text "nan" and hide the problem.
    assert df[KEY].notna().all(), f"{name}: missing {KEY}s found."

    # Handle whitespace or formatting issues before testing uniqueness, so
    # that IDs differing only by stray spaces are caught as duplicates.
    df[KEY] = df[KEY].astype(str).str.strip()

    # Ensure unique customer IDs (required by the one-to-one merges below).
    assert df[KEY].is_unique, f"{name}: duplicate {KEY}s found."


## 6. Prefix Columns and Merge Customer-Level Tables 

In [6]:
def add_prefix_except(df: pd.DataFrame, prefix: str, keep=(KEY,)) -> pd.DataFrame:
    """Return *df* with *prefix* prepended to every column label not in *keep*.

    Labels listed in *keep* (by default only the customer key) are left
    unchanged so they can still be used as join columns.
    """
    new_labels = {}
    for col in df.columns:
        new_labels[col] = col if col in keep else prefix + col
    return df.rename(columns=new_labels)

# Tag every non-key column with its source table so that column names shared
# between tables (e.g. `count`, `quarter`) do not collide in the merge.
demo_ = add_prefix_except(demographics, "demo_")
loc_  = add_prefix_except(location,     "loc_")
svc_  = add_prefix_except(services,     "svc_")
st_   = add_prefix_except(status,       "st_")

# Fold the remaining customer-level tables onto demographics one at a time.
# validate="one_to_one" makes pandas raise if either side repeats a key.
df_master = demo_
for right_table in (loc_, svc_, st_):
    df_master = df_master.merge(right_table, on=KEY, how="inner", validate="one_to_one")
df_master = df_master.set_index(KEY)

print(df_master.shape)
df_master.head()


(7043, 55)


Unnamed: 0_level_0,demo_count,demo_gender,demo_age,demo_under_30,demo_senior_citizen,demo_married,demo_dependents,demo_number_of_dependents,loc_count,loc_country,...,st_count,st_quarter,st_satisfaction_score,st_customer_status,st_churn_label,st_churn_value,st_churn_score,st_cltv,st_churn_category,st_churn_reason
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8779-QRDMV,1,Male,78,No,Yes,No,No,0,1,United States,...,1,Q3,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data
7495-OOKFY,1,Female,74,No,Yes,Yes,Yes,1,1,United States,...,1,Q3,3,Churned,Yes,1,69,5302,Competitor,Competitor made better offer
1658-BYGOY,1,Male,71,No,Yes,No,Yes,3,1,United States,...,1,Q3,2,Churned,Yes,1,81,3179,Competitor,Competitor made better offer
4598-XLKNJ,1,Female,78,No,Yes,Yes,Yes,1,1,United States,...,1,Q3,2,Churned,Yes,1,88,5337,Dissatisfaction,Limited range of services
4846-WHAFZ,1,Female,80,No,Yes,Yes,Yes,1,1,United States,...,1,Q3,2,Churned,Yes,1,67,2793,Price,Extra data charges


## 7. Enrich with ZIP-Level Population

In [7]:
# Attach the ZIP-level population to each customer via the customer's ZIP code.
# The block is skipped when the location table did not provide a ZIP column.
if "loc_zip_code" in df_master.columns:
    # Two-column lookup, renamed to match the prefixed master column names.
    zip_lookup = population.rename(columns={
        "zip_code": "loc_zip_code",
        "population": "zip_population"
    })[["loc_zip_code", "zip_population"]]

    df_master = (
        df_master.reset_index()  # the key must be a column to survive the merge
        .merge(
            zip_lookup,
            on="loc_zip_code",
            how="left",               # keep customers whose ZIP has no population row
            validate="many_to_one",   # raise if a ZIP repeats in the lookup (would duplicate customers)
        )
        .set_index(KEY)  # restore index
    )


## Data Cleaning Phase 1 – Structural & Integrity Checks
Verify the integrity of the merged dataset before any transformations:

### Verify dataset shape and unique IDs

In [8]:
# Quick structural report on the merged table.
master_index = df_master.index
print(f"Shape: {df_master.shape}")
print(f"Index name: {master_index.name}")

# The index carries the customer key: it must have no repeats and no gaps.
assert master_index.is_unique, "Duplicate customer_id detected."
assert master_index.notna().all(), "Missing customer_id in index."
print("customer_id index is unique and complete.")


Shape: (7043, 56)
Index name: customer_id
customer_id index is unique and complete.


### Remove redundant columns

In [9]:
# Columns treated as carrying no extra information: the per-table `count`
# columns (1 on every row shown in the merged preview) and `loc_lat_long`
# (presumably duplicating loc_latitude / loc_longitude — verify).
redundant_cols = ["loc_lat_long", "demo_count", "loc_count", "svc_count", "st_count"]

# Restrict to the columns still present so the cell can be re-run safely.
existing = [col for col in redundant_cols if col in df_master.columns]

if not existing:
    print("No redundant columns found.")
else:
    df_master.drop(columns=existing, inplace=True)
    print(f"Removed redundant columns: {existing}")

Removed redundant columns: ['loc_lat_long', 'demo_count', 'loc_count', 'svc_count', 'st_count']


### Remove constant columns

In [10]:
# A column is constant when it holds at most one distinct value.
# dropna=False counts NaN as a value of its own, so a column made of a single
# value plus missing entries is NOT reported as constant (and therefore not
# dropped by the next cell); an all-NaN column still is.
constant_cols = df_master.columns[df_master.nunique(dropna=False) <= 1].tolist()

if constant_cols:
    print(f"⚠️ Constant columns detected: {constant_cols}")
else:
    print("✅ No constant columns found.")

⚠️ Constant columns detected: ['loc_country', 'loc_state', 'svc_quarter', 'st_quarter']


In [11]:
# Constant columns cannot help separate churners from non-churners, so they
# are removed in place; nothing happens when the scan found none.
if len(constant_cols) > 0:
    df_master.drop(columns=constant_cols, inplace=True)

In [12]:
# Report the table dimensions after the structural clean-up.
print(f"✅ Phase 1 complete - new dataset shape: {df_master.shape}")

✅ Phase 1 complete - new dataset shape: (7043, 47)


## Data Cleaning Phase 2 – Data Type & Value Normalization
Make sure each column is in the correct format for EDA and modeling.

### Inspect current dtypes

In [13]:
# One row per column of df_master, giving its name and its dtype.
dtype_table = df_master.dtypes.reset_index()
dtype_table = dtype_table.rename(columns={'index': 'column', 0: 'dtype'})

# Order by dtype so columns of the same type are listed together.
dtype_summary = dtype_table.sort_values('dtype')
display(dtype_summary)

Unnamed: 0,column,dtype
46,zip_population,int64
20,svc_avg_monthly_gb_download,int64
35,svc_total_extra_data_charges,int64
38,st_satisfaction_score,int64
12,svc_number_of_referrals,int64
8,loc_zip_code,int64
13,svc_tenure_in_months,int64
41,st_churn_value,int64
42,st_churn_score,int64
43,st_cltv,int64


### Numeric Columns (19 total)
- zip_population
    - Population by ZIP code.
    - ✅ Numeric. Optional: scale later for modeling.
- svc_avg_monthly_gb_download
    - Internet data usage
    - ✅ Numeric
    - 💡 May contain outliers - check distribution.
- svc_total_extra_data_charges
    - Charges due to data overages
    - ✅ Numeric
    - 💡 Inspect for zeros - potential imbalance.
- svc_number_of_referrals
    - Number of friends referred
    - ✅ Numeric
    - 💡 Likely skewed - may be zero for most customers.
- loc_zip_code
    - ⚠️ ZIP codes are identifiers, not numeric - convert to str to avoid losing leading zeros.
- st_satisfaction_score
    - Customer satisfaction rating (1–5).
    - ✅ Numeric
    - 💡 Check correlation with churn
- demo_number_of_dependents
    - Number of dependents
    - ✅ Numeric
- svc_tenure_in_months
    - Time as customer
    - ✅ Numeric
    - 💡 Might be key churn driver - possible binning for interpretability
- st_churn_score
    - Internal churn risk (0–100)
    - ⚠️ Potential target leakage — may be derived from or highly correlated with the actual churn label
- st_cltv
    - Customer lifetime value
    - ✅ Numeric
    - 💡 Scale or log-transform if very skewed
- st_churn_value
    - Churn indicator (0,1)
    - ⚠️ Redundant with st_churn_label - keep only one.
- demo_age
    - ✅ Numeric
    - 💡 Scale or bin
- svc_avg_monthly_long_distance_charges
    - ✅ Numeric
- loc_latitude, loc_longitude
    - ✅ Numeric
    - 💡 Drop if not doing geospatial analysis; or keep for clustering/region mapping.
- svc_total_revenue, svc_total_long_distance_charges, svc_total_refunds, svc_total_charges, svc_monthly_charge
    - ✅ Numeric
    - Financial metrics
    - 💡 High correlation expected - may later perform feature selection to reduce redundancy.


#### Converting **loc_zip_code** to str

In [14]:
# ZIP codes are identifiers, not quantities: store them as text and left-pad
# with zeros to five characters so a leading 0 is not lost.
zip_as_text = df_master["loc_zip_code"].astype(str)
df_master["loc_zip_code"] = zip_as_text.str.zfill(5)
df_master["loc_zip_code"].dtype


dtype('O')

#### Inspecting **st_churn_score**

In [15]:
# Summary statistics of the churn score, computed separately per churn label.
score_by_label = df_master.groupby("st_churn_label")["st_churn_score"]
score_by_label.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
st_churn_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,5174.0,50.098183,17.702277,5.0,35.0,50.0,66.0,80.0
Yes,1869.0,81.777956,9.16687,65.0,74.0,83.0,90.0,96.0


In [16]:
# Drop st_churn_score: it is treated as target leakage because churned
# customers score far higher than retained ones (mean ≈ 82 vs 50 in the
# per-label summary).
df_master = df_master.drop(columns="st_churn_score")


#### Inspecting **st_churn_value**

In [17]:
# List the distinct values of the two churn indicators before comparing them.
for churn_col in ("st_churn_label", "st_churn_value"):
    print(f"Unique values in {churn_col}:", df_master[churn_col].unique())


Unique values in st_churn_label: ['Yes' 'No']
Unique values in st_churn_value: [1 0]


In [18]:
# Rebuild a 0/1 indicator from the text label ("Yes" -> 1, anything else -> 0)
# and measure the share of rows on which it agrees with the numeric indicator.
df_master["churn_from_label"] = df_master["st_churn_label"].eq("Yes").astype(int)
agreement = df_master["churn_from_label"].eq(df_master["st_churn_value"])
match_ratio = agreement.mean()
print(f"Match ratio: {match_ratio:.2%}")

Match ratio: 100.00%


In [19]:
# A 100% match ratio means 'st_churn_label' and 'st_churn_value' carry the
# same information, so only one of them is kept.
# 'churn_from_label' existed only for that comparison — it is dropped too.
# The numeric indicator is kept for modeling under the name 'churn_flag'
# (1 = churned, 0 = not churned).
no_longer_needed = ["st_churn_label", "churn_from_label"]
df_master.drop(columns=no_longer_needed, inplace=True)

target_rename = {"st_churn_value": "churn_flag"}
df_master.rename(columns=target_rename, inplace=True)

### Categorical Columns (26 total)
- demo_gender
    - Male/Female
    - ✅ Categorical
    - 💡 Keep as is for clarity in EDA; encode as (0,1) for modeling.
- demo_under_30, demo_senior_citizen, demo_married, demo_dependents
    - Yes/No flags
    - ✅ Categorical
    - 💡 Keep as is for clarity in EDA and encode as (0,1) for modeling
    - ⚠️ demo_dependents might be redundant with demo_number_of_dependents
    - ⚠️ Age-related flags (demo_under_30, demo_senior_citizen) might be redundant with demo_age
- loc_city
    - ⚠️ High cardinality - group by region or drop.
- loc_zip_code
    - ✅ Categorical
    - ZIP code (identifier)
    - 💡 Possible join with region-level features (already merged zip_population). Not for modeling directly.
- svc_referred_a_friend
    - ✅ Categorical
    - Yes/No
    - 💡 Keep as is for clarity in EDA and encode as (0,1) for modeling
- svc_offer
    - Offer name or plan
    - ✅ Categorical
    - 💡 Use one-hot or frequency encoding
- svc_multiple_lines
    - Yes/No
    - ⚠️ Inspect
    - 💡 Keep as is for clarity in EDA and encode as (0,1) for modeling
- svc_internet_service, svc_internet_type
    - Type of internet connection
    - ✅ Categorical
    - Possibly hierarchical — one-hot encode.
- svc_online_security, svc_online_backup, svc_device_protection_plan, svc_premium_tech_support, svc_streaming_tv, svc_streaming_movies, svc_streaming_music, svc_unlimited_data
    - ✅ Categorical.
    - 💡 Normalize “No internet” vs. “No” for consistency, then encode as binary (1/0) or ordinal.
- svc_contract
    - Contract type (Month-to-month, One year, Two year)
    - ✅ Ordered categorical.
    - 💡 Encode as ordinal (0/1/2).
- svc_paperless_billing
    - ✅ Categorical.
    - 💡 Keep as is for clarity in EDA; encode as (0,1) for modeling.
- svc_payment_method
    - ✅ Categorical.
    - 💡 One-hot encode.
- st_customer_status
    - Active/Churned/Joined
    - ⚠️ Potential leakage — remove for modeling. Keep for churn diagnostics only.
- st_churn_label
    - ✅ Binary target.
    - 💡 Encode as (1 if “Yes”, 0 if “No”).
- st_churn_category, st_churn_reason
    - Reason for churn
    - ⚠️ Leakage — only known post-churn. Drop before modeling.


# TODO: work in progress — stopped here
# Next step: confirm and act on the guidance above for the categorical columns, as was done in the section for numeric variables.

## 8. Missing Values Check

In [20]:
# Count NaNs per column, largest first, keeping only columns that have any.
missing = (
    df_master.isna().sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

if missing.empty:
    print("No missing values found in df_master.")
else:
    print("Columns with missing values:")
    display(missing)

Columns with missing values:


st_churn_reason      5174
st_churn_category    5174
svc_offer            3877
svc_internet_type    1526
dtype: int64

### Handling Missing Values

In [21]:
# Missing churn details (st_churn_reason & st_churn_category) are meaningful:
# they mark customers who didn't churn, so those NaNs are left untouched.

# Snapshot of the NaN counts before filling, for the before/after table.
missing_before = df_master.isna().sum()

# For these two service columns a NaN is replaced by an explicit category.
placeholders = {
    "svc_offer": "No offer",
    "svc_internet_type": "No internet service",
}
for column, label in placeholders.items():
    df_master[column] = df_master[column].fillna(label)

# Before/after table restricted to columns that had or still have NaNs.
missing_after = df_master.isna().sum()
missing_summary = pd.DataFrame({"before": missing_before, "after": missing_after})
missing_summary = missing_summary.query("before > 0 or after > 0")

display(missing_summary)


Unnamed: 0,before,after
svc_offer,3877,0
svc_internet_type,1526,0
st_churn_category,5174,5174
st_churn_reason,5174,5174


## 9. Data Type Check

**Numeric columns — checks**
From the result below, we identified a few minor things to review:
- `loc_zip_code` (int64): ZIP codes should be treated as text to avoid losing leading zeros. ✅ Convert to str before merging or saving.
- `id` (int64): If this came from the population dataset (or another lookup ID), you likely don’t need it anymore once merged. Consider dropping it to avoid confusion.
- `demo_count`, `svc_count`, `st_count`, `loc_count` (int64): Are these “count” columns summary totals (like number of services)? If yes, keep them as numeric. If they’re placeholders added during the merge (like “1” per table), they can be removed.
- `st_cltv`, `st_churn_value`, `st_churn_score` (int64): These are fine as numeric, but check ranges later — sometimes Excel exports these as whole numbers even though they represent dollar values or scores on a 0–1 scale.

In [22]:
# Preview the first ten customers of the cleaned table.
df_master.iloc[:10]

Unnamed: 0_level_0,demo_gender,demo_age,demo_under_30,demo_senior_citizen,demo_married,demo_dependents,demo_number_of_dependents,loc_city,loc_zip_code,loc_latitude,...,svc_total_extra_data_charges,svc_total_long_distance_charges,svc_total_revenue,st_satisfaction_score,st_customer_status,churn_flag,st_cltv,st_churn_category,st_churn_reason,zip_population
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8779-QRDMV,Male,78,No,Yes,No,No,0,Los Angeles,90022,34.02381,...,20,0.0,59.65,3,Churned,1,5433,Competitor,Competitor offered more data,68701
7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,Los Angeles,90063,34.044271,...,0,390.8,1024.1,3,Churned,1,5302,Competitor,Competitor made better offer,55668
1658-BYGOY,Male,71,No,Yes,No,Yes,3,Los Angeles,90065,34.108833,...,0,203.94,1910.88,2,Churned,1,3179,Competitor,Competitor made better offer,47534
4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,Inglewood,90303,33.936291,...,0,494.0,2995.07,2,Churned,1,5337,Dissatisfaction,Limited range of services,27778
4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,Whittier,90602,33.972119,...,0,234.21,3102.36,2,Churned,1,2793,Price,Extra data charges,26265
4412-YLTKF,Female,72,No,Yes,No,Yes,1,Pico Rivera,90660,33.989524,...,10,89.91,2235.41,1,Churned,1,4638,Competitor,Competitor had better devices,63288
0390-DCFDQ,Female,76,No,Yes,Yes,Yes,2,Los Alamitos,90720,33.79499,...,0,15.28,85.73,2,Churned,1,3964,Other,Don't know,21343
3445-HXXGF,Male,66,No,Yes,Yes,No,0,Sierra Madre,91024,34.168686,...,0,0.0,2610.25,1,Churned,1,5444,Dissatisfaction,Service dissatisfaction,10558
2656-FMOKZ,Female,70,No,Yes,No,Yes,2,Pasadena,91106,34.139402,...,0,661.05,1806.75,2,Churned,1,5717,Dissatisfaction,Limited range of services,23742
2070-FNEXE,Female,77,No,Yes,No,Yes,2,Pasadena,91107,34.159007,...,0,188.65,681.2,2,Churned,1,4419,Price,Lack of affordable download/upload speed,32369


## 10. Post-Merge Validation and Save

In [23]:
# Final integrity gate before persisting: the customer key in the index must
# have no repeats and no missing values.
assert df_master.index.is_unique, "customer_id duplicated after merge."
assert df_master.index.notna().all(), "customer_id contains missing values."

# Project helpers from utils_data. Judging from the recorded output,
# quick_overview prints shape, dtypes and a preview, and save_df writes a CSV
# under data/interim/ — confirm against utils_data.
quick_overview(df_master, "Merged Customer Master")
save_df(df_master, "telco_master_clean", folder="interim")

print("✅ Cleaned master dataset saved to data/interim/")



===== Merged Customer Master =====
Shape: 7043 rows × 45 columns

Data types:
demo_gender                               object
demo_age                                   int64
demo_under_30                             object
demo_senior_citizen                       object
demo_married                              object
demo_dependents                           object
demo_number_of_dependents                  int64
loc_city                                  object
loc_zip_code                              object
loc_latitude                             float64
loc_longitude                            float64
svc_referred_a_friend                     object
svc_number_of_referrals                    int64
svc_tenure_in_months                       int64
svc_offer                                 object
svc_phone_service                         object
svc_avg_monthly_long_distance_charges    float64
svc_multiple_lines                        object
svc_internet_service                  

Unnamed: 0_level_0,demo_gender,demo_age,demo_under_30,demo_senior_citizen,demo_married,demo_dependents,demo_number_of_dependents,loc_city,loc_zip_code,loc_latitude,...,svc_total_extra_data_charges,svc_total_long_distance_charges,svc_total_revenue,st_satisfaction_score,st_customer_status,churn_flag,st_cltv,st_churn_category,st_churn_reason,zip_population
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8779-QRDMV,Male,78,No,Yes,No,No,0,Los Angeles,90022,34.02381,...,20,0.0,59.65,3,Churned,1,5433,Competitor,Competitor offered more data,68701
7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,Los Angeles,90063,34.044271,...,0,390.8,1024.1,3,Churned,1,5302,Competitor,Competitor made better offer,55668
1658-BYGOY,Male,71,No,Yes,No,Yes,3,Los Angeles,90065,34.108833,...,0,203.94,1910.88,2,Churned,1,3179,Competitor,Competitor made better offer,47534
4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,Inglewood,90303,33.936291,...,0,494.0,2995.07,2,Churned,1,5337,Dissatisfaction,Limited range of services,27778
4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,Whittier,90602,33.972119,...,0,234.21,3102.36,2,Churned,1,2793,Price,Extra data charges,26265


✅ Guardado: /Users/dianagomes/Desktop/work/s2/EnterpriseDataScienceBootcamp_workgroup/EnterpriseDataScienceBootcamp_workgroup-2/data/interim/telco_master_clean.csv
✅ Cleaned master dataset saved to data/interim/
