In [None]:
import pandas as pd

# Location of the pipe-delimited 2023 HMDA extract, relative to this notebook.
path = "../data/raw/hmda/hmda_2023.txt"

# Peek at the first five records only, so the file's layout can be inspected
# without pulling the whole extract into memory.
preview_options = dict(sep="|", nrows=5, low_memory=False)
df_sample = pd.read_csv(path, **preview_options)

# Bare last expression -> rich DataFrame display in the notebook.
df_sample


Unnamed: 0,activity_year,lei,loan_type,loan_purpose,preapproval,construction_method,occupancy_type,loan_amount,action_taken,state_code,...,submission_of_application,initially_payable_to_institution,aus_1,aus_2,aus_3,aus_4,aus_5,reverse_mortgage,open_end_line_of_credit,business_or_commercial_purpose
0,2023,549300HW662MN1WU8550,2,32,2,1,1,665000,3,CA,...,2,1,1,2.0,,,,2,2,2
1,2023,549300HW662MN1WU8550,1,1,2,1,1,215000,3,TX,...,2,1,2,,,,,2,2,2
2,2023,549300HW662MN1WU8550,1,1,2,1,1,255000,3,TX,...,2,1,2,1.0,,,,2,2,2
3,2023,549300HW662MN1WU8550,2,32,2,1,1,365000,3,FL,...,2,1,1,,,,,2,2,2
4,2023,549300HW662MN1WU8550,1,1,2,1,1,185000,3,IL,...,2,1,1,2.0,,,,2,2,2


In [5]:
# Header names of the sample frame as a plain Python list.
columns = list(df_sample.columns)

# Display the number of columns next to the names themselves.
(len(columns), columns)


(85,
 ['activity_year',
  'lei',
  'loan_type',
  'loan_purpose',
  'preapproval',
  'construction_method',
  'occupancy_type',
  'loan_amount',
  'action_taken',
  'state_code',
  'county_code',
  'census_tract',
  'applicant_ethnicity_1',
  'applicant_ethnicity_2',
  'applicant_ethnicity_3',
  'applicant_ethnicity_4',
  'applicant_ethnicity_5',
  'co_applicant_ethnicity_1',
  'co_applicant_ethnicity_2',
  'co_applicant_ethnicity_3',
  'co_applicant_ethnicity_4',
  'co_applicant_ethnicity_5',
  'applicant_ethnicity_observed',
  'co_applicant_ethnicity_observed',
  'applicant_race_1',
  'applicant_race_2',
  'applicant_race_3',
  'applicant_race_4',
  'applicant_race_5',
  'co_applicant_race_1',
  'co_applicant_race_2',
  'co_applicant_race_3',
  'co_applicant_race_4',
  'co_applicant_race_5',
  'applicant_race_observed',
  'co_applicant_race_observed',
  'applicant_sex',
  'co_applicant_sex',
  'applicant_sex_observed',
  'co_applicant_sex_observed',
  'applicant_age',
  'applicant_ag

In [6]:
# Subset of the raw columns to load; order is the order listed here.
USE_COLS = [
    # record identity
    "activity_year", "lei",
    # loan economics
    "loan_amount", "interest_rate", "income",
    # outcome and loan characteristics
    "action_taken", "loan_purpose", "loan_type", "occupancy_type",
    # applicant
    "applicant_sex",
    # geography
    "state_code", "county_code", "census_tract",
]


In [9]:
import pandas as pd
import numpy as np

# Trial run of the chunked cleaning pipeline: read the raw pipe-delimited file
# in chunks, clean each chunk, and keep the first few cleaned chunks in memory
# (clean_chunks) together with a per-chunk data-quality record (dq_log).
path = "../data/raw/hmda/hmda_2023.txt"

USE_COLS = [
    "activity_year","lei","loan_type","loan_purpose","occupancy_type",
    "loan_amount","interest_rate","action_taken",
    "applicant_sex","co_applicant_sex","income",
    "state_code","county_code","census_tract",
    "denial_reason_1","denial_reason_2","denial_reason_3","denial_reason_4"
]

# Every column is read as text (dtype=str); these are converted to numbers afterwards.
NUMERIC_COLS = [
    "loan_amount", "interest_rate", "income",
    "action_taken", "applicant_sex", "co_applicant_sex",
]

# A row is kept only when all of these are non-missing after conversion.
REQUIRED_COLS = ["loan_amount", "income", "state_code"]

# First run: process only this many chunks to confirm everything works.
MAX_CHUNKS = 6

# Codes other than 1/2 map to NaN in the derived gender columns.
sex_map = {1: "Male", 2: "Female"}

clean_chunks = []
dq_log = []

# The reader is used as a context manager so the underlying file is closed
# even though the loop stops before the file is exhausted.
with pd.read_csv(
    path,
    sep="|",
    usecols=USE_COLS,
    chunksize=200_000,          # smaller chunks = safer
    engine="python",            # kept from the original run, where it was noted as the key fix
    dtype=str,                  # prevents dtype inference memory spikes
    on_bad_lines="skip"         # malformed lines are dropped silently and never reach rows_before
) as chunks:
    for i, chunk in enumerate(chunks):
        initial_rows = len(chunk)

        # Convert needed numeric columns safely: non-numeric text becomes NaN.
        for col in NUMERIC_COLS:
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        # Basic validity filters. The explicit copy makes the result an
        # independent frame, so the column assignments below do not raise
        # SettingWithCopyWarning.
        chunk = chunk.dropna(subset=REQUIRED_COLS).copy()

        # Gender mapping
        chunk["applicant_gender"] = chunk["applicant_sex"].map(sex_map)
        chunk["co_applicant_gender"] = chunk["co_applicant_sex"].map(sex_map)

        # Approval flag (1=Originated, 2=Approved not accepted).
        # Any other or missing action_taken yields 0.
        chunk["approved"] = chunk["action_taken"].isin([1, 2]).astype(int)

        # Shares are over all kept rows, including rows whose gender is unmapped;
        # they are NaN when the cleaned chunk is empty.
        dq_log.append({
            "chunk": i,
            "rows_before": initial_rows,
            "rows_after": len(chunk),
            "male_pct": float((chunk["applicant_gender"] == "Male").mean()),
            "female_pct": float((chunk["applicant_gender"] == "Female").mean()),
        })

        clean_chunks.append(chunk)

        # Stop once MAX_CHUNKS chunks have been processed, without parsing another one.
        if i + 1 >= MAX_CHUNKS:
            break


In [11]:
import os
import pandas as pd
import numpy as np

# Full run of the chunked cleaning pipeline: read the raw pipe-delimited file
# in chunks, clean each chunk, write it to its own parquet part file, and
# collect a per-chunk data-quality record in dq_log.
path = "../data/raw/hmda/hmda_2023.txt"

USE_COLS = [
    "activity_year","lei","loan_type","loan_purpose","occupancy_type",
    "loan_amount","interest_rate","action_taken",
    "applicant_sex","co_applicant_sex","income",
    "state_code","county_code","census_tract",
    "denial_reason_1","denial_reason_2","denial_reason_3","denial_reason_4"
]

# Every column is read as text (dtype=str); these are converted to numbers afterwards.
NUMERIC_COLS = [
    "loan_amount", "interest_rate", "income",
    "action_taken", "applicant_sex", "co_applicant_sex",
]

# A row is kept only when all of these are non-missing after conversion.
REQUIRED_COLS = ["loan_amount", "income", "state_code"]

out_dir = "../data/clean/hmda/hmda_2023_parts"
os.makedirs(out_dir, exist_ok=True)

dq_log = []

# Codes other than 1/2 map to NaN in the derived gender columns.
sex_map = {1: "Male", 2: "Female"}

# The reader is used as a context manager so the underlying file is closed
# even if an exception interrupts the loop part-way through.
with pd.read_csv(
    path,
    sep="|",
    usecols=USE_COLS,
    chunksize=200_000,
    engine="python",
    dtype=str,
    on_bad_lines="skip"         # malformed lines are dropped silently and never reach rows_before
) as reader:
    for i, chunk in enumerate(reader):
        initial_rows = len(chunk)

        # numeric conversions: non-numeric text becomes NaN
        for col in NUMERIC_COLS:
            chunk[col] = pd.to_numeric(chunk[col], errors="coerce")

        # filters. The explicit copy makes the result an independent frame, so
        # the column assignments below do not raise SettingWithCopyWarning.
        chunk = chunk.dropna(subset=REQUIRED_COLS).copy()

        # gender mapping
        chunk["applicant_gender"] = chunk["applicant_sex"].map(sex_map)
        chunk["co_applicant_gender"] = chunk["co_applicant_sex"].map(sex_map)

        # approval flag: 1 when action_taken is 1 or 2, otherwise 0 (including missing)
        chunk["approved"] = chunk["action_taken"].isin([1, 2]).astype(int)

        # log: shares are over all kept rows, including rows whose gender is
        # unmapped; they are NaN when the cleaned chunk is empty.
        dq_log.append({
            "chunk": i,
            "rows_before": initial_rows,
            "rows_after": len(chunk),
            "male_pct": float((chunk["applicant_gender"] == "Male").mean()),
            "female_pct": float((chunk["applicant_gender"] == "Female").mean()),
        })

        # write this chunk as its own part file, numbered by chunk index
        # NOTE(review): part files left over from an earlier run with more
        # chunks are not removed here; confirm out_dir is clean before re-running.
        chunk.to_parquet(f"{out_dir}/part_{i:05d}.parquet", index=False)

        # progress message every 10 chunks
        if (i + 1) % 10 == 0:
            print(f"✅ wrote {i+1} chunks")


✅ wrote 10 chunks
✅ wrote 20 chunks
✅ wrote 20 chunks
✅ wrote 30 chunks
✅ wrote 30 chunks
✅ wrote 40 chunks
✅ wrote 40 chunks
✅ wrote 50 chunks
✅ wrote 50 chunks


In [12]:
import pandas as pd

from pathlib import Path

# Persist the per-chunk data-quality records collected in dq_log as a CSV.
dq_log_path = Path("../outputs/dq_logs/dq_log_2023.csv")

# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
dq_log_path.parent.mkdir(parents=True, exist_ok=True)

pd.DataFrame(dq_log).to_csv(dq_log_path, index=False)
print("Saved DQ log rows:", len(dq_log))


Saved DQ log rows: 58
