In [1]:
import sys
print(sys.executable)
print(sys.version)


/Users/richardanderson/Desktop/Credit-Risk-Default Model/Notebooks/.venv/bin/python
3.9.6 (default, Mar 12 2025, 20:22:46) 
[Clang 17.0.0 (clang-1700.0.13.3)]


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.__version__


'2.3.3'

In [2]:
PROJECT_ROOT = Path("..")
FILE_2016Q1 = PROJECT_ROOT / "2016Q1.csv"

assert FILE_2016Q1.exists(), f"Missing file: {FILE_2016Q1}"

# 0-based indices we identified from your file inspection
LOAN_ID_IDX = 1
REPORT_PERIOD_IDX = 2
DELINQ_IDX = 39

HORIZON_MONTHS = 24
CHUNK_SIZE = 200_000

FILE_2016Q1


PosixPath('../2016Q1.csv')

In [3]:
peek = pd.read_csv(
    FILE_2016Q1,
    sep="|",
    header=None,
    usecols=[LOAN_ID_IDX, REPORT_PERIOD_IDX, DELINQ_IDX],
    dtype=str,
    nrows=5,
    engine="c"
)
peek.columns = ["loan_id", "report_period", "delinq"]
peek


Unnamed: 0,loan_id,report_period,delinq
0,100000512540,22016,0
1,100000512540,32016,0
2,100000512540,42016,0
3,100000512540,52016,0
4,100000512540,62016,0


In [4]:
loan_months = {}   # loan_id -> months observed (capped at 24)
loan_default = {}  # loan_id -> 0/1

reader = pd.read_csv(
    FILE_2016Q1,
    sep="|",
    header=None,
    usecols=[LOAN_ID_IDX, REPORT_PERIOD_IDX, DELINQ_IDX],
    dtype=str,
    chunksize=CHUNK_SIZE,
    engine="c"
)

for chunk in reader:
    chunk.columns = ["loan_id", "report_period", "delinq"]

    # parse month and delinquency
    chunk["report_period"] = pd.to_datetime(chunk["report_period"], format="%m%Y", errors="coerce")
    chunk["delinq_num"] = pd.to_numeric(chunk["delinq"], errors="coerce")

    # sort so we process months in order per loan
    chunk = chunk.sort_values(["loan_id", "report_period"])

    # group and update state per loan
    for loan_id, g in chunk.groupby("loan_id", sort=False):
        m = loan_months.get(loan_id, 0)
        d = loan_default.get(loan_id, 0)

        if m >= HORIZON_MONTHS:
            continue

        # Only consider remaining months up to horizon
        remaining = HORIZON_MONTHS - m
        g = g.head(remaining)

        # count months observed in this chunk portion
        m += len(g)

        # default event: any delinquency >= 3 within horizon
        if d == 0:
            if (g["delinq_num"] >= 3).any():
                d = 1

        loan_months[loan_id] = m
        loan_default[loan_id] = d


In [5]:
labels_df = pd.DataFrame({
    "loan_id": list(loan_months.keys()),
    "months_observed": [loan_months[k] for k in loan_months.keys()],
    "default_24m": [loan_default.get(k, 0) for k in loan_months.keys()],
})

# censoring: require full 24 months observed
labels_df = labels_df[labels_df["months_observed"] >= HORIZON_MONTHS].copy()

labels_df["default_24m"].value_counts(dropna=False), labels_df.shape


(default_24m
 0    335824
 1      2649
 Name: count, dtype: int64,
 (338473, 3))

In [6]:
OUT_DIR = PROJECT_ROOT / "data" / "processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUT_DIR / "labels_2016Q1.parquet"
labels_df.to_parquet(out_path, index=False)

out_path


PosixPath('../data/processed/labels_2016Q1.parquet')