In [1]:
import glob
import pandas as pd

# All 2023 HMDA part files, in sorted (lexicographic) path order
files = glob.glob("../data/clean/hmda/hmda_2023_parts/part_*.parquet")
files.sort()

# Work on a sample for now: only the first ten part files are read
df = pd.concat(map(pd.read_parquet, files[:10]), ignore_index=True)

# Preview the columns this analysis focuses on
df.head()[["income", "applicant_gender", "loan_amount", "approved"]]


Unnamed: 0,income,applicant_gender,loan_amount,approved
0,133.0,,665000,0
1,78.0,Female,215000,0
2,171.0,Male,255000,0
3,122.0,Female,365000,0
4,131.0,Male,185000,0


In [2]:
# Summary statistics for the raw income column (before any filtering)
df.loc[:, "income"].describe()


count    1.616452e+06
mean     1.517786e+02
std      5.329512e+03
min     -1.280000e+05
25%      6.500000e+01
50%      9.900000e+01
75%      1.550000e+02
max      5.004153e+06
Name: income, dtype: float64

In [3]:
# Keep only rows with a strictly positive income (missing incomes are dropped too,
# because the comparison is False for NaN)
df_valid = df.loc[df["income"].gt(0)]

# Percentiles of the remaining incomes: 1st, 30th, 70th and 99th
df_valid.loc[:, "income"].quantile(q=[0.01, 0.3, 0.7, 0.99])


0.01     19.0
0.30     72.0
0.70    141.0
0.99    765.0
Name: income, dtype: float64

In [4]:
# Percentile cut points that the income buckets below are based on
df_valid.loc[:, "income"].quantile(q=[0.01, 0.3, 0.7, 0.99])

0.01     19.0
0.30     72.0
0.70    141.0
0.99    765.0
Name: income, dtype: float64

In [5]:
# Trim extreme incomes: keep rows with 19 <= income <= 765 (both ends included),
# i.e. the 1st and 99th percentile values computed on the positive-income sample.
# .copy() gives an independent frame so columns can be added without warnings.
df_clean = df.loc[df["income"].between(19, 765)].copy()

# Create income buckets
def income_bucket(x, low_cutoff=72, high_cutoff=141):
    """Label a single income value as "Low", "Middle" or "High".

    Parameters
    ----------
    x : scalar
        Income value to classify (same units as the ``income`` column).
    low_cutoff : number, default 72
        Values strictly below this are labelled "Low".
    high_cutoff : number, default 141
        Values at or above ``low_cutoff`` and strictly below this are
        labelled "Middle"; values at or above it are labelled "High".

    Returns
    -------
    str or None
        The bucket label, or ``None`` when ``x`` is missing
        (``None``, ``NaN`` or ``pd.NA``).
    """
    # Every comparison with NaN is False, so without this guard a missing
    # income would fall through both tests and be labelled "High".
    if pd.isna(x):
        return None
    if x < low_cutoff:
        return "Low"
    if x < high_cutoff:
        return "Middle"
    return "High"

# Label every row with its income bucket (element-wise call of income_bucket)
df_clean["income_bucket"] = df_clean["income"].map(income_bucket)

# Spot-check the new column next to the income it was derived from
df_clean.head()[["income", "income_bucket"]]


Unnamed: 0,income,income_bucket
0,133.0,Middle
1,78.0,Middle
2,171.0,High
3,122.0,Middle
4,131.0,Middle


In [6]:
# Number of rows per income bucket, most frequent first
df_clean.loc[:, "income_bucket"].value_counts()


income_bucket
Middle    644332
High      465769
Low       465208
Name: count, dtype: int64

In [7]:
# Loan-amount summary for every (income bucket, applicant gender) pair.
# Rows with a missing gender are excluded by groupby's default dropna behaviour.
loan_by_gender_income = (
    df_clean
    .groupby(["income_bucket", "applicant_gender"])["loan_amount"]
    .agg(
        applications="count",
        median_loan_amount="median",
        mean_loan_amount="mean",
    )
    .reset_index()
)

loan_by_gender_income


Unnamed: 0,income_bucket,applicant_gender,applications,median_loan_amount,mean_loan_amount
0,High,Female,129993,365000.0,398817.82096
1,High,Male,295635,385000.0,429214.318332
2,Low,Female,201978,125000.0,134847.904227
3,Low,Male,228998,135000.0,147230.019476
4,Middle,Female,225301,235000.0,236153.567894
5,Middle,Male,369019,245000.0,243457.396503


In [8]:
# Same loan-amount breakdown as above, additionally split by state
state_bucket_gender_keys = ["state_code", "income_bucket", "applicant_gender"]

loan_by_gender_income_state = (
    df_clean
    .groupby(state_bucket_gender_keys)
    .agg(
        applications=pd.NamedAgg(column="loan_amount", aggfunc="count"),
        median_loan_amount=pd.NamedAgg(column="loan_amount", aggfunc="median"),
    )
    .reset_index()
)

# Show only the first ten groups
loan_by_gender_income_state.head(10)


Unnamed: 0,state_code,income_bucket,applicant_gender,applications,median_loan_amount
0,AK,High,Female,352,415000.0
1,AK,High,Male,717,445000.0
2,AK,Low,Female,233,205000.0
3,AK,Low,Male,307,215000.0
4,AK,Middle,Female,528,310000.0
5,AK,Middle,Male,962,325000.0
6,AL,High,Female,1146,305000.0
7,AL,High,Male,3364,335000.0
8,AL,Low,Female,4206,125000.0
9,AL,Low,Male,4974,135000.0


In [9]:
# Approval rate per (state, income bucket, applicant gender).
# The rate is the mean of the `approved` column within each group.
approval_by_gender_income_state = (
    df_clean
    .groupby(["state_code", "income_bucket", "applicant_gender"])["approved"]
    .agg(applications="count", approval_rate="mean")
    .reset_index()
)

# Show only the first ten groups
approval_by_gender_income_state.head(10)


Unnamed: 0,state_code,income_bucket,applicant_gender,applications,approval_rate
0,AK,High,Female,352,0.704545
1,AK,High,Male,717,0.705718
2,AK,Low,Female,233,0.635193
3,AK,Low,Male,307,0.543974
4,AK,Middle,Female,528,0.725379
5,AK,Middle,Male,962,0.70894
6,AL,High,Female,1146,0.691099
7,AL,High,Male,3364,0.674197
8,AL,Low,Female,4206,0.511175
9,AL,Low,Male,4974,0.546844
