In [3]:
import pandas as pd
import numpy as np

agg = pd.read_parquet(
    "../data/analytics/hmda/agg_gender_income_state_purpose.parquet"
)

agg.head()


Unnamed: 0,state_code,income_bucket,loan_purpose,applicant_gender,applications,approvals,approval_rate,median_loan_amount
0,CT,Middle,1,Male,11808,8579,0.726541,275000.0
1,TX,Low,32,Female,11683,4847,0.414876,125000.0
2,WA,Middle,1,Female,16383,11975,0.730941,365000.0
3,WA,Middle,1,Male,25964,18719,0.72096,395000.0
4,WA,Low,32,Male,3794,1475,0.388772,205000.0


In [4]:
pivot = (
    agg
    .pivot_table(
        index=["state_code", "income_bucket", "loan_purpose"],
        columns="applicant_gender",
        values=["applications", "approvals", "approval_rate"]
    )
    .reset_index()
)

pivot.columns = ["_".join(col).strip("_") for col in pivot.columns]
pivot.head()


Unnamed: 0,state_code,income_bucket,loan_purpose,applications_Female,applications_Male,approval_rate_Female,approval_rate_Male,approvals_Female,approvals_Male
0,AK,High,1,926.0,1950.0,0.74946,0.74359,694.0,1450.0
1,AK,High,2,130.0,223.0,0.523077,0.547085,68.0,122.0
2,AK,High,31,56.0,123.0,0.517857,0.609756,29.0,75.0
3,AK,High,32,113.0,349.0,0.460177,0.515759,52.0,180.0
4,AK,High,4,127.0,308.0,0.456693,0.448052,58.0,138.0


In [5]:
pivot["approval_gap_male_minus_female"] = (
    pivot["approval_rate_Male"] - pivot["approval_rate_Female"]
)

pivot[[
    "state_code",
    "income_bucket",
    "loan_purpose",
    "approval_gap_male_minus_female"
]].head()


Unnamed: 0,state_code,income_bucket,loan_purpose,approval_gap_male_minus_female
0,AK,High,1,-0.00587
1,AK,High,2,0.024008
2,AK,High,31,0.091899
3,AK,High,32,0.055582
4,AK,High,4,-0.008641


In [6]:
pivot["disparate_impact"] = (
    pivot["approval_rate_Female"] / pivot["approval_rate_Male"]
)

pivot[[
    "state_code",
    "income_bucket",
    "loan_purpose",
    "disparate_impact"
]].head()


Unnamed: 0,state_code,income_bucket,loan_purpose,disparate_impact
0,AK,High,1,1.007895
1,AK,High,2,0.956116
2,AK,High,31,0.849286
3,AK,High,32,0.892232
4,AK,High,4,1.019286


In [7]:
reliable = pivot[
    (pivot["applications_Male"] >= 50) &
    (pivot["applications_Female"] >= 50)
].copy()

reliable.shape


(772, 11)

In [8]:
reliable.sort_values(
    by="approval_gap_male_minus_female",
    ascending=False
).head(10)[[
    "state_code",
    "income_bucket",
    "loan_purpose",
    "approval_gap_male_minus_female",
    "disparate_impact",
    "applications_Female",
    "applications_Male"
]]


Unnamed: 0,state_code,income_bucket,loan_purpose,approval_gap_male_minus_female,disparate_impact,applications_Female,applications_Male
785,WY,High,31,0.175159,0.740566,62.0,157.0
386,MS,High,2,0.162718,0.748444,252.0,824.0
389,MS,High,4,0.131444,0.797046,401.0,1385.0
237,IN,High,31,0.114355,0.848119,700.0,2477.0
779,WV,Middle,2,0.111744,0.803117,498.0,1110.0
432,ND,High,31,0.110252,0.865968,73.0,248.0
401,MT,High,2,0.103884,0.855443,244.0,558.0
285,LA,Low,1,0.09687,0.801248,16926.0,15072.0
282,LA,High,31,0.095346,0.87427,454.0,1978.0
431,ND,High,2,0.094872,0.886154,130.0,360.0


In [11]:
from scipy.stats import chi2_contingency


In [9]:
test_row = reliable.iloc[0]
test_row[[
    "state_code", "income_bucket", "loan_purpose",
    "applications_Female", "approvals_Female",
    "applications_Male", "approvals_Male"
]]


state_code                 AK
income_bucket            High
loan_purpose                1
applications_Female     926.0
approvals_Female        694.0
applications_Male      1950.0
approvals_Male         1450.0
Name: 0, dtype: object

In [10]:
table = [
    [
        int(test_row["approvals_Female"]),
        int(test_row["applications_Female"] - test_row["approvals_Female"])
    ],
    [
        int(test_row["approvals_Male"]),
        int(test_row["applications_Male"] - test_row["approvals_Male"])
    ]
]

table


[[694, 232], [1450, 500]]

In [11]:
from scipy.stats import chi2_contingency

chi2, p_value, dof, expected = chi2_contingency(table)

chi2, p_value, dof


(np.float64(0.08518987803405201), np.float64(0.77038368877083), 1)

In [12]:
reliable.sort_values(
    by="approval_gap_male_minus_female",
    ascending=False
).head(1)


Unnamed: 0,state_code,income_bucket,loan_purpose,applications_Female,applications_Male,approval_rate_Female,approval_rate_Male,approvals_Female,approvals_Male,approval_gap_male_minus_female,disparate_impact
785,WY,High,31,62.0,157.0,0.5,0.675159,31.0,106.0,0.175159,0.740566


In [13]:
table2 = [
    [
        int(31),                 # Female approved
        int(62 - 31)             # Female not approved
    ],
    [
        int(106),                # Male approved
        int(157 - 106)           # Male not approved
    ]
]

table2


[[31, 31], [106, 51]]

In [14]:
from scipy.stats import chi2_contingency

chi2_2, p_value_2, dof_2, expected_2 = chi2_contingency(table2)

chi2_2, p_value_2, dof_2


(np.float64(5.098142500336253), np.float64(0.023951479113269374), 1)

In [15]:
def chi_square_pvalue(row):
    table = [
        [
            row["approvals_Female"],
            row["applications_Female"] - row["approvals_Female"]
        ],
        [
            row["approvals_Male"],
            row["applications_Male"] - row["approvals_Male"]
        ]
    ]
    _, p_value, _, _ = chi2_contingency(table)
    return p_value


In [16]:
reliable["p_value"] = reliable.apply(chi_square_pvalue, axis=1)
reliable.head()


Unnamed: 0,state_code,income_bucket,loan_purpose,applications_Female,applications_Male,approval_rate_Female,approval_rate_Male,approvals_Female,approvals_Male,approval_gap_male_minus_female,disparate_impact,p_value
0,AK,High,1,926.0,1950.0,0.74946,0.74359,694.0,1450.0,-0.00587,1.007895,0.770384
1,AK,High,2,130.0,223.0,0.523077,0.547085,68.0,122.0,0.024008,0.956116,0.744617
2,AK,High,31,56.0,123.0,0.517857,0.609756,29.0,75.0,0.091899,0.849286,0.321175
3,AK,High,32,113.0,349.0,0.460177,0.515759,52.0,180.0,0.055582,0.892232,0.358181
4,AK,High,4,127.0,308.0,0.456693,0.448052,58.0,138.0,-0.008641,1.019286,0.953181


In [17]:
reliable["statistically_significant"] = reliable["p_value"] < 0.05

reliable.head()


Unnamed: 0,state_code,income_bucket,loan_purpose,applications_Female,applications_Male,approval_rate_Female,approval_rate_Male,approvals_Female,approvals_Male,approval_gap_male_minus_female,disparate_impact,p_value,statistically_significant
0,AK,High,1,926.0,1950.0,0.74946,0.74359,694.0,1450.0,-0.00587,1.007895,0.770384,False
1,AK,High,2,130.0,223.0,0.523077,0.547085,68.0,122.0,0.024008,0.956116,0.744617,False
2,AK,High,31,56.0,123.0,0.517857,0.609756,29.0,75.0,0.091899,0.849286,0.321175,False
3,AK,High,32,113.0,349.0,0.460177,0.515759,52.0,180.0,0.055582,0.892232,0.358181,False
4,AK,High,4,127.0,308.0,0.456693,0.448052,58.0,138.0,-0.008641,1.019286,0.953181,False


In [18]:
reliable["statistically_significant"].value_counts()


statistically_significant
False    461
True     311
Name: count, dtype: int64

In [19]:
significant = reliable[reliable["statistically_significant"]]

significant.shape


(311, 13)

In [20]:
significant["loan_purpose"].value_counts().sort_index()


loan_purpose
1     76
2     57
31    50
32    86
4     42
Name: count, dtype: int64

In [21]:
significant["gap_direction"] = np.where(
    significant["approval_gap_male_minus_female"] > 0,
    "Male favored",
    "Female favored"
)

significant["gap_direction"].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  significant["gap_direction"] = np.where(


gap_direction
Male favored      182
Female favored    129
Name: count, dtype: int64

: 