In [1]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load every application record into memory as a list of dicts keyed by column name.
# - encoding='utf-8-sig' strips a leading byte-order mark, if present, so the first
#   column name is not polluted by it.
# - newline='' is what the csv module asks for on files it reads, so that newlines
#   embedded inside quoted fields are parsed correctly.
with open('ac_hmda_2018-2021.csv', 'r', encoding='utf-8-sig', newline='') as f:
    rows = list(csv.DictReader(f))

# Distinct census-tract identifiers present in the data (kept as the raw CSV strings).
census_tracts = {row['census_tract'] for row in rows}

In [4]:
# Build, for every census tract, a feature matrix (xs) and a 0/1 outcome vector (ys),
# then keep only the tracts that can actually be fitted.
# no debt_to_income_ratio included in this grouping
fields_to_use = ['derived_race', 'income', 'loan_amount', 'derived_sex']  # not used: 'derived_loan_product_type', 'derived_dwelling_category'

# Fields parsed as integers; a value of "NA" causes the row to be dropped.
int_fields = {'income', 'loan_amount'}
# Categorical fields and their numeric codes; a value that is not listed causes the row to be dropped.
category_codes = {
    'derived_race': {'White': 0, 'Black or African American': 1},
    'derived_sex': {'Female': 0, 'Male': 1},
}
# Outcome labels by action_taken code ('1' -> 1, '3' -> 0); rows with any other code are dropped.
# NOTE(review): per the CFPB HMDA data dictionary, 1 = loan originated and 3 = application denied.
action_labels = {'3': 0, '1': 1}
# A tract needs at least this many usable rows to be modelled (the original rule was "skip if <= 5").
min_rows_per_tract = 6

# Group the rows by tract in a single pass instead of rescanning every row for every tract.
rows_by_tract = {}
for row in rows:
    rows_by_tract.setdefault(row['census_tract'], []).append(row)

census_tracts_list = sorted(census_tracts)

xs_dict_all_years = {}
ys_dict_all_years = {}
for tract in census_tracts_list:
    us = []
    vs = []
    for row in rows_by_tract.get(tract, []):
        # Resolve the label first: a feature row is only kept together with its label,
        # so xs and ys always have the same length and stay aligned row-for-row.
        label = action_labels.get(row['action_taken'])
        if label is None:
            continue
        values = []
        for field in fields_to_use:
            raw = row[field]
            if field in int_fields:
                if raw == "NA":
                    break
                values.append(int(raw))
            elif field in category_codes:
                code = category_codes[field].get(raw)
                if code is None:
                    break
                values.append(code)
        # Only complete rows (one value per requested field) are usable.
        if len(values) == len(fields_to_use):
            us.append(values)
            vs.append(label)
    xs_dict_all_years[tract] = np.array(us)
    ys_dict_all_years[tract] = np.array(vs)

reduced_xs = {}
reduced_ys = {}

for tract in xs_dict_all_years:
    ys = ys_dict_all_years[tract]
    if len(ys) < min_rows_per_tract:
        continue
    # A classifier needs both outcomes present: skip tracts that are all 1s *or* all 0s
    # (the latter would otherwise make LogisticRegression.fit raise).
    positives = int(np.sum(ys))
    if positives == 0 or positives == len(ys):
        continue
    reduced_xs[tract] = xs_dict_all_years[tract]
    reduced_ys[tract] = ys

In [20]:
# Fit an independent logistic regression for each retained tract and collect one flat
# record per tract: [tract, one coefficient per feature column..., score on the fitted data].
all_results = []
for tract, tract_xs in reduced_xs.items():
    tract_ys = reduced_ys[tract]
    model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
    model.fit(tract_xs, tract_ys)
    record = [tract]
    # coef_ is 2-D; flatten it row by row so each coefficient becomes its own entry.
    record.extend(model.coef_.ravel())
    record.append(model.score(tract_xs, tract_ys))
    all_results.append(record)

In [21]:
# Inspect the collected records; each entry is [tract, one coefficient per feature, score].
all_results

[['42003020100',
  -2.8706921333486574e-05,
  0.01213020851382483,
  -2.0272105552597097e-06,
  6.32206540458268e-05,
  0.8414634146341463],
 ['42003020300',
  2.0916626022073536e-05,
  0.004221036527616015,
  6.3089837439713165e-06,
  -6.274011251811247e-06,
  0.9571428571428572],
 ['42003030500',
  -0.00025732784371669694,
  0.012507228377106314,
  2.0929727521422155e-06,
  -0.00019388400251349353,
  0.7894736842105263],
 ['42003040400',
  0.0003384580469726843,
  0.037545705604049316,
  -1.2340350049188009e-05,
  -0.004417889279623143,
  0.8235294117647058],
 ['42003040900',
  3.806566148102134e-05,
  0.0032453406433934524,
  5.9400066779207115e-06,
  0.00011928479469559245,
  0.8571428571428571],
 ['42003050100',
  -8.947699472922713e-07,
  -1.3931560379026628e-05,
  2.742307700168444e-05,
  5.74064483992617e-07,
  0.9333333333333333],
 ['42003050600',
  -9.481157781721874e-05,
  0.008395838027287337,
  5.132353053771015e-06,
  4.5406460277842836e-05,
  0.7941176470588235],
 ['4200

In [15]:
# Spot-check one tract's feature matrix; each row holds the encoded values for fields_to_use, in that order.
reduced_xs['42003020100']

array([[      0,     107,  265000,       1],
       [      0,      84,  145000,       1],
       [      0,     120,  255000,       1],
       [      0,      80,  125000,       1],
       [      0,     294,  415000,       0],
       [      0,     140,  255000,       0],
       [      0,     114,  275000,       1],
       [      0,      36,  105000,       1],
       [      0,     119,  275000,       1],
       [      0,      65,  235000,       1],
       [      0,     235,  145000,       1],
       [      0,      86,  105000,       0],
       [      1,     122,  405000,       1],
       [      0,      54,  135000,       0],
       [      0,     389,  225000,       0],
       [      0,      39,  115000,       0],
       [      0,     124,  235000,       1],
       [      0,      81,  305000,       1],
       [      0,     670, 1305000,       1],
       [      0,     601,  355000,       1],
       [      1,     299,  375000,       0],
       [      1,     299,  395000,       0],
       [  

In [22]:
# Column labels for the results table: the tract id, one "<field>_coef" per feature, then the score.
headers = ["tract", *(f'{item}_coef' for item in fields_to_use), "score"]
print(headers)

['tract', 'derived_race_coef', 'income_coef', 'loan_amount_coef', 'derived_sex_coef', 'score']


In [25]:
# Tabulate the per-tract records and save them to CSV.
# The column names are attached to the DataFrame itself (not only to the CSV header),
# so the in-memory frame is labelled the same way as the exported file.
all_results_df = pd.DataFrame(all_results, columns=headers)
all_results_df.to_csv("HMDA_results.csv", index=False)

In [None]:
# Build, for every census tract, a feature matrix (xs) and a 0/1 outcome vector (ys),
# then keep only the tracts that can actually be fitted.
# debt_to_income_ratio IS included in this grouping
fields_to_use = ['derived_race', 'income', 'loan_amount', 'derived_sex', 'debt_to_income_ratio']  # not used: 'derived_loan_product_type', 'derived_dwelling_category'

# Fields parsed as integers; a value of "NA" causes the row to be dropped.
int_fields = {'income', 'loan_amount'}
# Categorical fields and their numeric codes; a value that is not listed causes the row to be dropped.
category_codes = {
    'derived_race': {'White': 0, 'Black or African American': 1},
    'derived_sex': {'Female': 0, 'Male': 1},
    # NOTE(review): the original cell stopped mid-statement at the "36"/"37" branch
    # (`== "36" or "37"` was also always true). Code 4 simply continues the 1-2-3
    # ordinal pattern and is an assumption -- confirm the intended binning.
    # TODO: every other debt_to_income_ratio category is still unmapped, so rows
    # carrying those values are dropped; extend this table before relying on results.
    'debt_to_income_ratio': {
        "<20%": 1,
        "20%-<30%": 2,
        "30%-<36%": 3,
        "36": 4,
        "37": 4,
    },
}
# Outcome labels by action_taken code ('1' -> 1, '3' -> 0); rows with any other code are dropped.
# NOTE(review): per the CFPB HMDA data dictionary, 1 = loan originated and 3 = application denied.
action_labels = {'3': 0, '1': 1}
# A tract needs at least this many usable rows to be modelled (the original rule was "skip if <= 5").
min_rows_per_tract = 6

# Group the rows by tract in a single pass instead of rescanning every row for every tract.
rows_by_tract = {}
for row in rows:
    rows_by_tract.setdefault(row['census_tract'], []).append(row)

census_tracts_list = sorted(census_tracts)

xs_dict_all_years = {}
ys_dict_all_years = {}
for tract in census_tracts_list:
    us = []
    vs = []
    for row in rows_by_tract.get(tract, []):
        # Resolve the label first: a feature row is only kept together with its label,
        # so xs and ys always have the same length and stay aligned row-for-row.
        label = action_labels.get(row['action_taken'])
        if label is None:
            continue
        values = []
        for field in fields_to_use:
            raw = row[field]
            if field in int_fields:
                if raw == "NA":
                    break
                values.append(int(raw))
            elif field in category_codes:
                code = category_codes[field].get(raw)
                if code is None:
                    break
                values.append(code)
        # Only complete rows (one value per requested field) are usable.
        if len(values) == len(fields_to_use):
            us.append(values)
            vs.append(label)
    xs_dict_all_years[tract] = np.array(us)
    ys_dict_all_years[tract] = np.array(vs)

reduced_xs = {}
reduced_ys = {}

for tract in xs_dict_all_years:
    ys = ys_dict_all_years[tract]
    if len(ys) < min_rows_per_tract:
        continue
    # A classifier needs both outcomes present: skip tracts that are all 1s *or* all 0s
    # (the latter would otherwise make LogisticRegression.fit raise).
    positives = int(np.sum(ys))
    if positives == 0 or positives == len(ys):
        continue
    reduced_xs[tract] = xs_dict_all_years[tract]
    reduced_ys[tract] = ys

# Fit one logistic regression per retained tract and print its coefficients and intercept.
# (The earlier predict_proba call was dropped: its result was never used.)
for tract in reduced_xs:
    model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
    model.fit(reduced_xs[tract], reduced_ys[tract])
    print(f'Tract: {tract}, slope: {model.coef_}, intercept: {model.intercept_}')