# Setup & Environment

## Scripts

In [1]:
# code to download the file within your Python IDE
import json, requests, urllib, urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/the-markup/investigation-redlining/main/utils/categorize_data.py", "categorize_data.py")
urllib.request.urlretrieve("https://raw.githubusercontent.com/the-markup/investigation-redlining/main/utils/clean_data.py", "clean_data.py")
urllib.request.urlretrieve("https://raw.githubusercontent.com/the-markup/investigation-redlining/main/utils/use_regression.py", "use_regression.py")

('use_regression.py', <http.client.HTTPMessage at 0x790ef0162770>)

In [2]:
from categorize_data import *
from clean_data import *
from use_regression import *

## Data

In [4]:
import pandas as pd
from dask import dataframe as dtf
hmda19_df = dtf.read_csv("/content/drive/MyDrive/2-S24/EoC II/for-students/mar21/markup/2019_public_lar_csv.csv", dtype=str)
hmda19_df.head()

Unnamed: 0,activity_year,lei,derived_msa_md,state_code,county_code,census_tract,conforming_loan_limit,derived_loan_product_type,derived_dwelling_category,derived_ethnicity,...,denial_reason_2,denial_reason_3,denial_reason_4,tract_population,tract_minority_population_percent,ffiec_msa_md_median_family_income,tract_to_msa_income_percentage,tract_owner_occupied_units,tract_one_to_four_family_homes,tract_median_age_of_housing_units
0,2019,549300BQPSF250Q8KQ79,19740,CO,8059.0,8059010402.0,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,5450,32.17,92800,59,983,1521,42
1,2019,549300BQPSF250Q8KQ79,19740,CO,8059.0,8059009851.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,3838,12.58,92800,146,1419,1459,20
2,2019,549300BQPSF250Q8KQ79,0,CO,,,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,0,0.0,0,0,0,0,0
3,2019,549300BQPSF250Q8KQ79,19740,CO,8059.0,8059012046.0,C,Conventional:Subordinate Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,2112,21.92,92800,100,784,837,39
4,2019,549300BQPSF250Q8KQ79,0,CO,,,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,0,0.0,0,0,0,0,0


# Data Cleaning

## Subset

In [5]:
remove_cols = ['derived_loan_product_type', 'derived_dwelling_category', 'derived_ethnicity',
               'derived_race', 'derived_sex',
               'applicant_ethnicity_2','applicant_ethnicity_3', 'applicant_ethnicity_4', 'applicant_ethnicity_5',
               'co_applicant_ethnicity_2', 'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4',
               'co_applicant_ethnicity_5',
               'applicant_race_2','applicant_race_3', 'applicant_race_4', 'applicant_race_5',
               'co_applicant_race_2', 'co_applicant_race_3', 'co_applicant_race_4',
               'co_applicant_race_5']

new_headers = []
for column in hmda19_df.columns:
    if column not in remove_cols:
        new_headers.append(column)

print(len(new_headers))

78


In [6]:
hmda19_df2 = hmda19_df[new_headers].copy()
del hmda19_df

hmda19_df2.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 78 entries, activity_year to tract_median_age_of_housing_units
dtypes: object(78)

## Location

In [7]:
location_df = hmda19_df2.groupby(by = ['county_code', 'census_tract'], dropna = False).size().reset_index()
location_df = location_df.rename(columns={0:'count'})
# location_df = location_df.compute()

In [8]:
### Replacing the Nulls with other text so that the function works, Keeping the Nulls seperate from the "NAs"
location_df = location_df.replace(to_replace = 'Na', value = 'ii-ii')
location_df = location_df.fillna('00-00')
location_df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 3 entries, county_code to count
dtypes: object(2), int64(1)

In [9]:
location_df['location_code'] = location_df.apply(clean_location, axis = 1)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



In [10]:
### Split location column for state and county fips codes
location_df['state_fips'] = location_df['location_code'].str[0:2]
location_df['county_fips'] = location_df['location_code'].str[2:5]

### Number of records with no county code and census tract information
nulls_df = location_df[(location_df['state_fips'] == '--') & (location_df['county_fips'] == '---')]
print('Number of records with location nulls: ' + str(nulls_df['count'].sum()))

### Remove columns that are no longer needed
# location_df2 = location_df.drop(columns = ['count', 'location_code'], axis = 1)
del location_df['count']
del location_df['location_code']

Number of records with location nulls: dd.Scalar<series-..., dtype=int64>


In [11]:
### Replace two dashes and three dashes data points with NaN
location_df = location_df.replace(to_replace = '--', value = np.nan)
location_df = location_df.replace(to_replace = '---', value = np.nan)

### Replace '00-00' and 'ii-ii' with the orginal data points to join back to the orginal HMDA data
location_df = location_df.replace(to_replace = '00-00', value = np.nan)
location_df = location_df.replace(to_replace = 'ii-ii', value = 'Na')

# location_df.head(1)

In [12]:
hmda19_df4 = dtf.merge(hmda19_df2, location_df, how = 'left', on = ['county_code', 'census_tract'])

nulls_records = (hmda19_df4['county_fips'].isnull() & hmda19_df4['state_fips'].isnull()).values.sum()

### This number matches the one from above:
print('Null Records that don\'t have fips data: ' + str((nulls_records)))

Null Records that don't have fips data: dask.array<sum-aggregate, shape=(), dtype=int64, chunksize=(), chunktype=numpy.ndarray>


## Race & Ethnicity

In [13]:
### Group race and ethnicity for all unique combinations
main_race_eth = hmda19_df2.groupby(by = ['applicant_race_1', 'applicant_ethnicity_1'],
                             dropna = False).size().reset_index()
main_race_eth = main_race_eth.rename(columns={0:'count'})
### Replace NAs with 000 for cleaning purposes
main_race_eth = main_race_eth.fillna('000')
main_race_eth

Unnamed: 0_level_0,applicant_race_1,applicant_ethnicity_1,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,int64
,...,...,...


In [14]:
### Apply clean_race_ethnicity function for the r/e dataframe
main_race_eth['app_race_ethnicity'] = main_race_eth.apply(clean_race_ethnicity, axis = 1)

### Replace 000 with NaN to join back with HMDA data
main_race_eth = main_race_eth.replace(to_replace = '000', value = np.nan)
### Drop Count Column
# main_race_eth = main_race_eth.drop(columns = ['count'], axis = 1)
del main_race_eth['count']

hmda19_df2 = dtf.merge(hmda19_df2, main_race_eth, how = 'left', on = ['applicant_race_1', 'applicant_ethnicity_1'])

hmda19_df2['app_race_ethnicity'].value_counts(dropna = False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Dask Series Structure:
npartitions=1
    int64
      ...
Name: app_race_ethnicity, dtype: int64
Dask Name: value-counts-agg, 15 graph layers

In [15]:
coapp_race_ethnicity = hmda19_df2.groupby(by = ['co_applicant_race_1', 'co_applicant_ethnicity_1'],
                                    dropna = False).size().reset_index()
coapp_race_ethnicity = coapp_race_ethnicity.rename(columns={0:'count'})
coapp_race_ethnicity = coapp_race_ethnicity.fillna('000')
coapp_race_ethnicity

Unnamed: 0_level_0,co_applicant_race_1,co_applicant_ethnicity_1,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,int64
,...,...,...


In [16]:
### Using clean_race_ethnicity function for the coapp r/e dataframe, it has a no co-app flag
coapp_race_ethnicity['coapp_race_ethnicity'] = coapp_race_ethnicity.apply(clean_race_ethnicity, axis = 1)
del coapp_race_ethnicity['count']
# coapp_race_ethnicity = coapp_race_ethnicity.drop(columns = ['count'], axis = 1)
coapp_race_ethnicity = coapp_race_ethnicity.replace(to_replace = '000', value = np.nan)

hmda19_df2 = dtf.merge(hmda19_df2, coapp_race_ethnicity, how = 'left',
                      on = ['co_applicant_race_1', 'co_applicant_ethnicity_1'])

hmda19_df2['coapp_race_ethnicity'].value_counts(dropna = False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Dask Series Structure:
npartitions=1
    int64
      ...
Name: coapp_race_ethnicity, dtype: int64
Dask Name: value-counts-agg, 25 graph layers

In [17]:
### group all instances of main applicants and co-applicants races and ethnicities
coapp_same_race = hmda19_df2.groupby(by = ['app_race_ethnicity', 'coapp_race_ethnicity'],
                               dropna = False).size().reset_index()
coapp_same_race = coapp_same_race.rename(columns={0:'count'})
# coapp_same_race.sample(2, random_state = 303)
coapp_same_race

Unnamed: 0_level_0,app_race_ethnicity,coapp_race_ethnicity,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,int64
,...,...,...


In [18]:
### Find records where applicant and co-applicant are the same
coapp_same_race['coapp_same_race'] = coapp_same_race.apply(find_same_race, axis = 1)

del coapp_same_race['count']
# coapp_same_race = coapp_same_race.drop(columns = ['count'], axis = 1)
hmda19_df2 = dtf.merge(hmda19_df2, coapp_same_race, how = 'left',
                    on = ['app_race_ethnicity', 'coapp_race_ethnicity'])

hmda19_df2['coapp_same_race'].value_counts(dropna = False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Dask Series Structure:
npartitions=1
    int64
      ...
Name: coapp_same_race, dtype: int64
Dask Name: value-counts-agg, 33 graph layers

## Credit Models

In [19]:
credit_models = hmda19_df2.groupby(by = ['applicant_credit_score_type'],
                dropna = False).size().reset_index()
credit_models = credit_models.rename(columns={0:'count'})
credit_models

Unnamed: 0_level_0,applicant_credit_score_type,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,int64
,...,...


In [20]:
### Using function to standardize credit model
credit_models['app_credit_model'] = credit_models.apply(clean_credit_model, axis = 1)

# credit_models = credit_models.drop(columns = ['count'], axis = 1)
del credit_models['count']

hmda19_df2 = dtf.merge(hmda19_df2, credit_models, how = 'left', on = ['applicant_credit_score_type'])

hmda19_df2['app_credit_model'].value_counts(dropna = False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Dask Series Structure:
npartitions=1
    int64
      ...
Name: app_credit_model, dtype: int64
Dask Name: value-counts-agg, 41 graph layers

## Co-Applicants

In [21]:
coapp_cols = ['coapp_race_ethnicity', 'co_applicant_sex', 'co_applicant_age', 'co_applicant_credit_score_type']

coapp_comb_df = hmda19_df2.groupby(by = coapp_cols, dropna = False).size().reset_index()
coapp_comb_df = coapp_comb_df.rename(columns={0:'count'})
# coapp_comb_df.head(1)
coapp_comb_df

Unnamed: 0_level_0,coapp_race_ethnicity,co_applicant_sex,co_applicant_age,co_applicant_credit_score_type,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,object,object,object,int64
,...,...,...,...,...


In [22]:
### Run function to find co-applicants
coapp_comb_df['co_applicant'] = coapp_comb_df.apply(find_coapplicants, axis = 1)

# coapp_comb_df = coapp_comb_df.drop(columns = ['count'], axis = 1)
del coapp_comb_df['count']

hmda19_df2 = dtf.merge(hmda19_df2, coapp_comb_df, how = 'left', on = coapp_cols)

hmda19_df2['co_applicant'].value_counts(dropna = False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Dask Series Structure:
npartitions=1
    int64
      ...
Name: co_applicant, dtype: int64
Dask Name: value-counts-agg, 49 graph layers

## Action Taken

In [23]:
action_taken = hmda19_df2['action_taken'].value_counts(dropna = False).reset_index()
action_taken = action_taken.rename(columns = {'index': 'action_taken', 'action_taken': 'count'})
# action_taken.head(2)
action_taken

Unnamed: 0_level_0,action_taken,count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,int64
,...,...


In [24]:
### Clean Outcomes
action_taken['loan_outcome'] = action_taken.apply(clean_outcomes, axis = 1)

del action_taken['count']
# action_taken = action_taken.drop(columns = ['count'], axis = 1)

hmda19_df2 = dtf.merge(hmda19_df2, action_taken, how = 'left', on = ['action_taken'])

hmda19_df2['loan_outcome'].value_counts(dropna = False)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Dask Series Structure:
npartitions=1
    int64
      ...
Name: loan_outcome, dtype: int64
Dask Name: value-counts-agg, 58 graph layers

## Underwriter

In [25]:
aus = ['aus_1', 'aus_2', 'aus_3', 'aus_4', 'aus_5']
aus_df = hmda19_df2.groupby(by=aus, dropna=False).size().reset_index()
aus_df = aus_df.rename(columns={0:'count'})
del aus_df['count']
# aus_df = aus_df.compute()
aus_df

Unnamed: 0_level_0,aus_1,aus_2,aus_3,aus_4,aus_5
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,object,object,object,object
,...,...,...,...,...


In [26]:
### Calculate unique values and nulls
aus_df = find_aus_patterns(aus_df)
aus_df

KeyboardInterrupt: 

In [27]:
def find_aus_patterns2(df):
    df_container = []

    for index_num in df.index:
        single_row_df = df.loc[[index_num]].copy()

        row = dtf.from_pandas(single_row_df, npartitions=1)  # Convert to dask dataframe
        valuescount_df = row.value_counts(dropna=False)
        num_unqiue_values = valuescount_df.index.nunique(dropna=False)

        try:
            number_nulls = valuescount_df[(valuescount_df.index.isnull())].values[0][0]
        except IndexError:
            number_nulls = 0

        single_row_df['number_of_values'] = num_unqiue_values
        single_row_df['number_of_nulls'] = number_nulls
        df_container.append(single_row_df)

    df = dtf.concat(df_container)

    return df

In [28]:
def find_aus_patterns3(df):
    df_dask = dtf.from_pandas(df, npartitions=10)  # Specify the number of partitions

    def process_row(row):
        values_count = row.value_counts(dropna=False)
        num_unique_values = values_count.index.nunique(dropna=False)

        number_nulls = values_count.get(None, 0)

        row['number_of_values'] = num_unique_values
        row['number_of_nulls'] = number_nulls

        return row

    df_processed = df_dask.map_partitions(lambda part: part.apply(process_row, axis=1))
    df_result = df_processed.compute()

    return df_result

In [None]:
aus_df = find_aus_patterns3(aus_df)
aus_df

In [None]:
def find_aus_patterns4(df):
    df_container = []

    for index_num in df.index:
        single_row_df = df.loc[[index_num]].copy()

        row = dtf.from_pandas(single_row_df, npartitions=10)  # Convert to dask dataframe
        valuescount_df = row.value_counts(dropna=False)
        num_unqiue_values = valuescount_df.index.nunique(dropna=False)

        try:
            number_nulls = valuescount_df[(valuescount_df.index.isnull())].values[0][0]
        except IndexError:
            number_nulls = 0

        single_row_df['number_of_values'] = num_unqiue_values
        single_row_df['number_of_nulls'] = number_nulls
        df_container.append(single_row_df)

    df = dtf.concat(df_container)

    return df  # Return the Dask DataFrame without calling compute()

In [None]:
def find_aus_patterns5(df):
    df_container = []

    for index_num in df.index:
        single_row_df = df.loc[[index_num]].copy()

        row = dtf.from_pandas(single_row_df, npartitions=1)  # Convert to Dask dataframe
        values_count_df = row.value_counts(dropna=False)
        num_unique_values = values_count_df.index.nunique(dropna=False)

        number_nulls = values_count_df.loc[None].values[0] if None in values_count_df.index else 0

        single_row_df['number_of_values'] = num_unique_values
        single_row_df['number_of_nulls'] = number_nulls
        df_container.append(single_row_df)

    df_concat = dtf.concat(df_container)

    return df_concat  # Return the Dask DataFrame without calling compute()

In [None]:
aus_df = find_aus_patterns5(aus_df)
aus_df

In [None]:
### Categorize AUS
aus_df['aus_cat'] = aus_df.apply(clean_aus, axis = 1)
aus_df

In [None]:
# delete columns
del aus_df['number_of_values']
del aus_df['number_of_nulls']

# show output
aus_df

In [None]:
hmda19_df2 = dtf.merge(hmda19_df2, aus_df, how = 'left', on = aus)
hmda19_df2['aus_cat'].value_counts(dropna = False)
hmda19_df2 # inspect output

# Final Outputs

In [None]:
hmda19_df2.info()

In [None]:
hmda19_df2.to_parquet("/content/drive/MyDrive/2-S24/EoC II/for-students/mar21/output.parquet", engine="pyarrow")

In [None]:
hmda19_df2.to_csv("/content/drive/MyDrive/2-S24/EoC II/for-students/mar21/hmdaOutput.csv", index=False)
hmda19_df2.info()