## Load the HMDA Dataset

In [1]:
ls -l

total 11964
-rw-rw-r-- 1 satish satish 12246834 Jul  4 22:08  mortgage_data_balanced.pkl.gz
-rw-rw-r-- 1 satish satish     2417 Jul  4 22:18 'PARTIAL1. Load the HMDA dataset into a pandas dataframe.ipynb'


In [2]:
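# Decompress the gzipped pickle in the current working directory.
# Note: gunzip replaces the .gz archive with the decompressed .pkl, so re-running this cell requires restoring the archive first.
# (If running on Google Colab, mount Google Drive first: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=u22w3BFiOveA&line=1&uniqifier=1)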

!gunzip mortgage_data_balanced.pkl.gz

In [3]:
ls -l

total 94596
-rw-rw-r-- 1 satish satish 96859478 Jul  4 22:08  mortgage_data_balanced.pkl
-rw-rw-r-- 1 satish satish     2417 Jul  4 22:18 'PARTIAL1. Load the HMDA dataset into a pandas dataframe.ipynb'


In [6]:
import pickle
import pandas as pd

In [7]:
# Load the pickled dataframe using the "read_pickle()" function from "pandas".
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('mortgage_data_balanced.pkl')

## Inspect the Dataset

In [9]:
# The "shape" property of a dataframe returns a (number of rows, number of columns) tuple
df.shape

(165950, 78)

In [10]:
# The "head()" method of a dataframe returns its first five rows by default
df.head()

Unnamed: 0,as_of_year,respondent_id,agency_name,agency_abbr,agency_code,loan_type_name,loan_type,property_type_name,property_type,loan_purpose_name,...,edit_status_name,edit_status,sequence_number,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,application_date_indicator
0,2011,9179,Office of the Comptroller of the Currency,OCC,1,Conventional,1,One-to-four family dwelling (other than manufa...,1,Refinancing,...,,,1653,5972.0,1.89,66600.0,106.129997,1973.0,2194.0,0
1,2011,476810,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Refinancing,...,,,31273,4188.0,4.66,49500.0,106.169998,1331.0,3113.0,0
2,2011,451965,Consumer Financial Protection Bureau,CFPB,9,Conventional,1,One-to-four family dwelling (other than manufa...,1,Refinancing,...,,,1164651,9565.0,42.529999,61200.0,100.330002,2504.0,3149.0,0
3,2011,30052,Federal Deposit Insurance Corporation,FDIC,3,Conventional,1,One-to-four family dwelling (other than manufa...,1,Home improvement,...,,,515,5928.0,9.99,67400.0,170.520004,1755.0,2102.0,0
4,2011,2888798,Federal Reserve System,FRS,2,FHA-insured,2,One-to-four family dwelling (other than manufa...,1,Home purchase,...,,,44491,4021.0,3.56,54200.0,139.789993,1296.0,1487.0,0


In [11]:
# The "columns" property of a dataframe returns the names of its columns
# Check https://files.consumerfinance.gov/hmda-historic-data-dictionaries/lar_record_codes.pdf for more details
df.columns

Index(['as_of_year', 'respondent_id', 'agency_name', 'agency_abbr',
       'agency_code', 'loan_type_name', 'loan_type', 'property_type_name',
       'property_type', 'loan_purpose_name', 'loan_purpose',
       'owner_occupancy_name', 'owner_occupancy', 'loan_amount_000s',
       'preapproval_name', 'preapproval', 'action_taken_name', 'action_taken',
       'msamd_name', 'msamd', 'state_name', 'state_abbr', 'state_code',
       'county_name', 'county_code', 'census_tract_number',
       'applicant_ethnicity_name', 'applicant_ethnicity',
       'co_applicant_ethnicity_name', 'co_applicant_ethnicity',
       'applicant_race_name_1', 'applicant_race_1', 'applicant_race_name_2',
       'applicant_race_2', 'applicant_race_name_3', 'applicant_race_3',
       'applicant_race_name_4', 'applicant_race_4', 'applicant_race_name_5',
       'applicant_race_5', 'co_applicant_race_name_1', 'co_applicant_race_1',
       'co_applicant_race_name_2', 'co_applicant_race_2',
       'co_applicant_race_name_

In [12]:
# The "dtypes" property of a dataframe returns the data type of each column
df.dtypes

as_of_year                          int64
respondent_id                      object
agency_name                        object
agency_abbr                        object
agency_code                         int64
                                   ...   
hud_median_family_income          float64
tract_to_msamd_income             float64
number_of_owner_occupied_units    float64
number_of_1_to_4_family_units     float64
application_date_indicator          int64
Length: 78, dtype: object

In [13]:
# "info()" prints a summary of the dataframe: index range, column count, and each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165950 entries, 0 to 165949
Data columns (total 78 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   as_of_year                      165950 non-null  int64  
 1   respondent_id                   165950 non-null  object 
 2   agency_name                     165950 non-null  object 
 3   agency_abbr                     165950 non-null  object 
 4   agency_code                     165950 non-null  int64  
 5   loan_type_name                  165950 non-null  object 
 6   loan_type                       165950 non-null  int64  
 7   property_type_name              165950 non-null  object 
 8   property_type                   165950 non-null  int64  
 9   loan_purpose_name               165950 non-null  object 
 10  loan_purpose                    165950 non-null  int64  
 11  owner_occupancy_name            165950 non-null  object 
 12  owner_occupancy 