In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np

In [2]:
df=pd.read_csv("Resources/allloans_str.csv")

In [3]:
retypes={'agency_code':'str'
,'loan_type':'str'
,'property_type':'str'
,'loan_amount_000s':'int64'
,'preapproval':'str'
,'action_taken':'int8'
,'state_code':'str'
,'county_code':'str'
,'census_tract_number':'float64'
,'applicant_ethnicity':'str'
,'co_applicant_ethnicity':'str'
,'applicant_race_1':'str'
,'co_applicant_race_1':'str'
,'applicant_sex':'str'
,'co_applicant_sex':'str'
,'applicant_income_000s':'int64'
,'purchaser_type':'str'}

In [4]:
# categorical fields to strings for one-hot encoding
df = df.astype(retypes)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17744550 entries, 0 to 17744549
Data columns (total 19 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   agency_code             object 
 2   loan_type               object 
 3   property_type           object 
 4   loan_purpose            int64  
 5   loan_amount_000s        int64  
 6   preapproval             object 
 7   action_taken            int8   
 8   state_code              object 
 9   county_code             object 
 10  census_tract_number     float64
 11  applicant_ethnicity     object 
 12  co_applicant_ethnicity  object 
 13  applicant_race_1        object 
 14  co_applicant_race_1     object 
 15  applicant_sex           object 
 16  co_applicant_sex        object 
 17  applicant_income_000s   int64  
 18  purchaser_type          object 
dtypes: float64(1), int64(4), int8(1), object(13)
memory usage: 2.4+ GB


In [8]:
X = df.drop(columns=['action_taken','Unnamed: 0','state_code','county_code','census_tract_number'])
X

Unnamed: 0,agency_code,loan_type,property_type,loan_purpose,loan_amount_000s,preapproval,applicant_ethnicity,co_applicant_ethnicity,applicant_race_1,co_applicant_race_1,applicant_sex,co_applicant_sex,applicant_income_000s,purchaser_type
0,5,1,1,1,144,2,2,2,5,5,1,2,154,0
1,3,1,1,1,300,3,2,5,5,8,1,5,119,1
2,5,1,1,1,263,3,1,5,5,8,1,5,141,7
3,5,1,1,1,187,1,2,5,5,8,1,5,76,1
4,5,1,1,1,438,1,3,3,6,6,2,1,225,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17744545,7,1,1,1,76,3,2,2,5,5,2,1,80,7
17744546,7,1,1,1,195,2,2,2,5,5,2,1,45,1
17744547,9,3,1,1,480,3,2,2,5,5,2,1,225,2
17744548,7,1,1,1,627,3,2,5,2,8,1,5,191,6


In [9]:
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['loan_purpose', 'loan_amount_000s', 'applicant_income_000s',
       'agency_code_1', 'agency_code_2', 'agency_code_3', 'agency_code_5',
       'agency_code_7', 'agency_code_9', 'loan_type_1', 'loan_type_2',
       'loan_type_3', 'loan_type_4', 'property_type_1', 'property_type_2',
       'preapproval_1', 'preapproval_2', 'preapproval_3',
       'applicant_ethnicity_1', 'applicant_ethnicity_2',
       'applicant_ethnicity_3', 'applicant_ethnicity_4',
       'co_applicant_ethnicity_1', 'co_applicant_ethnicity_2',
       'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4',
       'co_applicant_ethnicity_5', 'applicant_race_1_1', 'applicant_race_1_2',
       'applicant_race_1_3', 'applicant_race_1_4', 'applicant_race_1_5',
       'applicant_race_1_6', 'applicant_race_1_7', 'co_applicant_race_1_1',
       'co_applicant_race_1_2', 'co_applicant_race_1_3',
       'co_applicant_race_1_4', 'co_applicant_race_1_5',
       'co_applicant_race_1_6', 'co_applicant_race_1_7',
       'co_app

Unnamed: 0,loan_purpose,loan_amount_000s,applicant_income_000s,agency_code_1,agency_code_2,agency_code_3,agency_code_5,agency_code_7,agency_code_9,loan_type_1,...,purchaser_type_0,purchaser_type_1,purchaser_type_2,purchaser_type_3,purchaser_type_4,purchaser_type_5,purchaser_type_6,purchaser_type_7,purchaser_type_8,purchaser_type_9
0,1,144,154,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,1,300,119,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
2,1,263,141,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,187,76,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,1,438,225,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17744545,1,76,80,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
17744546,1,195,45,0,0,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
17744547,1,480,225,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
17744548,1,627,191,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
