In [2]:
# --- Data manipulation ---
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings('ignore')

# --- Data Processing ---
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn import set_config

In [5]:
file = "raw_data/Washington_State_HDMA-2016.csv"
data_original = pd.read_csv(file, decimal=',')

FileNotFoundError: [Errno 2] No such file or directory: 'raw_data/Washington_State_HDMA-2016.csv'

In [3]:
# --- Data manipulation ---
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

def clean_data(data: pd.DataFrame):
    # Define list of columns to filter for "Information not provided" values
    filter_cols = ['applicant_ethnicity_name', 'applicant_race_name_1', 'applicant_sex_name',
                   'co_applicant_ethnicity_name', 'co_applicant_race_name_1', 'co_applicant_sex_name']

    # Filter for rows where "action_taken_name" is not equal to certain values
    data = data.loc[~data['action_taken_name'].isin(['Application withdrawn by applicant', 'Loan purchased by the institution', 'File closed for incompleteness'])]

    # Filter for rows where values in specified columns are not equal to "Information not provided"
    for col in filter_cols:
        data = data.loc[~data[col].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]

        # Create a dictionary of county to region mappings
    county_to_region = {
        'Whatcom County': 'Northern Cascades','Skagit County': 'Northern Cascades','Snohomish County': 'Northern Cascades',
        'King County': 'Western Region','Pierce County': 'Western Region','Kitsap County': 'Western Region',
        'Island County': 'Western Region','San Juan County': 'Western Region','Jefferson County': 'Western Region',
        'Clallam County': 'Western Region','Mason County': 'Olympic Peninsula','Clark County': 'Southwest Washington',
        'Cowlitz County': 'Southwest Washington','Wahkiakum County': 'Southwest Washington','Skamania County': 'Southwest Washington',
        'Adams County': 'Eastern Washington','Asotin County': 'Eastern Washington','Benton County': 'Eastern Washington',
        'Chelan County': 'Eastern Washington','Columbia County': 'Eastern Washington','Douglas County': 'Eastern Washington',
        'Ferry County': 'Eastern Washington','Franklin County': 'Eastern Washington','Garfield County': 'Eastern Washington',
        'Grant County': 'Eastern Washington','Kittitas County': 'Eastern Washington','Klickitat County': 'Eastern Washington',
        'Lincoln County': 'Eastern Washington','Okanogan County': 'Eastern Washington','Pend Oreille County': 'Eastern Washington',
        'Spokane County': 'Eastern Washington','Stevens County': 'Eastern Washington','Walla Walla County': 'Eastern Washington',
        'Whitman County': 'Eastern Washington','Yakima County': 'Eastern Washington','Thurston County':'Western Region',
        'Lewis County': 'Western Region','Grays Harbor County': 'Western Region','Pacific County': 'Southwest Washington',
        'Seattle, Bellevue, Everett': 'Western Region','Tacoma, Lakewood':'Northern Cascades','Portland, Vancouver, Hillsboro':'Northern Cascades',
        'Spokane, Spokane Valley':'Eastern Washington'
    }

    # Add a new column to your dataframe containing the region for each county
    data['region'] = data['county_name'].map(county_to_region)
    
    # Drop rows with missing values in "county_name" column
    data = data.dropna(subset=['county_name'])

    # Create new column "loan_status" based on "action_taken_name"
    data['loan_status'] = np.where(data['action_taken_name'] == 'Loan originated', 'approved', 'not approved')

    # Drop irrelevant columns
    drop_cols = ['applicant_race_name_5', 'applicant_race_name_4', 'applicant_race_name_3',
                 'applicant_race_name_2', 'co_applicant_race_name_5', 'co_applicant_race_name_4',
                 'co_applicant_race_name_3', 'co_applicant_race_name_2', 'denial_reason_name_3',
                 'denial_reason_name_2', 'denial_reason_name_1', 'rate_spread', 'edit_status_name',
                 'state_abbr', 'respondent_id', 'agency_abbr', 'as_of_year', 'application_date_indicator',
                 'state_name', 'sequence_number', 'census_tract_number', 'action_taken_name', 'purchaser_type_name',
                 'county_name','msamd_name']
    
    data = data.drop(columns=drop_cols)

    # Winsorize numeric columns
    data.select_dtypes(exclude=['object']).apply(lambda x: winsorize(x, limits=[0.05, 0.05]), axis=0, raw=True)

    # create a list of columns to be converted
    cols_to_convert = [
        'tract_to_msamd_income','population','minority_population','number_of_owner_occupied_units',
        'number_of_1_to_4_family_units','loan_amount_000s','hud_median_family_income','applicant_income_000s'
    ]

    # use the astype() method to convert the dtype of columns
    data[cols_to_convert] = data[cols_to_convert].astype('float')

    return data

In [4]:
data = clean_data(data_original)

In [5]:
data.head(2)

Unnamed: 0,tract_to_msamd_income,population,minority_population,number_of_owner_occupied_units,number_of_1_to_4_family_units,loan_amount_000s,hud_median_family_income,applicant_income_000s,property_type_name,preapproval_name,...,hoepa_status_name,co_applicant_sex_name,co_applicant_race_name_1,co_applicant_ethnicity_name,applicant_sex_name,applicant_race_name_1,applicant_ethnicity_name,agency_name,region,loan_status
1,83.370003,4915.0,23.99,1268.0,1777.0,240.0,57900.0,42.0,One-to-four family dwelling (other than manufa...,Not applicable,...,Not a HOEPA loan,No co-applicant,No co-applicant,No co-applicant,Male,White,Hispanic or Latino,Department of Housing and Urban Development,Eastern Washington,approved
2,91.129997,5075.0,11.82,1136.0,1838.0,241.0,73300.0,117.0,One-to-four family dwelling (other than manufa...,Not applicable,...,Not a HOEPA loan,Female,White,Not Hispanic or Latino,Male,White,Not Hispanic or Latino,Department of Housing and Urban Development,Southwest Washington,approved


In [6]:
data.shape

(283225, 24)

In [1]:
for col in data.columns:
    print(col)

NameError: name 'data' is not defined

In [7]:
# numerical = [
#     'tract_to_msamd_income','population','minority_population','number_of_owner_occupied_units',
#     'number_of_1_to_4_family_units','loan_amount_000s','hud_median_family_income','applicant_income_000s'
# ]

# categorical = [
#     'purchaser_type_name','property_type_name','preapproval_name','owner_occupancy_name','msamd_name',
#     'loan_type_name','loan_purpose_name','lien_status_name','hoepa_status_name','county_name',
#     'co_applicant_sex_name','co_applicant_race_name_1','co_applicant_ethnicity_name','applicant_sex_name',
#     'applicant_race_name_1','applicant_ethnicity_name','agency_name'
# ]

# preprocessor = make_column_transformer(
#     (num_pipe, numerical),
#     (cat_pipe, categorical),
#     remainder='passthrough'
# )

# preprocessor

In [8]:
X = data.drop(columns='loan_status')
y = data['loan_status']

In [9]:
num_pipe = make_pipeline(SimpleImputer(strategy='median'),StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])

cat_pipe = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object'])

preprocessor = make_column_transformer(
    (num_pipe, num_col),
    (cat_pipe, cat_col),
    remainder='passthrough',
    sparse_threshold=0.1
)

preprocessor

In [10]:
X_proc = pd.DataFrame(preprocessor.fit_transform(X), columns=preprocessor.get_feature_names_out())
X_proc.head()

Unnamed: 0,pipeline__tract_to_msamd_income,pipeline__population,pipeline__minority_population,pipeline__number_of_owner_occupied_units,pipeline__number_of_1_to_4_family_units,pipeline__loan_amount_000s,pipeline__hud_median_family_income,pipeline__applicant_income_000s,onehotencoder__purchaser_type_name_Affiliate institution,"onehotencoder__purchaser_type_name_Commercial bank, savings bank or savings association",...,onehotencoder__applicant_race_name_1_White,onehotencoder__applicant_ethnicity_name_Hispanic or Latino,onehotencoder__applicant_ethnicity_name_Not Hispanic or Latino,onehotencoder__applicant_ethnicity_name_Not applicable,onehotencoder__agency_name_Consumer Financial Protection Bureau,onehotencoder__agency_name_Department of Housing and Urban Development,onehotencoder__agency_name_Federal Deposit Insurance Corporation,onehotencoder__agency_name_Federal Reserve System,onehotencoder__agency_name_National Credit Union Administration,onehotencoder__agency_name_Office of the Comptroller of the Currency
0,-0.808809,-0.22709,-0.042841,-0.242257,-0.069575,-0.075359,-1.443497,-0.592029,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.543652,-0.131155,-0.849096,-0.502244,0.019678,-0.074134,-0.25859,0.068782,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.89402,-0.066399,-0.936546,0.618456,0.40888,0.141471,0.110732,0.04235,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.430187,-0.311033,-0.380713,-0.86071,-0.720684,-0.117011,-0.335532,-0.186731,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.130881,-0.329021,0.558704,-0.106355,-0.559735,0.066744,1.049424,-0.186731,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
from imblearn.over_sampling import SMOTE

# instantiate the SMOTE function with 'minority' sampling strategy
sm = SMOTE(sampling_strategy='minority')

# apply SMOTE to the data
X_sm, y_sm = sm.fit_resample(X_proc, y)

In [13]:
X_sm.shape

(443986, 126)

In [14]:
y_sm.value_counts()

approved        221993
not approved    221993
Name: loan_status, dtype: int64

In [16]:
y_sm.head(2)

0    approved
1    approved
Name: loan_status, dtype: object

In [15]:
def preprocess_features(X: pd.DataFrame):
    
    def create_sklearn_preprocessor():
        num_pipe = make_pipeline(SimpleImputer(strategy='median'),StandardScaler())
        num_col = make_column_selector(dtype_include=['float64'])

        cat_pipe = OneHotEncoder()
        cat_col = make_column_selector(dtype_include=['object'])

        preprocessor = make_column_transformer(
            (num_pipe, num_col),
            (cat_pipe, cat_col),
            remainder='passthrough',
            sparse_threshold=0.1
        )
        
        return preprocessor
    
    preprocessor = create_sklearn_preprocessor()
    
    X_proc = pd.DataFrame(preprocessor.fit_transform(X), columns=preprocessor.get_feature_names_out())
    
    # instantiate the SMOTE function with 'minority' sampling strategy
sm = SMOTE(sampling_strategy='minority')

# apply SMOTE to the data
X_sm, y_sm = sm.fit_resample(X_proc, y)