In [1]:
import pandas as pd

from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings('ignore')

# Washington State HDMA 2016

In [2]:
file = "data/Washington_State_HDMA-2016.csv"
data_original = pd.read_csv(file, decimal=',')

In [3]:
data = data_original.copy()
data.shape

(466566, 47)

In [6]:
data.shape

(283225, 24)

In [7]:
data.columns.to_list()

['tract_to_msamd_income',
 'population',
 'minority_population',
 'number_of_owner_occupied_units',
 'number_of_1_to_4_family_units',
 'loan_amount_000s',
 'hud_median_family_income',
 'applicant_income_000s',
 'property_type_name',
 'preapproval_name',
 'owner_occupancy_name',
 'loan_type_name',
 'loan_purpose_name',
 'lien_status_name',
 'hoepa_status_name',
 'co_applicant_sex_name',
 'co_applicant_race_name_1',
 'co_applicant_ethnicity_name',
 'applicant_sex_name',
 'applicant_race_name_1',
 'applicant_ethnicity_name',
 'agency_name',
 'Region',
 'loan_status']

## Cleaning Data

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 283225 entries, 1 to 466565
Data columns (total 24 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   tract_to_msamd_income           283152 non-null  float64
 1   population                      283155 non-null  float64
 2   minority_population             283155 non-null  float64
 3   number_of_owner_occupied_units  283150 non-null  float64
 4   number_of_1_to_4_family_units   283154 non-null  float64
 5   loan_amount_000s                283225 non-null  float64
 6   hud_median_family_income        283157 non-null  float64
 7   applicant_income_000s           260191 non-null  float64
 8   property_type_name              283225 non-null  object 
 9   preapproval_name                283225 non-null  object 
 10  owner_occupancy_name            283225 non-null  object 
 11  loan_type_name                  283225 non-null  object 
 12  loan_purpose_nam

### Check for missing values: Use functions such as isna(), isnull(), or notnull() to detect missing values in your dataset.

In [9]:
null_df = pd.concat([data.isnull().sum().sort_values(ascending=False),
                     data.notnull().sum().sort_values(ascending=True)],
                    axis=1, keys=['is_null', 'not_null'])

null_df['percent_null'] = round((null_df['is_null'] / len(data)) * 100, 2)

null_df

Unnamed: 0,is_null,not_null,percent_null
applicant_income_000s,23034,260191,8.13
number_of_owner_occupied_units,75,283150,0.03
tract_to_msamd_income,73,283152,0.03
number_of_1_to_4_family_units,71,283154,0.03
minority_population,70,283155,0.02
population,70,283155,0.02
hud_median_family_income,68,283157,0.02
co_applicant_sex_name,0,283225,0.0
Region,0,283225,0.0
agency_name,0,283225,0.0


In [10]:
y_df = pd.DataFrame(data['action_taken_name'].value_counts())

y_df['percentage_column'] = y_df['action_taken_name'] / y_df['action_taken_name'].sum() * 100

round(y_df, 2)

KeyError: 'action_taken_name'

In [None]:
cols_drop = ['applicant_race_name_5', 'applicant_race_name_4','applicant_race_name_3','applicant_race_name_2',
            'co_applicant_race_name_5','co_applicant_race_name_4','co_applicant_race_name_3','co_applicant_race_name_2',
            'denial_reason_name_3','denial_reason_name_2','denial_reason_name_1','rate_spread','edit_status_name',
            'respondent_id', 'state_abbr', 'agency_abbr','as_of_year', 'state_name', 'sequence_number'
           ]

In [None]:
data = data[data.columns.difference(cols_drop)]

In [None]:
print(f'data shape: {data.shape}')

data.isnull().sum().sort_values(ascending=False)

### Check for duplicates: Use the duplicated() function to detect duplicate rows in your dataset.

In [None]:
data[data.duplicated()]

In [None]:
print(f'Duplicates with respondant ID: {data_original.duplicated().sum()}')
print(f'Duplicates without respondant ID: {data.duplicated().sum()}')

- Same applicant applied twice as they had a different respondent ID

### Check for inconsistent values: Look for inconsistent values across different columns in your dataset.

In [None]:
# for i in data:
#     print("\n",i,":\n",data[i].unique())

### Handle inconsistent values: Deleting the rows with inconsistent values.

In [None]:
data = data[~data['action_taken_name'].isin(['Application withdrawn by applicant', 'Loan purchased by the institution', 'File closed for incompleteness'])]
data = data[~data['applicant_ethnicity_name'].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]
data = data[~data['applicant_race_name_1'].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]
data = data[~data['applicant_sex_name'].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]
data = data[~data['co_applicant_ethnicity_name'].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]
data = data[~data['co_applicant_race_name_1'].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]
data = data[~data['co_applicant_sex_name'].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]
data =  data.dropna(subset=['county_name'])

In [None]:
print(f'Data Shape: {data.shape}')

### Check for outliers: Look for data points that are significantly different from the other data points in your dataset.

In [None]:
data_original.describe()

In [None]:
df_selected = data.drop(data.select_dtypes('object').columns.tolist(),axis=1)

for i in df_selected:
    data[i] = winsorize(data[i], limits=[0.05, 0.05])

data.describe()

- we're replacing the lowest 5% of values with the value at the 5th percentile, and the highest 5% of values with the value at the 95th percentile, using the 'winsorize' function from 'scipy.stats.mstats'.

In [None]:
data.isnull().sum()

In [None]:
# --- Data manipulation ---
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize

def clean_data(data: pd.DataFrame):
    # Define list of columns to filter for "Information not provided" values
    filter_cols = ['applicant_ethnicity_name', 'applicant_race_name_1', 'applicant_sex_name',
                   'co_applicant_ethnicity_name', 'co_applicant_race_name_1', 'co_applicant_sex_name']

    # Filter for rows where "action_taken_name" is not equal to certain values
    data = data.loc[~data['action_taken_name'].isin(['Application withdrawn by applicant', 'Loan purchased by the institution', 'File closed for incompleteness'])]

    # Filter for rows where values in specified columns are not equal to "Information not provided"
    for col in filter_cols:
        data = data.loc[~data[col].isin(['Information not provided by applicant in mail, Internet, or telephone application'])]

        # Create a dictionary of county to region mappings
    county_to_region = {
        'Whatcom County': 'Northern Cascades','Skagit County': 'Northern Cascades','Snohomish County': 'Northern Cascades',
        'King County': 'Western Region','Pierce County': 'Western Region','Kitsap County': 'Western Region',
        'Island County': 'Western Region','San Juan County': 'Western Region','Jefferson County': 'Western Region',
        'Clallam County': 'Western Region','Mason County': 'Olympic Peninsula','Clark County': 'Southwest Washington',
        'Cowlitz County': 'Southwest Washington','Wahkiakum County': 'Southwest Washington','Skamania County': 'Southwest Washington',
        'Adams County': 'Eastern Washington','Asotin County': 'Eastern Washington','Benton County': 'Eastern Washington',
        'Chelan County': 'Eastern Washington','Columbia County': 'Eastern Washington','Douglas County': 'Eastern Washington',
        'Ferry County': 'Eastern Washington','Franklin County': 'Eastern Washington','Garfield County': 'Eastern Washington',
        'Grant County': 'Eastern Washington','Kittitas County': 'Eastern Washington','Klickitat County': 'Eastern Washington',
        'Lincoln County': 'Eastern Washington','Okanogan County': 'Eastern Washington','Pend Oreille County': 'Eastern Washington',
        'Spokane County': 'Eastern Washington','Stevens County': 'Eastern Washington','Walla Walla County': 'Eastern Washington',
        'Whitman County': 'Eastern Washington','Yakima County': 'Eastern Washington','Thurston County':'Western Region',
        'Lewis County': 'Western Region','Grays Harbor County': 'Western Region','Pacific County': 'Southwest Washington',
        'Seattle, Bellevue, Everett': 'Western Region','Tacoma, Lakewood':'Northern Cascades','Portland, Vancouver, Hillsboro':'Northern Cascades',
        'Spokane, Spokane Valley':'Eastern Washington'
    }

    # Add a new column to your dataframe containing the region for each county
    data['Region'] = data['county_name'].map(county_to_region)
    
    # Drop rows with missing values in "county_name" column
    data = data.dropna(subset=['county_name'])

    # Create new column "loan_status" based on "action_taken_name"
    data['loan_status'] = np.where(data['action_taken_name'] == 'Loan originated', 'approved', 'not approved')

    # Drop irrelevant columns
    drop_cols = ['applicant_race_name_5', 'applicant_race_name_4', 'applicant_race_name_3',
                 'applicant_race_name_2', 'co_applicant_race_name_5', 'co_applicant_race_name_4',
                 'co_applicant_race_name_3', 'co_applicant_race_name_2', 'denial_reason_name_3',
                 'denial_reason_name_2', 'denial_reason_name_1', 'rate_spread', 'edit_status_name',
                 'state_abbr', 'respondent_id', 'agency_abbr', 'as_of_year', 'application_date_indicator',
                 'state_name', 'sequence_number', 'census_tract_number', 'action_taken_name', 'purchaser_type_name',
                 'county_name','msamd_name']
    
    data = data.drop(columns=drop_cols)

    # Winsorize numeric columns
    data.select_dtypes(exclude=['object']).apply(lambda x: winsorize(x, limits=[0.05, 0.05]), axis=0, raw=True)

    # create a list of columns to be converted
    cols_to_convert = [
        'tract_to_msamd_income','population','minority_population','number_of_owner_occupied_units',
        'number_of_1_to_4_family_units','loan_amount_000s','hud_median_family_income','applicant_income_000s'
    ]

    # use the astype() method to convert the dtype of columns
    data[cols_to_convert] = data[cols_to_convert].astype('float')

    return data