In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import itertools
from torch.utils.data import Dataset, random_split
import torch
import torch.nn as nn
from tqdm import tqdm
import time

warnings.filterwarnings('ignore')

In [3]:
# part 1

def load_and_prepare_data(main_file: str, sample_size: int = 500000) -> Tuple[pd.DataFrame, Dict]:
    """
    Load and prepare the HMDA dataset using chunked processing and sampling.
    Returns processed dataframe and data statistics dictionary.
    """
    # Define only the columns we need
    needed_columns = [
        'action_taken',
        'loan_amount_000s',
        'applicant_income_000s',
        'applicant_race_name_1',
        'applicant_ethnicity',
        'applicant_sex',
        'hud_median_family_income',
        'denial_reason_1',
        'state_code',
        'county_code',
        'minority_population',
        'tract_to_msamd_income'
    ]
    
    print("Loading and sampling main HMDA data...")
    # Initialize empty list to store chunks
    chunks = []
    chunk_size = 100000  # Adjust this based on your available memory

    # Initialize the tqdm progress bar
    for chunk in tqdm(pd.read_csv(main_file, 
                                usecols=needed_columns, 
                                chunksize=chunk_size),
                    total=143, 
                    desc="Processing chunks"):

        # Sample from each chunk proportionally
        chunk_sample_size = int(sample_size * (chunk_size / 14285496))  # Adjust for total record count
        sampled_chunk = chunk.sample(n=min(chunk_sample_size, len(chunk)))
        chunks.append(sampled_chunk)

        # Print progress every 10 chunks
        # if count % 10 == 0:
            # print(f"Processed chunk, current sample size: {sum(len(chunk) for chunk in chunks)}")

        # Break if we've reached the desired sample size
        if sum(len(chunk) for chunk in chunks) >= sample_size:
            break
    
    # Combine chunks
    df = pd.concat(chunks, ignore_index=True)
    
    # Basic data cleaning
    # print("Performing initial data cleaning...")
    
    # Convert action_taken to binary (approved = 1, denied = 0)
    df['approved'] = df['action_taken'].isin([1, 2]).astype(int)
    
    # Calculate approval rates by different demographics
    stats = {}
    
    # Overall approval rate
    stats['overall_approval_rate'] = df['approved'].mean()
    
    # Approval rates by race (using primary race)
    race_approvals = df.groupby('applicant_race_name_1')['approved'].agg(['mean', 'count'])
    stats['race_approval_rates'] = race_approvals
    
    # Approval rates by income bracket
    df['income_bracket'] = pd.qcut(df['applicant_income_000s'].fillna(-1), 
                                 q=5, 
                                 labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
    income_approvals = df.groupby('income_bracket')['approved'].agg(['mean', 'count'])
    stats['income_approval_rates'] = income_approvals
    
    # Calculate loan amount approval rates
    # Create loan amount bins
    df['loan_bin'] = pd.qcut(df['loan_amount_000s'].dropna(), q=10, labels=False)
    loan_approval_rates = df.groupby('loan_bin')['approved'].mean()
    stats['loan_amount_approval_rates'] = loan_approval_rates.tolist()
    
    print("\nBasic dataset statistics:")
    print(f"Number of applications: {len(df):,}")
    print(f"Number of approved loans: {df['approved'].sum():,}")
    print(f"Overall approval rate: {df['approved'].mean():.2%}")
    
    return df, stats

# Main execution
if __name__ == "__main__":
    # File paths
    main_file = 'HDMA/hmda_2017_nationwide_all-records_labels.csv'
    
    # Load and prepare data
    df, stats = load_and_prepare_data(main_file)

Loading and sampling main HMDA data...


Processing chunks:  99%|█████████▉| 142/143 [01:04<00:00,  2.21it/s]



Basic dataset statistics:
Number of applications: 500,500
Number of approved loans: 271,308
Overall approval rate: 54.21%


In [53]:
class MortgageDataset(Dataset):
    def __init__(self, df: pd.DataFrame, features: List[str], label_col: str):
        """
        Args:
            df (pd.DataFrame): Processed DataFrame
            features (List[str]): List of feature columns
            label_col (str): Name of the target column
        """
        self.features = torch.tensor(df[features].values, dtype=torch.float32)
        self.labels = torch.tensor(df[label_col].values, dtype=torch.float32)

        print(self.features.shape)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
    
# Preprocessing function
def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """
    Preprocess the HMDA dataset and create feature groups for analysis.
    Returns preprocessed DataFrame and feature groups.
    """
    # print("Starting data preprocessing...")

    # Create a copy of the data
    data = df.copy()

    # Define feature groups
    feature_groups = {
        'baseline': [
            'loan_amount_000s',
            'applicant_income_000s',
            'hud_median_family_income',
            'tract_to_msamd_income',
            'applicant_race_name_1',
            # 'applicant_ethnicity'
            # 'income_to_loan_ratio',  # Will be created
            # 'area_income_ratio'      # Will be created
        # ],
        # 'location': [
        #     'state_code',
        #     'county_code',
        #     'minority_population'
        # ],
        # 'sensitive': [
        #     'applicant_race_name_1',
        #     'applicant_ethnicity',
        #     'applicant_sex'
        ]
    }

    columns_to_keep = [
        'loan_amount_000s',
        'applicant_income_000s',
        'hud_median_family_income',
        'tract_to_msamd_income',
        'applicant_race_name_1',
        # 'applicant_ethnicity',
        'approved'
    ]   

    data = data[columns_to_keep]

    # remove entries that are missing entries
    data = data.dropna()

    data = pd.get_dummies(data, columns=['applicant_race_name_1'], prefix='race')

    to_remove = ['race_Information not provided by applicant in mail, Internet, or telephone application', 'race_Not applicable']

    data = data.drop(columns=to_remove)

    # print(data.columns)

    # data['loan_amount_000s'] = data['loan_amount_000s'] / 1000
    # data['applicant_income_000s'] = data['applicant_income_000s'] / 1000
    # data['hud_median_family_income'] = data['hud_median_family_income'] / 100000
    # data['tract_to_msamd_income'] = data['tract_to_msamd_income'] / 1000

    # normalize values 
    data['loan_amount_000s'] = data['loan_amount_000s'] / data['loan_amount_000s'].mean()
    data['applicant_income_000s'] = data['applicant_income_000s'] / data['applicant_income_000s'].mean()
    data['hud_median_family_income'] = data['hud_median_family_income'] / data['hud_median_family_income'].mean()
    data['tract_to_msamd_income'] = data['tract_to_msamd_income'] / data['tract_to_msamd_income'].mean()

    # print(data['loan_amount_000s'].mean())
    # print(data['applicant_income_000s'].mean())
    # print(data['hud_median_family_income'].mean())
    # print(data['tract_to_msamd_income'].mean())
    
    feature_groups = {
        'baseline': ['loan_amount_000s', 'applicant_income_000s', 'hud_median_family_income',
       'tract_to_msamd_income',
       'race_American Indian or Alaska Native', 'race_Asian',
       'race_Black or African American',
       'race_Native Hawaiian or Other Pacific Islander', 'race_White']
    }


    return data, feature_groups

# Dataset creation function
def create_pytorch_datasets(
    data: pd.DataFrame,
    feature_groups: Dict[str, List[str]],
    label_col: str = 'approved',
    test_size: float = 0.1
) -> Dict[str, Dict[str, Dataset]]:
    """
    Create PyTorch datasets for different feature groups.
    Returns a dictionary of train-test datasets.
    """
    # print("Creating PyTorch datasets...")

    datasets = {}
    le = LabelEncoder()

    print(len(feature_groups))
    for group_name, features in feature_groups.items():
        # print(f"Processing dataset: {group_name}")

        # Select features
        X = data[features].copy()

        # Handle categorical variables
        # categorical_cols = X.select_dtypes(include=['object']).columns
        # for col in categorical_cols:
        #     X[col] = le.fit_transform(X[col])

        # Create the target variable
        y = data[label_col]

        # Combine features and labels into a DataFrame
        full_data = pd.concat([X, y], axis=1)


        # Create PyTorch Dataset
        full_dataset = MortgageDataset(full_data, features, label_col)

        # Split into train and test datasets
        total_size = len(full_dataset)
        test_size_split = int(total_size * test_size)
        train_size_split = total_size - test_size_split
        train_dataset, test_dataset = random_split(full_dataset, [train_size_split, test_size_split])

        datasets[group_name] = {
            'train': train_dataset,
            'test': test_dataset
        }

    return datasets

if __name__ == "__main__":

    # Preprocess the data
    data, feature_groups = preprocess_data(df)

    # Create PyTorch datasets
    datasets = create_pytorch_datasets(data, feature_groups)

1
torch.Size([425169, 9])


In [56]:
import torch.optim as optim
from torch.utils.data import DataLoader

class MortgageClassifier(nn.Module):
    def __init__(self, input_dim: int):
        """
        Args:
            input_dim (int): Number of input features
        """
        super(MortgageClassifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 32),  # Hidden layer with 64 units
            nn.ReLU(),
            nn.Linear(32, 64),  # Hidden layer with 64 units
            nn.ReLU(),
            nn.Linear(64, 256),  # Hidden layer with 64 units
            nn.ReLU(),
            # nn.Linear(256, 1028),  # Hidden layer with 64 units
            # nn.ReLU(),
            # nn.Linear(1028, 256),        # Hidden layer with 32 units
            # nn.ReLU(),
            nn.Linear(256, 64),        # Hidden layer with 32 units
            nn.ReLU(),
            nn.Linear(64, 1),         # Output layer
        )

    def forward(self, x):
        return self.network(x)



def train_model(model, train_loader, criterion, optimizer, num_epochs=20):
    """
    Train the model and record epoch times.
    Args:
        model (nn.Module): Neural network model
        train_loader (DataLoader): DataLoader for training data
        criterion (nn.Module): Loss function
        optimizer (torch.optim.Optimizer): Optimizer
        num_epochs (int): Number of training epochs
    """
    model.train()  # Set model to training mode
    total_training_time = 0  # Total training time

    for epoch in range(num_epochs):
        epoch_start_time = time.time()  # Record start time for the epoch

        total_loss = 0
        for features, labels in train_loader:
            # Zero the parameter gradients

            optimizer.zero_grad()

            # Forward pass
            outputs = model(features).squeeze()

            if outputs.shape != labels.shape:
                outputs = torch.reshape(outputs, labels.shape)

            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Calculate epoch time
        epoch_time = time.time() - epoch_start_time
        total_training_time += epoch_time

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

    # Print total training time
    print(f"Total Training Time: {total_training_time:.2f}s")

def evaluate_model(model, test_loader):
    """
    Evaluate the model on the test set.
    Args:
        model (nn.Module): Trained neural network model
        test_loader (DataLoader): DataLoader for testing data
    """
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient computation for evaluation
        for features, labels in test_loader:

            outputs = torch.sigmoid(model(features).squeeze())
            predictions = (outputs >= 0.5).float()  # Threshold for binary classification
            # print('preds', predictions)
            # print('labels', labels)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
batch_size = 100
train_loader = DataLoader(datasets['baseline']['train'], batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(datasets['baseline']['test'], batch_size=batch_size, shuffle=True, drop_last=True)

# Initialize the model, loss function, and optimizer
input_dim = len(feature_groups['baseline'])
print(input_dim)
model = MortgageClassifier(input_dim)
criterion = nn.BCEWithLogitsLoss()  # Use BCEWithLogitsLoss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
print("Training the model...")
train_model(model, train_loader, criterion, optimizer, num_epochs=5)

# Evaluate the model
print("\nEvaluating the model...")
evaluate_model(model, test_loader)

9
Training the model...
