# Neural Network model training and evaluation without race as a feature

#### Import libraries needed

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Dict, List
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import itertools
from torch.utils.data import Dataset, random_split, DataLoader
import torch
import torch.nn as nn
from tqdm import tqdm
import time
import torch.optim as optim

warnings.filterwarnings('ignore')

#### Part 1: Load in and prepare the HMDA data for use in our NN

In [17]:
def load_and_prepare_data(main_file: str, sample_size: int = 500000) -> Tuple[pd.DataFrame, Dict]:
    """
    Load and prepare the HMDA dataset using chunked processing and sampling.
    Returns processed dataframe and data statistics dictionary.

    Args:
        main_file (str): The path to the main HMDA dataset file.
        sample_size (int): The number of samples to take from the dataset.

    Returns:
        Tuple[pd.DataFrame, Dict]: A tuple containing the processed dataframe and data statistics dictionary.
    """
    # Define only the columns we need
    needed_columns = [
        'action_taken',
        'loan_amount_000s',
        'applicant_income_000s',
        'applicant_race_name_1',
        'applicant_ethnicity',
        'applicant_sex',
        'hud_median_family_income',
        'denial_reason_1',
        'state_code',
        'county_code',
        'minority_population',
        'tract_to_msamd_income'
    ]
    
    print("Loading and sampling main HMDA data...")
    # Initialize empty list to store chunks
    chunks = []
    # chunk size set depending on available memory
    chunk_size = 100000

    # Initialize the tqdm progress bar and do chunking
    for chunk in tqdm(pd.read_csv(main_file, 
                                usecols=needed_columns, 
                                chunksize=chunk_size),
                    total=143, 
                    desc="Processing chunks"):

        # Sample from each chunk proportionally
        chunk_sample_size = int(sample_size * (chunk_size / 14285496))  # Adjust for total record count
        sampled_chunk = chunk.sample(n=min(chunk_sample_size, len(chunk)))
        chunks.append(sampled_chunk)

        # Break if we've reached the desired sample size
        if sum(len(chunk) for chunk in chunks) >= sample_size:
            break
    
    # Combine chunks
    df = pd.concat(chunks, ignore_index=True)
    
    # Convert action_taken to binary (approved = 1, denied = 0)
    df['approved'] = df['action_taken'].isin([1, 2]).astype(int)
    
    # Calculate approval rates by different demographics
    stats = {}
    
    # Overall approval rate
    stats['overall_approval_rate'] = df['approved'].mean()
    
    # Approval rates by race (using primary race)
    race_approvals = df.groupby('applicant_race_name_1')['approved'].agg(['mean', 'count'])
    stats['race_approval_rates'] = race_approvals
    
    # Approval rates by income bracket
    df['income_bracket'] = pd.qcut(df['applicant_income_000s'].fillna(-1), 
                                 q=5, 
                                 labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
    income_approvals = df.groupby('income_bracket')['approved'].agg(['mean', 'count'])
    stats['income_approval_rates'] = income_approvals
    
    # Calculate loan amount approval rates
    # Create loan amount bins
    df['loan_bin'] = pd.qcut(df['loan_amount_000s'].dropna(), q=10, labels=False)
    loan_approval_rates = df.groupby('loan_bin')['approved'].mean()
    stats['loan_amount_approval_rates'] = loan_approval_rates.tolist()
    
    print("\nBasic dataset statistics:")
    print(f"Number of applications: {len(df):,}")
    print(f"Number of approved loans: {df['approved'].sum():,}")
    print(f"Overall approval rate: {df['approved'].mean():.2%}")
    
    return df, stats

#### Part 2: Create our dataset class for the HMDA data, as well as define functions for preprocessing and further preparing our data

In [18]:
class MortgageDataset(Dataset):
    def __init__(self, df: pd.DataFrame, features: List[str], label_col: str):
        """
        Args:
            df (pd.DataFrame): Processed DataFrame. Output of preprocess_data().
            features (List[str]): List of feature columns.
            label_col (str): Name of the target column.
        """
        self.features = torch.tensor(df[features].values, dtype=torch.float32)
        self.labels = torch.tensor(df[label_col].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """
    Preprocess the HMDA dataset and create feature groups for analysis.
    Returns preprocessed DataFrame and feature groups.

    Args:
        df (pd.DataFrame): The HMDA dataset. Output of load_and_prepare_data().

    Returns:
        Tuple[pd.DataFrame, Dict[str, List[str]]]: A tuple containing the preprocessed DataFrame and feature groups.
    """
    # Create a copy of the data
    data = df.copy()

    # Define feature groups
    feature_groups = {
        'baseline': [
            'loan_amount_000s',
            'applicant_income_000s',
            'hud_median_family_income',
            'tract_to_msamd_income',
            # 'income_to_loan_ratio',  # Will be created
            # 'area_income_ratio'      # Will be created
        # ],
        # 'location': [
        #     'state_code',
        #     'county_code',
        #     'minority_population'
        # ],
        # 'sensitive': [
        #     'applicant_race_name_1',
        #     'applicant_ethnicity',
        #     'applicant_sex'
        ]
    }

    columns_to_keep = [
        'loan_amount_000s',
        'applicant_income_000s',
        'hud_median_family_income',
        'tract_to_msamd_income',
        'approved'
    ]   

    data = data[columns_to_keep]

    # remove entries that are missing entries
    data = data.dropna()

    # normalize values 
    data['loan_amount_000s'] = data['loan_amount_000s'] / data['loan_amount_000s'].mean()
    data['applicant_income_000s'] = data['applicant_income_000s'] / data['applicant_income_000s'].mean()
    data['hud_median_family_income'] = data['hud_median_family_income'] / data['hud_median_family_income'].mean()
    data['tract_to_msamd_income'] = data['tract_to_msamd_income'] / data['tract_to_msamd_income'].mean()
    
    return data, feature_groups

def create_pytorch_datasets(
    data: pd.DataFrame,
    feature_groups: Dict[str, List[str]],
    label_col: str = 'approved',
    test_size: float = 0.2,
    random_seed: int = 42
    ) -> Dict[str, Dict[str, Dataset]]:
    """
    Create PyTorch datasets for different feature groups.
    Returns a dictionary of train-test datasets.

    Args:
        data (pd.DataFrame): The preprocessed DataFrame. Output of preprocess_data().
        feature_groups (Dict[str, List[str]]): Dictionary of feature groups. Output of preprocess_data().
        label_col (str): The target column name.
        test_size (float): The proportion of data to include in the test split.
        random_seed (int): The random seed to use for reproducibility.

    Returns:
        Dict[str, Dict[str, Dataset]]: A dictionary containing train-test datasets for each feature group
    """
    datasets = {}
    le = LabelEncoder()

    for group_name, features in feature_groups.items():
        # Select features
        X = data[features].copy()

        # Create the target variable
        y = data[label_col]

        # Combine features and labels into a DataFrame
        full_data = pd.concat([X, y], axis=1)

        # Create PyTorch Dataset
        full_dataset = MortgageDataset(full_data, features, label_col)

        # Split into train and test datasets
        total_size = len(full_dataset)
        test_size_split = int(total_size * test_size)
        train_size_split = total_size - test_size_split
        train_dataset, test_dataset = random_split(full_dataset, [train_size_split, test_size_split], generator=torch.Generator().manual_seed(random_seed))

        datasets[group_name] = {
            'train': train_dataset,
            'test': test_dataset
        }

    return datasets

#### Part 3: Define our model, as well as the training and evaluation functions for it

In [19]:
class MortgageClassifier(nn.Module):
    def __init__(self, input_dim: int):
        """
        Args:
            input_dim (int): Number of input features
        """
        super(MortgageClassifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 16),  # Hidden layer with 64 units
            nn.ReLU(),
            nn.Linear(16, 64),  # Hidden layer with 64 units
            nn.ReLU(),
            nn.Linear(64, 256),  # Hidden layer with 64 units
            nn.ReLU(),
            # nn.Linear(256, 1028),  # Hidden layer with 64 units
            # nn.ReLU(),
            # nn.Linear(1028, 256),        # Hidden layer with 32 units
            # nn.ReLU(),
            nn.Linear(256, 64),        # Hidden layer with 32 units
            nn.ReLU(),
            nn.Linear(64, 1),         # Output layer
        )

    def forward(self, x):
        """
        Forward pass of the neural network.

        Args:
            x (torch.Tensor): Input tensor

        Returns:
            torch.Tensor: Output tensor
        """
        return self.network(x)



def train_model(model, train_loader, criterion, optimizer, num_epochs=20):
    """
    Train the model and record epoch times.
    Args:
        model (nn.Module): Neural network model (instance of MortgageClassifier)
        train_loader (DataLoader): DataLoader for training data
        criterion (nn.Module): Loss function
        optimizer (torch.optim.Optimizer): Optimizer
        num_epochs (int): Number of training epochs

    Returns:
        None
    """
    model.train()  # Set model to training mode
    total_training_time = 0  # Total training time

    for epoch in range(num_epochs):
        epoch_start_time = time.time()  # Record start time for the epoch

        total_loss = 0
        for features, labels in train_loader:
            # Zero the parameter gradients

            optimizer.zero_grad()

            # Forward pass
            outputs = model(features).squeeze()

            if outputs.shape != labels.shape:
                outputs = torch.reshape(outputs, labels.shape)

            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Calculate epoch time
        epoch_time = time.time() - epoch_start_time
        total_training_time += epoch_time

        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}, Time: {epoch_time:.2f}s")

    # Print total training time
    print(f"Total Training Time: {total_training_time:.2f}s")

def evaluate_model(model, test_loader):
    """
    Evaluate the model on the test set.
    Args:
        model (nn.Module): Trained neural network model
        test_loader (DataLoader): DataLoader for testing data

    Returns:
        None
    """
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient computation for evaluation
        for features, labels in test_loader:

            outputs = torch.sigmoid(model(features).squeeze())
            predictions = (outputs >= 0.5).float()  # Threshold for binary classification
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

#### Execution block to run all model training, evaluation, and analysis

In [20]:
if __name__ == "__main__":
    # Set random seeds for reproducibility
    RANDOM_SEED = 42
    torch.manual_seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    main_file = 'HDMA/hmda_2017_nationwide_all-records_labels.csv'
    # Load and prepare data
    df, stats = load_and_prepare_data(main_file)
    # Preprocess the data
    data, feature_groups = preprocess_data(df)
    # Create PyTorch datasets
    datasets = create_pytorch_datasets(data, feature_groups, test_size=0.2, random_seed=RANDOM_SEED)
    
    batch_size = 100
    test_dataset = datasets['baseline']['test']
    test_indices = test_dataset.indices
    
    # Important: Create test loader without dropping last batch
    train_loader = DataLoader(datasets['baseline']['train'], batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(datasets['baseline']['test'], batch_size=batch_size, shuffle=False, drop_last=False)  # Changed these parameters
    
    # Rest of the initialization code...
    input_dim = len(feature_groups['baseline'])
    print(input_dim)
    model = MortgageClassifier(input_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    print("Training the model...")
    train_model(model, train_loader, criterion, optimizer, num_epochs=5)
    
    # Evaluation
    print("\nEvaluating the model...")
    model.eval()
    predictions = []
    used_labels = []  # Keep track of the actual labels we use
    total_correct = 0
    total_samples = 0
    
    # Get predictions batch by batch
    with torch.no_grad():
        for features, labels in test_loader:
            outputs = torch.sigmoid(model(features).squeeze())
            preds = (outputs >= 0.5).float()
            predictions.extend(preds.numpy())
            used_labels.extend(labels.numpy())  # Save the actual labels we used
            total_correct += (preds == labels).sum().item()
            total_samples += len(labels)  # Use actual batch size instead of fixed size
    
    # Convert to numpy arrays
    predictions = np.array(predictions)
    used_labels = np.array(used_labels)
    
    # Compute overall accuracy
    accuracy = total_correct / total_samples
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    
    # Save predictions and the actual labels used
    print("\nSaving predictions and labels...")
    if 'race' in feature_groups['baseline'][-1]:
        np.save('nn_predictions_with_race.npy', predictions)
        np.save('nn_test_labels_with_race.npy', used_labels)
    else:
        np.save('nn_predictions_no_race.npy', predictions)
        np.save('nn_test_labels_no_race.npy', used_labels)

Loading and sampling main HMDA data...


Processing chunks:  99%|█████████▉| 142/143 [00:40<00:00,  3.55it/s]



Basic dataset statistics:
Number of applications: 500,500
Number of approved loans: 271,149
Overall approval rate: 54.18%
4
Training the model...
Epoch [1/5], Loss: 0.6657, Time: 3.62s
Epoch [2/5], Loss: 0.6641, Time: 3.25s
Epoch [3/5], Loss: 0.6633, Time: 3.50s
Epoch [4/5], Loss: 0.6631, Time: 3.37s
Epoch [5/5], Loss: 0.6630, Time: 3.56s
Total Training Time: 17.31s

Evaluating the model...
Test Accuracy: 61.34%

Saving predictions and labels...
