# Loan Default Prediction with Neural Networks

This notebook implements a neural network model for predicting loan defaults using PyTorch. The model processes both numerical and categorical features using proper preprocessing techniques.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import time

# Set device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Custom Dataset Class
We create a custom PyTorch Dataset class to handle our loan data efficiently. This class:
- Converts features to PyTorch tensors
- Handles both numpy arrays and sparse matrices
- Supports both training data (with labels) and test data (without labels)

In [5]:
class LoanDataset(Dataset):
    def __init__(self, features, labels=None):
        # Convert features to tensor depending on input type
        if isinstance(features, np.ndarray):
            self.features = torch.FloatTensor(features)
        else:  # Sparse matrix from ColumnTransformer
            self.features = torch.FloatTensor(features.toarray())

        # Convert labels to tensor if provided
        self.labels = None if labels is None else torch.LongTensor(labels)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is None:
            return self.features[idx]
        return self.features[idx], self.labels[idx]

## Neural Network Architecture
Our model uses multiple fully connected layers with:
- ReLU activation
- Batch normalization for better training stability
- Dropout for regularization
- Binary classification output (Default vs No Default)

In [6]:
class LoanDefaultNN(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128, 64], dropout=0.3):
        """
        Neural Network for loan default prediction
        Args:
            input_dim: Number of input features after preprocessing
            hidden_dims: List of hidden layer dimensions
            dropout: Dropout rate
        """
        super().__init__()

        layers = []
        prev_dim = input_dim

        # Create hidden layers
        for dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, dim),
                nn.ReLU(),
                nn.BatchNorm1d(dim),
                nn.Dropout(dropout)
            ])
            prev_dim = dim

        # Output layer
        layers.append(nn.Linear(prev_dim, 2))  # Binary classification

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

## Data Preprocessing
Here we:
1. Load and preprocess the data
2. Split features into numerical and categorical
3. Apply appropriate scaling and encoding
4. Create train/validation splits and data loaders

In [7]:
# Load the data
data = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Define categorical and numerical features
categorical_features = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage',
                       'HasDependents', 'LoanPurpose', 'HasCoSigner']
numerical_features = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Prepare data
X = preprocessor.fit_transform(data.drop(columns=['LoanID', 'Default']))
X_test = preprocessor.transform(test_df.drop(columns=['LoanID']))
y = data['Default'].values

# Split training data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create datasets
train_dataset = LoanDataset(X_train, y_train)
val_dataset = LoanDataset(X_val, y_val)
test_dataset = LoanDataset(X_test)  # No labels for test set

# Create dataloaders
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## Training and Validation Functions
These functions handle:
- Training epochs
- Validation
- Prediction
- Progress tracking with tqdm

In [8]:
def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc='Training'):
        features, labels = batch
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def validate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Validating'):
            features, labels = batch
            features, labels = features.to(device), labels.to(device)

            outputs = model(features)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    accuracy = (np.array(predictions) == np.array(actuals)).mean()
    return total_loss / len(dataloader), accuracy

def predict(model, dataloader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for features in tqdm(dataloader, desc='Predicting'):
            features = features.to(device)
            outputs = model(features)
            predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())

    return predictions

## Model Training
Train the model with:
- Cross-entropy loss
- Adam optimizer
- Model checkpointing for best validation accuracy

In [9]:
# Initialize model
input_dim = X_train.shape[1]
model = LoanDefaultNN(input_dim).to(device)

# Training parameters
n_epochs = 30
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
best_val_acc = 0
for epoch in range(n_epochs):
    start_time = time.time()

    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = validate(model, val_loader, criterion)

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{n_epochs} | Time: {epoch_time:.2f}s")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')

Training: 100%|██████████| 320/320 [00:02<00:00, 118.53it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 259.06it/s]


Epoch 1/30 | Time: 3.03s
Train Loss: 0.4533 | Val Loss: 0.3204 | Val Acc: 0.8843


Training: 100%|██████████| 320/320 [00:01<00:00, 170.05it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 290.23it/s]


Epoch 2/30 | Time: 2.18s
Train Loss: 0.3307 | Val Loss: 0.3185 | Val Acc: 0.8841


Training: 100%|██████████| 320/320 [00:01<00:00, 170.62it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 282.59it/s]


Epoch 3/30 | Time: 2.18s
Train Loss: 0.3222 | Val Loss: 0.3167 | Val Acc: 0.8843


Training: 100%|██████████| 320/320 [00:01<00:00, 174.13it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 276.57it/s]


Epoch 4/30 | Time: 2.15s
Train Loss: 0.3193 | Val Loss: 0.3168 | Val Acc: 0.8845


Training: 100%|██████████| 320/320 [00:02<00:00, 141.89it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 178.12it/s]


Epoch 5/30 | Time: 2.73s
Train Loss: 0.3179 | Val Loss: 0.3162 | Val Acc: 0.8843


Training: 100%|██████████| 320/320 [00:02<00:00, 140.46it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 291.34it/s]


Epoch 6/30 | Time: 2.57s
Train Loss: 0.3164 | Val Loss: 0.3160 | Val Acc: 0.8846


Training: 100%|██████████| 320/320 [00:01<00:00, 172.89it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 293.85it/s]


Epoch 7/30 | Time: 2.14s
Train Loss: 0.3160 | Val Loss: 0.3162 | Val Acc: 0.8850


Training: 100%|██████████| 320/320 [00:01<00:00, 171.26it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 295.49it/s]


Epoch 8/30 | Time: 2.16s
Train Loss: 0.3153 | Val Loss: 0.3158 | Val Acc: 0.8849


Training: 100%|██████████| 320/320 [00:01<00:00, 171.20it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 275.27it/s]


Epoch 9/30 | Time: 2.18s
Train Loss: 0.3150 | Val Loss: 0.3166 | Val Acc: 0.8848


Training: 100%|██████████| 320/320 [00:01<00:00, 173.18it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 230.15it/s]


Epoch 10/30 | Time: 2.22s
Train Loss: 0.3148 | Val Loss: 0.3159 | Val Acc: 0.8847


Training: 100%|██████████| 320/320 [00:02<00:00, 118.24it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 261.49it/s]


Epoch 11/30 | Time: 3.04s
Train Loss: 0.3138 | Val Loss: 0.3160 | Val Acc: 0.8848


Training: 100%|██████████| 320/320 [00:01<00:00, 175.14it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 284.58it/s]


Epoch 12/30 | Time: 2.13s
Train Loss: 0.3131 | Val Loss: 0.3165 | Val Acc: 0.8852


Training: 100%|██████████| 320/320 [00:01<00:00, 172.23it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 287.23it/s]


Epoch 13/30 | Time: 2.15s
Train Loss: 0.3130 | Val Loss: 0.3161 | Val Acc: 0.8849


Training: 100%|██████████| 320/320 [00:01<00:00, 173.74it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 293.88it/s]


Epoch 14/30 | Time: 2.13s
Train Loss: 0.3130 | Val Loss: 0.3164 | Val Acc: 0.8847


Training: 100%|██████████| 320/320 [00:01<00:00, 170.25it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 287.50it/s]


Epoch 15/30 | Time: 2.18s
Train Loss: 0.3128 | Val Loss: 0.3176 | Val Acc: 0.8848


Training: 100%|██████████| 320/320 [00:02<00:00, 145.13it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 174.75it/s]


Epoch 16/30 | Time: 2.69s
Train Loss: 0.3119 | Val Loss: 0.3168 | Val Acc: 0.8842


Training: 100%|██████████| 320/320 [00:02<00:00, 143.31it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 282.64it/s]


Epoch 17/30 | Time: 2.54s
Train Loss: 0.3123 | Val Loss: 0.3166 | Val Acc: 0.8847


Training: 100%|██████████| 320/320 [00:01<00:00, 168.64it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 286.49it/s]


Epoch 18/30 | Time: 2.20s
Train Loss: 0.3115 | Val Loss: 0.3167 | Val Acc: 0.8850


Training: 100%|██████████| 320/320 [00:01<00:00, 171.65it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 287.07it/s]


Epoch 19/30 | Time: 2.16s
Train Loss: 0.3114 | Val Loss: 0.3167 | Val Acc: 0.8853


Training: 100%|██████████| 320/320 [00:01<00:00, 171.40it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 286.93it/s]


Epoch 20/30 | Time: 2.16s
Train Loss: 0.3111 | Val Loss: 0.3170 | Val Acc: 0.8850


Training: 100%|██████████| 320/320 [00:01<00:00, 170.19it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 192.65it/s]


Epoch 21/30 | Time: 2.32s
Train Loss: 0.3103 | Val Loss: 0.3173 | Val Acc: 0.8851


Training: 100%|██████████| 320/320 [00:02<00:00, 119.72it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 285.13it/s]


Epoch 22/30 | Time: 2.97s
Train Loss: 0.3100 | Val Loss: 0.3177 | Val Acc: 0.8850


Training: 100%|██████████| 320/320 [00:01<00:00, 170.24it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 251.39it/s]


Epoch 23/30 | Time: 2.22s
Train Loss: 0.3100 | Val Loss: 0.3175 | Val Acc: 0.8849


Training: 100%|██████████| 320/320 [00:01<00:00, 172.67it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 265.17it/s]


Epoch 24/30 | Time: 2.18s
Train Loss: 0.3098 | Val Loss: 0.3191 | Val Acc: 0.8840


Training: 100%|██████████| 320/320 [00:01<00:00, 173.09it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 285.41it/s]


Epoch 25/30 | Time: 2.15s
Train Loss: 0.3093 | Val Loss: 0.3177 | Val Acc: 0.8849


Training: 100%|██████████| 320/320 [00:01<00:00, 170.52it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 294.12it/s]


Epoch 26/30 | Time: 2.17s
Train Loss: 0.3094 | Val Loss: 0.3170 | Val Acc: 0.8851


Training: 100%|██████████| 320/320 [00:02<00:00, 132.83it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 162.75it/s]


Epoch 27/30 | Time: 2.93s
Train Loss: 0.3086 | Val Loss: 0.3175 | Val Acc: 0.8851


Training: 100%|██████████| 320/320 [00:01<00:00, 161.47it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 289.33it/s]


Epoch 28/30 | Time: 2.28s
Train Loss: 0.3085 | Val Loss: 0.3189 | Val Acc: 0.8846


Training: 100%|██████████| 320/320 [00:02<00:00, 146.38it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 271.49it/s]


Epoch 29/30 | Time: 2.50s
Train Loss: 0.3076 | Val Loss: 0.3181 | Val Acc: 0.8849


Training: 100%|██████████| 320/320 [00:02<00:00, 148.34it/s]
Validating: 100%|██████████| 80/80 [00:00<00:00, 289.28it/s]

Epoch 30/30 | Time: 2.45s
Train Loss: 0.3074 | Val Loss: 0.3184 | Val Acc: 0.8850





## Generate Predictions
Generate predictions on the test set using the best model and create submission file.

In [10]:
# Load best model
model.load_state_dict(torch.load('best_model.pth'))
predictions = predict(model, test_loader)

# Create submission file
submission = pd.DataFrame({
    'LoanID': test_df['LoanID'],
    'Default': predictions
})
submission.to_csv('neural_network_submission.csv', index=False)

  model.load_state_dict(torch.load('best_model.pth'))
Predicting: 100%|██████████| 100/100 [00:00<00:00, 531.11it/s]
