# Machine Learning Notebook for Group 8


## Classification Neural Network based on Income Group using PyTorch


In [1]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('datasets/fies_df_for_ML.csv')

In [3]:
# Get all expenditure columns as features
expenditure_columns = [
    "family_size",
    "losses_from_entrepreneurial_activities",
    "expenditure_cereal_preparations",
    "expenditure_meat_preparations",
    "expenditure_fish_marine_products",
    "expenditure_dairy_eggs",
    "expenditure_oils_fats",
    "expenditure_fruits_vegetables",
    "expenditure_vegetables",
    "expenditure_sugar_jam_honey",
    "expenditure_other_food",
    "expenditure_fruit_vegetable_juices",
    "expenditure_coffee_cocoa_tea",
    "expenditure_tea",
    "expenditure_cocoa",
    "main_water_supply_second_visit",
    "expenditure_softdrinks",
    "expenditure_non_alcoholic_beverages",
    "expenditure_alcoholic_beverages",
    "expenditure_tobacco",
    "expenditure_other_vegetables",
    "expenditure_services_primary_goods",
    "expenditure_alcohol_production_services",
    "total_food_consumed_home",
    "food_consumed_outside_home",
    "household_food_expenditure",
    "expenditure_clothing_footwear",
    "expenditure_housing_water",
    "actual_house_rent",
    "imputed_house_rental_value",
    "imputed_housing_benefit_rental_value",
    "house_rent_rental_value",
    "expenditure_furnishings_household_maintenance",
    "expenditure_health",
    "expenditure_transportation",
    "expenditure_communication",
    "expenditure_recreation_culture",
    "expenditure_education",
    "expenditure_insurance",
    "expenditure_miscellaneous_goods_services",
    "expenditure_durable_furniture",
    "expenditure_special_family_occasion",
    "other_expenditure",
    "other_disbursements",
    "expenditure_accommodation_services",
    "total_non_food_expenditure",
    "total_household_expenditures",
    "total_household_disbursements",
    "urban_rural",
    'income_group', # include target variable
]

In [4]:
# Only use expenditure columns
expenditure_dataset = dataset[expenditure_columns]

In [5]:
# Stratified split to get equal representation of all income statuses:
from sklearn.model_selection import train_test_split

# 70% for training, and 30% for test
train, test = train_test_split(
    expenditure_dataset,
    test_size=0.3,
    stratify=dataset['income_group'],
    random_state=42
)
print(f'Training Set Shape: ', train.shape)
print(f'Test Set Shape: ', test.shape)

Training Set Shape:  (114287, 50)
Test Set Shape:  (48981, 50)


In [6]:
# Split the dataset into training, validation, and testing sets
target_column = ['income_group']
X_train, y_train = train.drop(columns=target_column), train['income_group']
X_test, y_test = test.drop(columns=target_column), test['income_group']

In [7]:
# Scale all data 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
scaler = StandardScaler()

# For Training
X_train_scaled = scaler.fit_transform(X_train)

# For Testing
X_test_scaled = scaler.transform(X_test)

# Encoding the Target Variable
label_encoder = LabelEncoder()
# For Training
y_train = label_encoder.fit_transform(y_train)

# For Testing
y_test = label_encoder.transform(y_test)

In [8]:
# Convert to PyTorch Tensors
import torch
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [9]:
from torch.utils.data import TensorDataset, DataLoader
# Define the TensorDataset and DataLoader along with a constant BATCH_SIZE
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

BATCH_SIZE = 32
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
# Defining the ANN Architecture
import torch.nn as nn

class FIESANN(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.output_neuron = nn.Linear(16, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.output_neuron(x)
        return x

In [11]:
# Custom EarlyStopping function
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, path=None):
        """
        Args:
            patience (int): How long to wait after last time the F1 score improved.
                            Default: 7
            verbose (bool): If True, prints a message for each F1 score improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for saving the model if it improves. If None, stores the state in memory.
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_val_score = -np.inf
        self.best_model_state = None

    def __call__(self, val_score, model):
        if not np.isfinite(val_score):
            # Skip NaN or infinite values
            if self.verbose:
                print("Warning: Invalid score (NaN or Inf). Skipping update.")
            return

        if self.best_score is None:
            self.best_score = val_score
            self.save_checkpoint(val_score, model)
        elif val_score > self.best_score + self.delta: # adjust this to < when tracking a metric instead if loss
            self.counter += 1
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.save_checkpoint(val_score, model)
            self.counter = 0

    def save_checkpoint(self, val_score, model):
        """Saves model when validation score improves."""
        if self.verbose:
            print(f"Tracked Validation Score improved ({self.best_val_score:.4f} --> {val_score:.4f}). Saving model...")
        if self.path:
            torch.save(model.state_dict(), self.path)
        else:
            self.best_model_state = model.state_dict()
        self.best_val_score = val_score

In [None]:
# Setting up K-Fold Validation for a more robust evaluation
from sklearn.model_selection import KFold
EPOCHS = 100
k_folds = 5
initial_lr = 0.001

kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Metrics storage
fold_metrics = {
    "train_loss": [],
    "train_accuracy": [],
    "train_precision": [],
    "train_recall": [],
    "train_f1": [],
}

fold_performance = {
    "train_loss": [],
    "train_accuracy": [],
    "train_precision": [],
    "train_recall": [],
    "train_f1": [],
}

In [13]:
# Start K-Fold Training
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Subset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
    print(f"Fold {fold + 1}/{k_folds}")

    # Split the data into training and validation sets
    train_subset = Subset(train_dataset, train_idx)
    val_subset = Subset(train_dataset, val_idx)
    
    train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize the model, optimizer and utility methods
    model = FIESANN(input_size=X_train.shape[1], num_classes=dataset['income_group'].nunique())
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=15)
    early_stopping = EarlyStopping(patience=30, delta=0.00001, path=f'checkpoints/fies_ann_checkpoint_fold{fold+1}.pth', verbose=True)

    best_train_loss = np.inf
    best_train_accuracy = 0.0
    best_train_precision = 0.0
    best_train_recall = 0.0
    best_train_f1 = 0.0

    for epoch in range(EPOCHS):
        # Training phase
        model.train()
        train_loss = 0
        y_train_true = []
        y_train_pred = []
        
        for batch in train_dataloader:
            inputs, labels = batch
            outputs = model(inputs)  # Outputs shape: (batch_size, num_classes)
            
            # Use CrossEntropyLoss for multi-class classification
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
            # Get predicted class (index of the max logit)
            preds = torch.argmax(outputs, dim=1)
            
            y_train_true.extend(labels.cpu().numpy())
            y_train_pred.extend(preds.cpu().numpy())
        
        train_loss /= len(train_dataloader)
        
        # Calculate metrics (multi-class version)
        train_accuracy = accuracy_score(y_train_true, y_train_pred)
        train_precision = precision_score(y_train_true, y_train_pred, average='macro')
        train_recall = recall_score(y_train_true, y_train_pred, average='macro')
        train_f1 = f1_score(y_train_true, y_train_pred, average='macro')

        fold_metrics["train_loss"].append(np.round(train_loss, 4))
        fold_metrics["train_accuracy"].append(train_accuracy)
        fold_metrics["train_precision"].append(train_precision)
        fold_metrics["train_recall"].append(train_recall)
        fold_metrics["train_f1"].append(train_f1)

        print(f'Epoch [{epoch+1}/{EPOCHS}] with Learning Rate: {lr_scheduler.get_last_lr()} \nLoss: {train_loss:.4f}' +  
            f'\nTrain Accuracy: {train_accuracy:.2%} Train Precision: {train_precision:.2%}, Train Recall: {train_recall:.2%}, Train F1 Score: {train_f1:.2%}\n')
        
        lr_scheduler.step(train_loss)
        early_stopping(train_loss, model)
        if train_accuracy > best_train_accuracy:
            best_train_loss = train_loss
            best_train_accuracy = train_accuracy
            best_train_precision = train_precision
            best_train_recall = train_recall
            best_train_f1 = train_f1
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

    fold_performance["train_loss"].append(np.round(best_train_loss, 4))
    fold_performance["train_accuracy"].append(best_train_accuracy)
    fold_performance["train_precision"].append(best_train_precision)
    fold_performance["train_recall"].append(best_train_recall)
    fold_performance["train_f1"].append(best_train_f1)
    print(f"\nFold {fold + 1} Performance:\n")
    print(f"Best Train Loss: {best_train_loss:.4f}")
    print(f"Best Train Accuracy: {best_train_accuracy:.2%}")
    print(f"Best Train Precision: {best_train_precision:.2%}")
    print(f"Best Train Recall: {best_train_recall:.2%}")
    print(f"Best Train F1 Score: {best_train_f1:.2%}")


    

Fold 1/10
Epoch [1/100] with Learning Rate: [0.001] 
Loss: 0.6241
Train Accuracy: 74.73% Train Precision: 64.04%, Train Recall: 53.61%, Train F1 Score: 56.72%

Tracked Validation Score improved (-inf --> 0.6241). Saving model...
Epoch [2/100] with Learning Rate: [0.001] 
Loss: 0.5595
Train Accuracy: 76.69% Train Precision: 66.19%, Train Recall: 60.93%, Train F1 Score: 63.04%

Tracked Validation Score improved (0.6241 --> 0.5595). Saving model...
Epoch [3/100] with Learning Rate: [0.001] 
Loss: 0.5528
Train Accuracy: 76.99% Train Precision: 67.42%, Train Recall: 62.33%, Train F1 Score: 64.52%

Tracked Validation Score improved (0.5595 --> 0.5528). Saving model...
Epoch [4/100] with Learning Rate: [0.001] 
Loss: 0.5490
Train Accuracy: 77.13% Train Precision: 68.98%, Train Recall: 63.68%, Train F1 Score: 65.76%

Tracked Validation Score improved (0.5528 --> 0.5490). Saving model...
Epoch [5/100] with Learning Rate: [0.001] 
Loss: 0.5446
Train Accuracy: 77.22% Train Precision: 69.56%, Trai

In [14]:
# Aggregate metrics across all folds
avg_train_loss = np.mean(fold_metrics["train_loss"])
avg_train_accuracy = np.mean(fold_metrics["train_accuracy"])
avg_train_precision = np.mean(fold_metrics["train_precision"])
avg_train_recall = np.mean(fold_metrics["train_recall"])
avg_train_f1 = np.mean(fold_metrics["train_f1"])

print(f"Average Train Loss: {avg_train_loss:.4f}")
print(f"Average Train Accuracy: {avg_train_accuracy:.2%}")
print(f"Average Train Precision: {avg_train_precision:.2%}")
print(f"Average Train Recall: {avg_train_recall:.2%}")
print(f"Average Train F1 Score: {avg_train_f1:.2%}")

# Print the Best performance across all folds
best_fold = np.argmin(fold_performance["train_loss"])
best_train_loss = fold_performance["train_loss"][best_fold]
best_train_accuracy = fold_performance["train_accuracy"][best_fold]
best_train_precision = fold_performance["train_precision"][best_fold]
best_train_recall = fold_performance["train_recall"][best_fold]
best_train_f1 = fold_performance["train_f1"][best_fold]

print(f"Best Fold: {best_fold + 1}")
print(f"Best Train Loss: {best_train_loss:.4f}")
print(f"Best Train Accuracy: {best_train_accuracy:.2%}")
print(f"Best Train Precision: {best_train_precision:.2%}")
print(f"Best Train Recall: {best_train_recall:.2%}")
print(f"Best Train F1 Score: {best_train_f1:.2%}")

Average Train Loss: 0.5124
Average Train Accuracy: 78.50%
Average Train Precision: 73.69%
Average Train Recall: 66.68%
Average Train F1 Score: 69.72%
Best Fold: 3
Best Train Loss: 0.4923
Best Train Accuracy: 79.34%
Best Train Precision: 74.66%
Best Train Recall: 67.57%
Best Train F1 Score: 70.66%


In [15]:
# Evaluate the model on the test set
model.eval()
test_loss = 0
y_test_true = []
y_test_pred = []
model.load_state_dict(torch.load(f'checkpoints/fies_ann_checkpoint_fold{best_fold+1}.pth'))
print(f"Loading model from checkpoint: checkpoints/fies_ann_checkpoint_fold{best_fold+1}.pth")
with torch.no_grad():
    for batch in test_dataloader:
        inputs, labels = batch
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Get predicted class (index of the max logit)
        preds = torch.argmax(outputs, dim=1)

        y_test_true.extend(labels.cpu().numpy())
        y_test_pred.extend(preds.cpu().numpy())

test_loss /= len(test_dataloader)
test_accuracy = accuracy_score(y_test_true, y_test_pred)
test_precision = precision_score(y_test_true, y_test_pred, average='macro')
test_recall = recall_score(y_test_true, y_test_pred, average='macro')
test_f1 = f1_score(y_test_true, y_test_pred, average='macro')

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.2%}')
print(f'Test Precision: {test_precision:.2%}')
print(f'Test Recall: {test_recall:.2%}')
print(f'Test F1 Score: {test_f1:.2%}')

Loading model from checkpoint: checkpoints/fies_ann_checkpoint_fold3.pth


  model.load_state_dict(torch.load(f'checkpoints/fies_ann_checkpoint_fold{best_fold+1}.pth'))


Test Loss: 0.5875
Test Accuracy: 76.61%
Test Precision: 70.28%
Test Recall: 63.83%
Test F1 Score: 66.63%


In [16]:
# Confusion Matrix
cm = confusion_matrix(y_test_true, y_test_pred)
cm_df = pd.DataFrame(cm, index=dataset['income_group'].unique(), columns=dataset['income_group'].unique())
print(cm_df)

                             Middle Middle Class  Lower Middle Income  \
Middle Middle Class                        16993                 1707   
Lower Middle Income                         3013                 9707   
Low Income (but not poor)                     37                 1280   
Upper Middle Income                         1852                   10   
Upper Income (but not rich)                    2                    0   
Poor                                           1                    1   
Rich                                           1                   40   

                             Low Income (but not poor)  Upper Middle Income  \
Middle Middle Class                                 14                 1857   
Lower Middle Income                                677                   25   
Low Income (but not poor)                         2737                    0   
Upper Middle Income                                  0                 7267   
Upper Income (but no

In [17]:
# Summary of the model
from IPython.display import display, HTML

html_table = f"""
<table border="1" style="border-collapse: collapse; width: 100%;">
    <thead>
        <tr>
            <th>Train Loss</th>
            <th>Train Accuracy</th>
            <th>Train Precision</th>
            <th>Train Recall</th>
            <th>Train F1 Score</th>
            <th>Test Loss</th>
            <th>Test Accuracy</th>
            <th>Test Precision</th>
            <th>Test Recall</th>
            <th>Test F1 Score</th>
        </tr>
    </thead>

    <tbody>
        <tr>
            <td>{best_train_loss:.4f}</td>
            <td>{best_train_accuracy:.2%}</td>
            <td>{best_train_precision:.2%}</td>
            <td>{best_train_recall:.2%}</td>
            <td>{best_train_f1:.2%}</td>
            <td>{test_loss:.4f}</td>
            <td>{test_accuracy:.2%}</td>
            <td>{test_precision:.2%}</td>
            <td>{test_recall:.2%}</td>
            <td>{test_f1:.2%}</td>
        </tr>
    </tbody>
</table>
"""

display(HTML(html_table))

Train Loss,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Test Loss,Test Accuracy,Test Precision,Test Recall,Test F1 Score
0.4923,79.34%,74.66%,67.57%,70.66%,0.5875,76.61%,70.28%,63.83%,66.63%
