In [49]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.tensorboard import SummaryWriter
# TensorBoard event writer; the training loop logs its per-epoch loss through it.
writer = SummaryWriter()

# Compact console output for arrays and tensors: two decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=2)
torch.set_printoptions(sci_mode=False, precision=2)

In [9]:
# It is important that your model and all data are on the same device.
# Preference order: CUDA GPU, then Apple-silicon MPS, then CPU.

# `torch.backends.mps` is looked up defensively so that a PyTorch build that
# does not ship the sub-module falls through to the CPU branch instead of
# raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif _mps_backend is not None and _mps_backend.is_available():
    # Added for MacOS
    device = torch.device("mps")
else:
    device = torch.device("cpu")
    # Only explain the missing MPS backend when we actually ended up on the
    # CPU; on a CUDA machine the message would just be noise.
    if _mps_backend is None or not _mps_backend.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

device

device(type='mps')

In [80]:
def get_data1(csv_path="../../ICPSR_IHDS-1/Simon_Personal_Aggregates/basic_features.csv"):
    '''
    Load the feature table from a CSV file and convert it to tensors.

    Every cell that is blank, missing or not parseable as a number is encoded
    as -1. All feature columns are then cast to 64-bit integers, so fractional
    values are truncated.

    args:
    - csv_path: path of the CSV file. It must contain the columns
                'household_income' (used as label) and 'person_id' (dropped).

    returns:
    - datainp_to: LongTensor[N, F]   all feature columns, in CSV column order
    - datalab_to: FloatTensor[N]     the 'household_income' column
    - input_dict: dict {column name: LongTensor[N]} with the same F columns

    where
     - N = number of rows in the CSV
     - F = number of columns left after removing label and 'person_id'
    '''

    data = pd.read_csv(csv_path)

    # Step 1: Replace ' ' and '' with -1
    data = data.replace([' ', ''], -1)
    # Step 2: Try to convert everything to numeric, forcing errors to NaN
    data = data.apply(pd.to_numeric, errors='coerce')
    # Step 3: Replace NaNs with -1
    data = data.fillna(-1)

    data_label = data['household_income']
    data_input = data.drop(['household_income', 'person_id'], axis=1)

    # np.int64 instead of np.long: the latter is missing from NumPy 1.24-1.26
    # and is a platform-dependent C long in NumPy 2.
    datainp_np = data_input.to_numpy().astype(np.int64)
    datainp_to = torch.tensor(datainp_np, dtype=torch.long)

    datalab_np = data_label.to_numpy().astype(np.float32)
    datalab_to = torch.tensor(datalab_np, dtype=torch.float32)

    # Per-column view of the same features, keyed by column name.
    input_dict = {
        col: torch.tensor(data_input[col].values, dtype=torch.long)
        for col in data_input.columns
    }

    return datainp_to, datalab_to, input_dict
    
    

datainp_to, datalab_to, input_dict = get_data1()
# Show the values that were just loaded. Calling get_data1() a second time
# only to display its result would read and parse the whole CSV again.
datainp_to, datalab_to, input_dict

(tensor([[     0,      1,      2,  ...,      6,  48000,     14],
         [     1,      1,      2,  ...,      6,     -1,     14],
         [     2,      1,      2,  ...,      6,  18000,     14],
         ...,
         [215751,     34,      0,  ...,      3,     -1,     15],
         [215752,     34,      0,  ...,      3,     -1,     15],
         [215753,     34,      0,  ...,      3,     -1,     15]]),
 tensor([52871.68, 52871.68, 52871.68,  ..., 84100.00, 64800.00, 64800.00]),
 {'Unnamed: 0': tensor([     0,      1,      2,  ..., 215751, 215752, 215753]),
  'stateid': tensor([ 1,  1,  1,  ..., 34, 34, 34]),
  'distid': tensor([2, 2, 2,  ..., 0, 0, 0]),
  'distname': tensor([ 102,  102,  102,  ..., 3400, 3400, 3400]),
  'household_id': tensor([ 1,  1,  1,  ..., 14, 15, 15]),
  'sex': tensor([1, 2, 1,  ..., 2, 2, 2]),
  'age': tensor([50, 45, 22,  ...,  9, 60, 23]),
  'attended_school': tensor([0, 0, 1,  ..., 1, 1, 1]),
  'enrolled_or_completed': tensor([-1, -1,  0,  ...,  1,  0,  0]),


In [81]:
class Model(nn.Module):
    '''
    Regression network for household income.

    Each categorical column is passed through its own embedding table; the
    embedding vectors are concatenated with the raw numerical columns and fed
    to a small two-layer MLP that emits one value per sample.

    NOTE(review): nn.Embedding only accepts indices in [0, num_embeddings).
    The loader in this notebook encodes missing values as -1, and its sample
    output shows e.g. stateid up to 34 and distname up to 3400, which do not
    fit the table sizes below. The raw values presumably need to be remapped
    to contiguous non-negative indices before training - confirm.
    '''

    def __init__(self):
        super().__init__()

        # Define embedding sizes according to your "unique" numbers:
        # nn.Embedding(number of distinct categories, embedding width)
        self.embeddings = nn.ModuleDict({
            'stateid': nn.Embedding(33, 8),
            'distid': nn.Embedding(61, 8),
            'distname': nn.Embedding(375, 16),
            'household_id': nn.Embedding(52, 8),
            'sex': nn.Embedding(2, 2),
            'attended_school': nn.Embedding(2, 2),
            'enrolled_or_completed': nn.Embedding(2, 2),
            'ever_repeated': nn.Embedding(2, 2),
            'englisch_ability': nn.Embedding(3, 2),
            'highest_degree': nn.Embedding(6, 4),
            'caste': nn.Embedding(8, 4),
            'hhassets': nn.Embedding(31, 8),
        })

        # Columns that are used as plain numbers (no embedding, no extra layer)
        self.numerical_features = ['age', 'years_of_education', 'Ann_earnings_tot']

        # Width of the concatenated feature vector. Derived from the layers
        # themselves so it cannot drift out of sync with the table above.
        embedding_dim = sum(emb.embedding_dim for emb in self.embeddings.values())
        numerical_dim = len(self.numerical_features)
        total_input_dim = embedding_dim + numerical_dim

        # Example simple MLP after concatenation
        self.fc = nn.Sequential(
            nn.Linear(total_input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Output: predict household_income
        )

    def forward(self, x):
        '''
        args:
        - x: dict mapping feature name -> 1-D tensor with one entry per sample.
             Must contain every key of self.embeddings (integer indices) and
             every name in self.numerical_features. Extra keys are ignored.

        returns:
        - Tensor[batch] with one prediction per sample
        '''
        # Look up and concatenate all embeddings -> [batch, embedding_dim]
        embedded_features = torch.cat(
            [emb_layer(x[key]) for key, emb_layer in self.embeddings.items()],
            dim=1,
        )

        # Numerical columns -> [batch, numerical_dim]. They are cast explicitly
        # to the embedding dtype so that integer or float64 inputs do not rely
        # on implicit type promotion inside torch.cat.
        numerical_data = torch.cat(
            [
                x[feature].unsqueeze(1).to(embedded_features.dtype)
                for feature in self.numerical_features
            ],
            dim=1,
        )

        # concatenate embeddings and numerical features
        all_features = torch.cat([embedded_features, numerical_data], dim=1)

        output = self.fc(all_features)
        return output.squeeze(1)  # squeeze for regression

In [None]:
%tensorboard --logdir=runs

In [87]:
def train_model(input_dict, y, n_epochs=100, batch_size=64, lr=0.001,
                checkpoint_path='NN_models/model_state.pt'):
    '''
    Train a fresh Model on the given features and targets.

    args:
    - input_dict:      dict {feature name: 1-D tensor}, one entry per sample.
                       All tensors are stacked column-wise into one matrix, so
                       they must have the same length (and presumably the same
                       dtype - confirm against the loader).
    - y:               tensor of regression targets, one per sample
    - n_epochs:        number of passes over the data
    - batch_size:      mini-batch size
    - lr:              initial Adam learning rate (halved every 10 epochs)
    - checkpoint_path: file the state dict is written to every 10th epoch

    returns:
    - the trained Model (left in training mode, on the global `device`)

    raises:
    - ValueError if there are no samples to train on
    '''
    import os  # local import: only needed to create the checkpoint directory

    # --- 1. Initialize model ---
    model = Model()
    model.train()
    model.to(device)

    # --- 2. Set up training elements ---
    criterion = torch.nn.MSELoss()  # for regression (predicting continuous household_income)
    # criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    # --- 3. Prepare Dataset and DataLoader ---
    # Convert input_dict into one big tensor, since TensorDataset expects tensors.
    # Column i of the matrix belongs to feature_names[i].
    feature_names = list(input_dict.keys())
    input_features = torch.cat(
        [input_dict[key].unsqueeze(1) for key in feature_names], dim=1
    )

    dataset = TensorDataset(input_features, y)
    if len(dataset) == 0:
        raise ValueError("train_model needs at least one sample")
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # torch.save does not create missing directories.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # --- 4. Training loop ---
    try:
        for epoch in range(n_epochs):
            epoch_loss = 0.0

            for batch_inputs, batch_targets in tqdm(
                    data_loader, desc=f"Training Epoch {epoch + 1}", leave=False):

                # One transfer per batch instead of one per column.
                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)

                # Split the matrix back into the dict the model expects.
                batch_input_dict = {
                    key: batch_inputs[:, idx]
                    for idx, key in enumerate(feature_names)
                }

                optimizer.zero_grad()
                outputs = model(batch_input_dict)
                loss = criterion(outputs, batch_targets)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

            scheduler.step()

            # Mean of the per-batch losses
            epoch_loss = epoch_loss / len(data_loader)
            print(f'Epoch {epoch+1}/{n_epochs} - Loss: {epoch_loss:.4f}')
            writer.add_scalar("Loss/train", epoch_loss, epoch)

            if (epoch % 10 == 9):
                print("Saving current state of the model")
                torch.save(model.state_dict(), checkpoint_path)
    finally:
        # Make sure the logged scalars reach disk even if training is
        # interrupted (e.g. by stopping the cell).
        writer.flush()

    return model

# Keep a handle on the trained network; otherwise the only copy left after
# this cell is whatever checkpoint was last written to disk.
model = train_model(input_dict, datalab_to)

                                                                                                                                                                                     

Epoch 1/100 - Loss: 9908532075.2361
Saving current state of the model


                                                                                                                                                                                     

Epoch 2/100 - Loss: 8021059654.8707


                                                                                                                                                                                     

Epoch 3/100 - Loss: 7594948185.4140


                                                                                                                                                                                     

Epoch 4/100 - Loss: 7368180045.9122


                                                                                                                                                                                     

Epoch 5/100 - Loss: 7227621180.9063


                                                                                                                                                                                     

Epoch 6/100 - Loss: 7121728182.3203


                                                                                                                                                                                     

Epoch 7/100 - Loss: 7038876201.8126


                                                                                                                                                                                     

Epoch 8/100 - Loss: 6973943199.8292


                                                                                                                                                                                     

Epoch 9/100 - Loss: 6925229002.4958


                                                                                                                                                                                     

KeyboardInterrupt: 