In [103]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F

# To split the data
from sklearn.model_selection import train_test_split

from torch.utils.tensorboard import SummaryWriter

np.set_printoptions(precision=2, suppress=True)
torch.set_printoptions(precision=2, sci_mode=False)

In [9]:
# It is important that your model and all data are on the same device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Check that MPS is available - Added for MacOS
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    device = torch.device("mps")

device

device(type='mps')

In [91]:
def get_data1(test_size=0.2, random_seed=42):
    '''
    returns:
    - train_data_input: Tensor[N_train_samples, C, F]
    - train_data_label: Tensor[N_train_samples, C, P]
    - test_data_input:  Tensor[N_test_samples,  C, F]
    - test_date_label:  Tensor[N_test_samples,  C, P]

    where 
     - N_train_samples = number of train/test samples
     - C = number of channels (here 1)
     - F = number of features
     - P = number of predicted features
    '''
    
    data    = pd.read_csv("../../ICPSR_IHDS-1/Simon_Personal_Aggregates/basic_features.csv")
    
    # Step 1: Replace ' ' and '' with -1
    data = data.replace([' ', ''], -1)
    # Step 2: Try to convert everything to numeric, forcing errors to NaN
    data = data.apply(pd.to_numeric, errors='coerce')
    # Step 3: Replace NaNs with -1
    data = data.fillna(-1)

    data_label = data['household_income']
    data_input = data.drop(['household_income', 'person_id'], axis=1)

    # Split data to get test and train set -> For generalization error prediction
    X_train, X_test, y_train, y_test = train_test_split(
        data_input, data_label,
        test_size=test_size,
        random_state=random_seed
    )
    
    # Convert to numpy arrays
    X_train_np = X_train.to_numpy().astype(np.long)
    X_test_np = X_test.to_numpy().astype(np.long)

    y_train_np = y_train.to_numpy().astype(np.float32)
    y_test_np = y_test.to_numpy().astype(np.float32)
    
    train_y_to = torch.tensor(y_train_np, dtype=torch.float32)
    test_y_to = torch.tensor(y_test_np, dtype=torch.float32)

    # Also prepare input_dict (for models expecting dict inputs)
    X_dict_train = {col: torch.tensor(X_train[col].values, dtype=torch.long) for col in X_train.columns}
    X_dict_test  = {col: torch.tensor(X_test[col].values, dtype=torch.long)  for col in X_test.columns}

    return X_dict_train, train_y_to, X_dict_test, test_y_to
    
    # return datainp_to, datalab_to, input_dict
    
    

X_train, y_train, X_test, y_test = get_data1()
# get_data1()

In [81]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()

        # Define embedding sizes according to your "unique" numbers
        self.embeddings = nn.ModuleDict({
            'stateid': nn.Embedding(33, 8),
            'distid': nn.Embedding(61, 8),
            'distname': nn.Embedding(375, 16),
            'household_id': nn.Embedding(52, 8),
            'sex': nn.Embedding(2, 2),
            'attended_school': nn.Embedding(2, 2),
            'enrolled_or_completed': nn.Embedding(2, 2),
            'ever_repeated': nn.Embedding(2, 2),
            'englisch_ability': nn.Embedding(3, 2),
            'highest_degree': nn.Embedding(6, 4),
            'caste': nn.Embedding(8, 4),
            'hhassets': nn.Embedding(31, 8),
        })

        # Define linear layers for numerical features
        self.numerical_features = ['age', 'years_of_education', 'Ann_earnings_tot']
        
        # Final linear layer after concatenating all embeddings + numericals
        embedding_dim = (
            8 + 8 + 16 + 8 + 2 + 2 + 2 + 2 + 2 + 4 + 4 + 8  # sum of embedding dimensions
        )
        numerical_dim = len(self.numerical_features)  # 3

        total_input_dim = embedding_dim + numerical_dim

        # Example simple MLP after concatenation
        self.fc = nn.Sequential(
            nn.Linear(total_input_hdim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Output: predict household_income
        )

    def forward(self, x):
        # x is expected to be a dict with keys matching the feature names
        embedded_features = []

        for key, emb_layer in self.embeddings.items():
            embedded = emb_layer(x[key])
            embedded_features.append(embedded)

        # concatenate all embeddings
        embedded_features = torch.cat(embedded_features, dim=1)

        # concatenate numerical features
        numerical_data = [x[feature].unsqueeze(1) for feature in self.numerical_features]
        numerical_data = torch.cat(numerical_data, dim=1)

        # concatenate embeddings and numerical features
        all_features = torch.cat([embedded_features, numerical_data], dim=1)

        output = self.fc(all_features)
        return output.squeeze(1)  # squeeze for regression

In [97]:
%load_ext tensorboard
# !rm -rf ./runs/
%tensorboard --logdir=runs

'''
Income
• Mean: 59659.41
• Minimum: -108327.80
• Maximum: 6520261.00
• Standard Deviation: 94614.89
'''

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 91001), started 1:23:54 ago. (Use '!kill 91001' to kill it.)

In [104]:
writer = SummaryWriter()
def train_model(input_dict_train, y_train, input_dict_test, y_test, n_epochs=30):
    # --- 1. Initialize model ---
    model = Model()
    model.train()
    model.to(device)

    # --- 2. Set up training elements ---
    criterion = torch.nn.MSELoss()  # For regression
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    batch_size = 64

    # --- 3. Prepare training Dataset and DataLoader ---
    input_features_train = torch.cat(
        [input_dict_train[key].unsqueeze(1) for key in input_dict_train.keys()], dim=1
    )
    dataset_train = TensorDataset(input_features_train, y_train)
    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

    # --- 4. Prepare test data ---
    input_features_test = torch.cat(
        [input_dict_test[key].unsqueeze(1) for key in input_dict_test.keys()], dim=1
    )
    dataset_test = TensorDataset(input_features_test, y_test)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

    # --- 5. Training loop ---
    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0.0
        
        for batch_inputs, batch_targets in tqdm(train_loader, desc=f"Training Epoch {epoch}", leave=False):
            # Split batch back into dict
            batch_input_dict = {}
            idx = 0
            for key in input_dict_train.keys():
                batch_input_dict[key] = batch_inputs[:, idx].to(device).long()
                idx += 1

            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            outputs = model(batch_input_dict)
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        scheduler.step()
        epoch_loss = epoch_loss / len(train_loader)

        # --- Evaluate on test set ---
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for batch_inputs, batch_targets in tqdm(test_loader, desc=f"Testing model in Epoch {epoch}", leave=False):
                batch_input_dict = {}
                idx = 0
                for key in input_dict_test.keys():
                    batch_input_dict[key] = batch_inputs[:, idx].to(device).long()
                    idx += 1

                batch_targets = batch_targets.to(device)
                outputs = model(batch_input_dict)
                loss = criterion(outputs, batch_targets)
                test_loss += loss.item()

        test_loss = test_loss / len(test_loader)

        # --- Print training and test loss ---
        print(f'Epoch {epoch+1}/{n_epochs} - Train Loss: {epoch_loss:.4f} - Test Loss: {test_loss:.4f}')
        
        # Optional: log to tensorboard
        writer.add_scalar("Loss/train", epoch_loss, epoch)
        writer.add_scalar("Loss/test", test_loss, epoch)

        # Save model every 10 epochs
        if (epoch % 10 == 9):
            print("Saving current state of the model")
            torch.save(model.state_dict(), 'NN_models/model_state.pt')

    return model

trained_model = train_model(X_train, y_train, X_test, y_test)
writer.close()

                                                                                                                                                                                     

Epoch 1/100 - Train Loss: 10375127926.7215 - Test Loss: 7969774521.2681


                                                                                                                                                                                     

Epoch 2/100 - Train Loss: 8294437176.6199 - Test Loss: 7410380710.5896


                                                                                                                                                                                     

Epoch 3/100 - Train Loss: 7914178398.8728 - Test Loss: 7076260210.1570


                                                                                                                                                                                     

Epoch 4/100 - Train Loss: 7666876834.5273 - Test Loss: 6893344716.2311


                                                                                                                                                                                     

Epoch 5/100 - Train Loss: 7519837332.7401 - Test Loss: 6763514214.4000


                                                                                                                                                                                     

Epoch 6/100 - Train Loss: 7414227238.0156 - Test Loss: 6669449389.9852


                                                                                                                                                                                     

Epoch 7/100 - Train Loss: 7325945712.8128 - Test Loss: 6591909202.9630


                                                                                                                                                                                     

Epoch 8/100 - Train Loss: 7255353080.5488 - Test Loss: 6529062913.9437


                                                                                                                                                                                     

Epoch 9/100 - Train Loss: 7194111392.9848 - Test Loss: 6477950382.8385


                                                                                                                                                                                     

Epoch 10/100 - Train Loss: 7142009443.5714 - Test Loss: 6441281684.4326
Saving current state of the model


                                                                                                                                                                                     

Epoch 11/100 - Train Loss: 7105185495.2555 - Test Loss: 6412032908.3259


                                                                                                                                                                                     

Epoch 12/100 - Train Loss: 7084901824.3797 - Test Loss: 6392507363.2711


                                                                                                                                                                                     

Epoch 13/100 - Train Loss: 7067462968.6911 - Test Loss: 6377787908.2667


                                                                                                                                                                                     

Epoch 14/100 - Train Loss: 7048717192.6852 - Test Loss: 6361872094.2933


                                                                                                                                                                                     

Epoch 15/100 - Train Loss: 7034375751.4156 - Test Loss: 6346760004.1719


                                                                                                                                                                                     

Epoch 16/100 - Train Loss: 7018482161.1806 - Test Loss: 6334201001.5289


                                                                                                                                                                                     

Epoch 17/100 - Train Loss: 7005511100.6541 - Test Loss: 6324681175.0400


                                                                                                                                                                                     

Epoch 18/100 - Train Loss: 6992817942.7809 - Test Loss: 6312049323.6148


                                                                                                                                                                                     

Epoch 19/100 - Train Loss: 6981858867.3044 - Test Loss: 6301838302.9096


                                                                                                                                                                                     

Epoch 20/100 - Train Loss: 6971121173.5706 - Test Loss: 6300750363.7333
Saving current state of the model


                                                                                                                                                                                     

Epoch 21/100 - Train Loss: 6961944211.0078 - Test Loss: 6289214013.2504


                                                                                                                                                                                     

Epoch 22/100 - Train Loss: 6956097704.8395 - Test Loss: 6292134701.0370


                                                                                                                                                                                     

Epoch 23/100 - Train Loss: 6952783954.7942 - Test Loss: 6280678870.9452


                                                                                                                                                                                     

Epoch 24/100 - Train Loss: 6947200874.3344 - Test Loss: 6277924008.0119


                                                                                                                                                                                     

Epoch 25/100 - Train Loss: 6942961605.4816 - Test Loss: 6272719456.7585


                                                                                                                                                                                     

Epoch 26/100 - Train Loss: 6938808988.5947 - Test Loss: 6269638287.0281


                                                                                                                                                                                     

Epoch 27/100 - Train Loss: 6935262167.6707 - Test Loss: 6265303677.4874


                                                                                                                                                                                     

Epoch 28/100 - Train Loss: 6931517697.0085 - Test Loss: 6262138916.2667


                                                                                                                                                                                     

Epoch 29/100 - Train Loss: 6926730934.5554 - Test Loss: 6259492518.6844


                                                                                                                                                                                     

Epoch 30/100 - Train Loss: 6923563431.9140 - Test Loss: 6258990154.6193
Saving current state of the model


                                                                                                                                                                                     

KeyboardInterrupt: 