In [1]:
import hyperparams
import mlp_regressor
from mlp_regressor import MLPRegressor

# Load vectors

In [2]:
import json

import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Load X data
# Location of the .npz archive that holds the normalized representations
# and the row indices that identify each row.
INPUT_NPZ_PATH = 'data/filtered_vectors.npz'

# Every later cell needs both arrays, so a failed load has to stop the
# notebook here instead of surfacing later as a NameError on `norm_reps`.
try:
    with np.load(INPUT_NPZ_PATH) as npz_archive:
        norm_reps = npz_archive['representations_normalized']
        row_indices = npz_archive['row_indices']
except FileNotFoundError:
    # Keep the friendly hint, then re-raise so execution does not continue.
    print(f"Error: The file '{INPUT_NPZ_PATH}' was not found.")
    raise
# Any other failure (e.g. a missing array key) propagates with its full
# traceback rather than being reduced to a one-line message.

In [4]:
# Load y data

# Parse the per-entity records from the JSON file.
with open('data/score_by_entity.json', encoding='utf-8') as fh:
    data = json.load(fh)

# Index each record's score by its vector_index so lookups are O(1).
score_dict = dict((entry['vector_index'], entry['score']) for entry in data)

# Collect the scores in the same order as row_indices so they line up
# with the rows of the representation matrix.
ordered_scores = [score_dict[row_idx] for row_idx in row_indices]
scores = np.array(ordered_scores)

In [5]:
# Load stratification data
with open('data/score_by_entity.json', encoding='utf-8') as fh:
    data = json.load(fh)

# Index each record's 'pais' value by its vector_index so lookups are O(1).
country_dict = dict((entry['vector_index'], entry['pais']) for entry in data)

# Collect the 'pais' values in the order determined by row_indices; the
# resulting array is what the train/test split stratifies on.
country_labels = [country_dict[row_idx] for row_idx in row_indices]
countries = np.array(country_labels)

In [6]:
X, y = norm_reps, scores

# Hold out 20% of the rows as a test set. The split is seeded with
# hyperparams.RANDOM_SEED and stratified on `countries`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=hyperparams.RANDOM_SEED, stratify=countries
)

# Wrap the training split as float32 tensors so it can be batched.
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

# Seed the shuffle order as well as the split: with shuffle=True and no
# generator the batch order comes from the global torch RNG and differs
# between runs. When RANDOM_SEED is not a plain int (e.g. None) the
# generator stays None, which is the DataLoader default.
shuffle_rng = None
if isinstance(hyperparams.RANDOM_SEED, int):
    shuffle_rng = torch.Generator().manual_seed(hyperparams.RANDOM_SEED)

# Batched, shuffled iterator over the training split.
train_loader = DataLoader(
    train_dataset,
    batch_size=hyperparams.BATCH_SIZE,
    shuffle=True,
    generator=shuffle_rng,
)

# Probe model

In [6]:
# Instantiate MLPRegressor with the hyperparameters; the input width is the
# number of columns in X.
model = MLPRegressor(
    input_size=X.shape[1],
    optimizer=hyperparams.OPTIM,
    learning_rate=hyperparams.LR,
    max_iter=hyperparams.MAX_ITER
)

# Use the GPU when one is available; otherwise stay on the CPU instead of
# raising, which an unconditional model.cuda() does on a machine without CUDA.
# NOTE(review): whether MLPRegressor.fit moves its batches to the model's
# device is not visible here — confirm before relying on the CPU path.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

MLPRegressor(
  (criterion): MSELoss()
  (layers): ModuleList(
    (0): Linear(in_features=3072, out_features=1, bias=False)
  )
)

In [9]:
# Fit the model, passing the batched training loader, the training targets,
# and the held-out features and targets.
# NOTE(review): how fit() uses y_train alongside train_loader is not visible
# here — confirm against MLPRegressor.fit.
# NOTE(review): the recorded output of this cell shows `test_loss` identical
# at every epoch while `train_loss` changes — confirm how the test split is
# evaluated inside fit().
model.fit(train_loader, y_train, X_test, y_test)

Epoch 1/100


  test_pearson_corr, test_pearson_p_value = pearsonr(result_df["preds"], result_df["target"])


{'test_loss': 0.03678825497627258, 'train_loss': 0.0011479860366662672, 'test_pearson_corr': np.float32(nan)}
Epoch 2/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.0011073691190873452, 'test_pearson_corr': np.float32(0.0011790118)}
Epoch 3/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.001105576884605257, 'test_pearson_corr': np.float32(0.008325754)}
Epoch 4/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.0011049671735838, 'test_pearson_corr': np.float32(0.007995705)}
Epoch 5/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.001104767539610617, 'test_pearson_corr': np.float32(0.008222784)}
Epoch 6/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.001105273840775444, 'test_pearson_corr': np.float32(0.008445206)}
Epoch 7/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.001105280389907182, 'test_pearson_corr': np.float32(0.0086628925)}
Epoch 8/100
{'test_loss': 0.03678825497627258, 'train_loss': 0.001109763798164672, 'test_pearson_corr': np.float32(

In [7]:
import os
import torch

# Persist only the model's state_dict (the recommended binary format), not
# the whole model object.
save_dir = 'models'
state_path = os.path.join(save_dir, 'mlp_regressor_state_dict.pth')

# Create the output directory if it is missing, then write the file.
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), state_path)

print(f"Saved state_dict -> {state_path}")

Saved state_dict -> models\mlp_regressor_state_dict.pth
