In [1]:
import hyperparams
import mlp_regressor
from mlp_regressor import MLPRegressor

# Load vectors

In [13]:
import json

import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader

In [14]:
# Load X data
INPUT_NPZ_PATH = 'data/filtered_vectors.npz'

# Both arrays are read while the archive is open; they remain usable afterwards.
try:
    with np.load(INPUT_NPZ_PATH) as npz_archive:
        norm_reps = npz_archive['representations_normalized']
        row_indices = npz_archive['row_indices']
except FileNotFoundError:
    # Keep the friendly hint, but re-raise: continuing would leave norm_reps /
    # row_indices undefined and make later cells fail with a confusing NameError.
    print(f"Error: The file '{INPUT_NPZ_PATH}' was not found.")
    raise

# The split cell indexes norm_reps, scores and row_indices with the same
# positions, so the two arrays loaded here must have one entry per row.
if len(norm_reps) != len(row_indices):
    raise ValueError(
        f"'{INPUT_NPZ_PATH}' is inconsistent: {len(norm_reps)} vectors "
        f"but {len(row_indices)} row indices."
    )

In [15]:
# Load y data

# Read the per-entity records; the 'vector_index' and 'score' fields of each
# record are used below.
with open('data/score_by_entity.json', encoding='utf-8') as fh:
    data = json.load(fh)

# Index the scores by vector_index so each lookup below is a plain dict access
score_dict = dict((item['vector_index'], item['score']) for item in data)

# Targets, emitted in the same order as row_indices
scores = np.array([score_dict[vector_index] for vector_index in row_indices])

In [16]:
# Load stratification data

# Map each vector_index to its country label (the 'pais' field of the records)
country_dict = dict((item['vector_index'], item['pais']) for item in data)

# Country labels, emitted in the same order as row_indices
countries = np.array([country_dict[vector_index] for vector_index in row_indices])

In [17]:
# Split data into training and testing sets with stratification
# Splitting positional indices (rather than the arrays themselves) keeps
# norm_reps, scores and row_indices aligned after the split.

entity_indices = np.arange(len(norm_reps))
train_idx, test_idx = train_test_split(
    entity_indices,
    test_size=0.2,
    random_state=hyperparams.RANDOM_SEED,
    stratify=countries  # keep the per-country proportions in both subsets
)

# Index into arrays using the computed indices
X_train, X_test = norm_reps[train_idx], norm_reps[test_idx]
y_train, y_test = scores[train_idx], scores[test_idx]
row_indices_train, row_indices_test = row_indices[train_idx], row_indices[test_idx]

# The training TensorDataset and DataLoader are built in the next cell; the
# duplicate TensorDataset construction that used to live here was removed.


In [18]:
# Convert to PyTorch tensors and create DataLoader
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

# Seed the shuffling with the same seed used for the train/test split so the
# batch order is reproducible across kernel restarts.
# assumes hyperparams.RANDOM_SEED is an int — TODO confirm
shuffle_generator = torch.Generator().manual_seed(hyperparams.RANDOM_SEED)

# Create DataLoader with batch size
train_loader = DataLoader(
    train_dataset,
    batch_size=hyperparams.BATCH_SIZE,
    shuffle=True,
    generator=shuffle_generator,
)

# Probe model

In [10]:
import os

In [19]:
# Instantiate MLPRegressor with the hyperparameters
model = MLPRegressor(
    input_size=X_train.shape[1],  # one input feature per vector component
    optimizer=hyperparams.OPTIM,
    learning_rate=hyperparams.LR,
    max_iter=hyperparams.MAX_ITER
)

# Prefer the GPU but fall back to CPU so the cell does not crash on machines
# without CUDA. On a CUDA host this is equivalent to model.cuda().
# NOTE(review): confirm MLPRegressor.fit places batches on the model's device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

MLPRegressor(
  (criterion): MSELoss()
  (layers): ModuleList(
    (0): Linear(in_features=3072, out_features=1, bias=False)
  )
)

In [20]:
# Train the probe; the held-out arrays are passed in so fit can report
# test metrics per epoch (see the recorded output below).
# NOTE(review): in the recorded run test_loss is identical for every epoch
# shown and test_pearson_corr is nan at epoch 1 — verify the evaluation path
# inside MLPRegressor.fit.
model.fit(
    train_loader,
    y_train,
    X_test,
    y_test,
)

Epoch 1/100


  test_pearson_corr, test_pearson_p_value = pearsonr(result_df["preds"], result_df["target"])


{'test_loss': 0.029070734977722168, 'train_loss': 0.0009594603416509636, 'test_pearson_corr': np.float32(nan)}
Epoch 2/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.000954836876614504, 'test_pearson_corr': np.float32(0.0030897106)}
Epoch 3/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.0009609045821876062, 'test_pearson_corr': np.float32(0.003280031)}
Epoch 4/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.000955092746160433, 'test_pearson_corr': np.float32(0.0036366396)}
Epoch 5/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.000961120047283312, 'test_pearson_corr': np.float32(0.0039797565)}
Epoch 6/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.000955341035562089, 'test_pearson_corr': np.float32(0.004307433)}
Epoch 7/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.0009559252484608161, 'test_pearson_corr': np.float32(0.004622474)}
Epoch 8/100
{'test_loss': 0.029070734977722168, 'train_loss': 0.0009549126725520033, 'test_pearson_corr'

In [21]:
# Persist only the model's state_dict (binary file) under save_dir.
save_dir = 'models'
state_path = os.path.join(save_dir, 'mlp_regressor_state_dict.pth')

# Create the target directory first; an already-existing one is fine
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), state_path)

print(f"Saved state_dict -> {state_path}")

Saved state_dict -> models\mlp_regressor_state_dict.pth


# Correlation analysis

In [None]:
# Country label of every test-set entity, in test_idx order
test_countries = np.take(countries, test_idx)

In [24]:
# Tally how many entities each country contributes to the `countries` array
unique, counts = np.unique(countries, return_counts=True)
country_counts = {name: count for name, count in zip(unique, counts)}
country_counts

{np.str_('argentina'): np.int64(675),
 np.str_('chile'): np.int64(617),
 np.str_('colombia'): np.int64(725),
 np.str_('costa_rica'): np.int64(823),
 np.str_('cuba'): np.int64(651),
 np.str_('ecuador'): np.int64(705),
 np.str_('el_salvador'): np.int64(777),
 np.str_('guatemala'): np.int64(743),
 np.str_('honduras'): np.int64(918),
 np.str_('mexico'): np.int64(533),
 np.str_('nicaragua'): np.int64(726),
 np.str_('panama'): np.int64(767),
 np.str_('paraguay'): np.int64(887),
 np.str_('peru'): np.int64(561),
 np.str_('republica_dominicana'): np.int64(837),
 np.str_('usa'): np.int64(420),
 np.str_('venezuela'): np.int64(649)}