<a href="https://colab.research.google.com/github/sfgeekgit/lesswrongDataDzppg/blob/main/lesswrong_challenge_colonizing_the_superhypersphere.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LessWrong challenge: Colonizing the SuperHyperSphere

problem: https://www.lesswrong.com/posts/Rpjrwspx2QZuHbmPE/d-and-d-sci-fi-colonizing-the-superhypersphere-evaluation

submit: https://h-b-p.github.io/d-and-d-sci-SuperHyperSphere/

## setup

In [None]:
!wget https://raw.githubusercontent.com/sfgeekgit/lesswrongDataDzppg/main/cleared_sites_formated.csv
!wget https://raw.githubusercontent.com/sfgeekgit/lesswrongDataDzppg/main/measured_data.csv

--2024-01-25 22:47:33--  https://raw.githubusercontent.com/sfgeekgit/lesswrongDataDzppg/main/cleared_sites_formated.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11480603 (11M) [text/plain]
Saving to: ‘cleared_sites_formated.csv’


2024-01-25 22:47:33 (105 MB/s) - ‘cleared_sites_formated.csv’ saved [11480603/11480603]

--2024-01-25 22:47:33--  https://raw.githubusercontent.com/sfgeekgit/lesswrongDataDzppg/main/measured_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1175467 (1.1M) [text/plain]
Saving to: ‘measured_d

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
# Load the downloaded CSVs. `labeled` carries the measured 'ZPPG_Performance'
# target; `unlabeled` holds the sites that are ranked by the model later on.
labeled = pd.read_csv('measured_data.csv')
unlabeled = pd.read_csv('cleared_sites_formated.csv')

# Every column except the two non-feature ones ('ZPPG_id', 'ZPPG_Performance')
# is a model input.
n_features = len(labeled.columns) - 2

In [None]:
# Hold out 10% of the labeled rows as a test set; the fixed random_state keeps
# the split identical between runs.
train_data2, test_data = train_test_split(labeled, test_size=0.1, random_state=1)

# Separate the training target from the training features.
train_data = train_data2.drop(columns=['ZPPG_id', 'ZPPG_Performance'])
train_labels = train_data2['ZPPG_Performance']

print(f'train data: {len(train_data)}')
print(f'test data:  {len(test_data)}')

train data: 9366
test data:  1041


## utils

In [None]:
@torch.no_grad()
def evaluate(model, dataset):
  """Return the mean absolute error of `model` on a labeled DataFrame.

  Args:
    model: torch module that maps a float feature tensor to one prediction per row.
    dataset: DataFrame with 'ZPPG_id' and 'ZPPG_Performance' columns; every
      other column is fed to the model as a feature.

  Returns:
    Mean of |prediction - ZPPG_Performance| over all rows, as a Python float.

  The forward pass runs in eval mode, and the model is put back into whatever
  mode (train or eval) it was in when this function was called.
  """
  # Remember the caller's mode so that evaluating never flips an eval-mode
  # model back into training mode as a side effect.
  was_training = model.training
  # Place the inputs on the device the model's parameters live on, instead of
  # relying on a notebook-level variable being in sync with the model.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')
  model.eval()
  try:
    features = dataset.drop(['ZPPG_id', 'ZPPG_Performance'], axis=1)
    preds = model(torch.tensor(features.values).float().to(target_device))
    labels = torch.tensor(dataset['ZPPG_Performance'].values).view(-1, 1).to(target_device)
    error = (preds - labels).abs().mean().item()
  finally:
    model.train(was_training)
  return error

In [None]:
@torch.no_grad()
def topn(model, dataset, n=15):
  """Return the `n` rows with the highest predictions.

  Args:
    model: torch module that maps a float feature tensor to one prediction per row.
    dataset: DataFrame with a 'ZPPG_id' column and the feature columns; a
      'ZPPG_Performance' column, if present, is dropped before prediction.
    n: number of top-ranked rows to return.

  Returns:
    A list of (prediction, ZPPG_id) tuples sorted from highest to lowest
    prediction, at most `n` long.

  The forward pass runs in eval mode, and the model is put back into whatever
  mode (train or eval) it was in when this function was called.
  """
  to_drop = ['ZPPG_id']
  if 'ZPPG_Performance' in dataset:
    to_drop.append('ZPPG_Performance')
  # Remember the caller's mode so ranking does not leave the model stuck in
  # eval mode if it is called in the middle of training.
  was_training = model.training
  # Place the inputs on the device the model's parameters live on.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')
  model.eval()
  try:
    features = torch.tensor(dataset.drop(to_drop, axis=1).values).float().to(target_device)
    preds = model(features)
  finally:
    model.train(was_training)
  preds_id = list(zip(
    preds.view(-1).tolist(),
    dataset['ZPPG_id'].values.tolist()))
  return sorted(preds_id, reverse=True)[:n]

## model and train

In [None]:
class MLP(nn.Module):
  """Fully connected regressor with a single output.

  The network is `n_blocks` repetitions of
  Linear -> BatchNorm1d -> ReLU -> Dropout, followed by a final Linear layer
  that maps `hidden` units to one value.

  Args:
    n_inputs: number of input features (width of the first Linear layer).
    hidden: width of every hidden layer.
    dropout: drop probability used by every Dropout layer.
    n_blocks: number of hidden blocks; the default of 7 reproduces the
      original hand-written architecture, including the layer indices inside
      `self.model`.
  """
  def __init__(self, n_inputs=n_features, hidden=32, dropout=0.2, n_blocks=7):
    super().__init__()
    layers = []
    in_dim = n_inputs
    for _ in range(n_blocks):
      layers += [
          nn.Linear(in_dim, hidden),
          nn.BatchNorm1d(hidden),
          nn.ReLU(),
          nn.Dropout(dropout),
      ]
      # Only the first block sees the raw features; the rest are hidden -> hidden.
      in_dim = hidden
    layers.append(nn.Linear(in_dim, 1))
    self.model = nn.Sequential(*layers)

  def forward(self, x):
    """Run the feature batch `x` through the stack and return the predictions."""
    return self.model(x)

In [None]:
# Seed torch before building the model so weight initialisation (and the dropout
# masks drawn during training) are repeatable under Restart -> Run All.
torch.manual_seed(1)
model = MLP().to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

In [None]:
def train(model, epochs=10000, train_set=None, test_set=None, log_every=200):
  """Train `model` with full-batch MSE loss using the notebook-level `optimizer`.

  Args:
    model: module to fit; `optimizer` must have been built from its parameters.
    epochs: number of full-batch gradient steps.
    train_set: labeled DataFrame ('ZPPG_id', 'ZPPG_Performance' plus features)
      to fit on. Defaults to all of `labeled`.
    test_set: labeled DataFrame used for the reported `error_test`. When it is
      omitted, the training-set error is reported in both columns.
    log_every: print a progress line every this many epochs (must be positive).

  The model is left in eval mode when training finishes.
  """
  if train_set is None:
    train_set = labeled
  model.train()
  labels = torch.tensor(train_set['ZPPG_Performance'].values).float().view(-1, 1).to(device)
  data = train_set.drop(['ZPPG_id', 'ZPPG_Performance'], axis=1)
  data = torch.tensor(data.values).float().to(device)
  for epoch in range(epochs):
    # One optimisation step on the whole training set (no mini-batching).
    predictions = model(data)
    loss = F.mse_loss(predictions, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % log_every == 0:
      # `loss` is measured in train mode; the errors come from `evaluate`,
      # which runs the model in eval mode.
      error_train = evaluate(model, train_set)
      error_test = error_train if test_set is None else evaluate(model, test_set)
      print(f'{epoch=:5} {loss.item()=:7.4f} {error_test=:7.4f} {error_train=:7.4f}')
  model.eval()

train(model)

epoch=    0 loss.item()= 0.0009 error_test= 0.1059 error_train= 0.1059
epoch=  200 loss.item()= 0.0009 error_test= 0.1059 error_train= 0.1059
epoch=  400 loss.item()= 0.0010 error_test= 0.1056 error_train= 0.1056
epoch=  600 loss.item()= 0.0009 error_test= 0.1053 error_train= 0.1053
epoch=  800 loss.item()= 0.0009 error_test= 0.1046 error_train= 0.1046
epoch= 1000 loss.item()= 0.0009 error_test= 0.1045 error_train= 0.1045
epoch= 1200 loss.item()= 0.0009 error_test= 0.1059 error_train= 0.1059
epoch= 1400 loss.item()= 0.0009 error_test= 0.1062 error_train= 0.1062
epoch= 1600 loss.item()= 0.0010 error_test= 0.1070 error_train= 0.1070
epoch= 1800 loss.item()= 0.0009 error_test= 0.1065 error_train= 0.1065
epoch= 2000 loss.item()= 0.0009 error_test= 0.1069 error_train= 0.1069
epoch= 2200 loss.item()= 0.0009 error_test= 0.1064 error_train= 0.1064
epoch= 2400 loss.item()= 0.0009 error_test= 0.1050 error_train= 0.1050
epoch= 2600 loss.item()= 0.0010 error_test= 0.1051 error_train= 0.1051
epoch=

## eval

In [None]:
# Rank the unlabeled sites by predicted value and show the best ones as
# (prediction, ZPPG_id) pairs.
topn(model=model, dataset=unlabeled)

[(0.7313426733016968, 23565),
 (0.7297401428222656, 93762),
 (0.7287615537643433, 96286),
 (0.7213119268417358, 107278),
 (0.7154839038848877, 53987),
 (0.714455246925354, 88956),
 (0.7140936851501465, 80395),
 (0.7107514142990112, 94408),
 (0.7104818820953369, 94304),
 (0.7085078954696655, 905),
 (0.7079377174377441, 94942),
 (0.7079243659973145, 38055),
 (0.7075937986373901, 104260),
 (0.7075873613357544, 58945),
 (0.7071173191070557, 11558)]

In [None]:
# validation
# A cell only renders its last expression, so both rankings are returned in one
# dict; otherwise the `test_data` result would be computed and thrown away.
# NOTE(review): `train` fits on all of `labeled`, which includes `test_data`, so
# this is a sanity check rather than a held-out validation.
{
  'test': topn(model, test_data),
  'train': topn(model, train_data2),
}

[(1.8702857494354248, 3412),
 (1.8266371488571167, 2695),
 (1.8240541219711304, 7487),
 (1.8221070766448975, 2509),
 (1.821927547454834, 3773),
 (1.8169137239456177, 8721),
 (1.8135265111923218, 5386),
 (1.8075464963912964, 5751),
 (1.8061460256576538, 7243),
 (1.7997924089431763, 2004),
 (1.797755241394043, 5072),
 (1.7943958044052124, 4148),
 (1.7933684587478638, 6989),
 (1.791631817817688, 7146),
 (1.7850587368011475, 5667)]