In [1]:
import datasets
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import tqdm
from torch.utils.tensorboard import SummaryWriter

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE IS ... ", device)

DEVICE IS ...  cuda


In [13]:
class MultiTaskNet(nn.Module):
    def __init__(
        self, embed_dim=11348, layer_sizes=[2048, 500, 2048, 500]
    ):
        super().__init__()

        self.embedding_dim = embed_dim

        self.mlp_net = nn.Sequential(
            nn.Linear(layer_sizes[0], layer_sizes[1]),  ## 96x64
            nn.ReLU(),
            nn.Linear(layer_sizes[1], layer_sizes[2]),  ## 64x1
            nn.ReLU(),
            nn.Linear(layer_sizes[2], layer_sizes[3]),
        )

        self.last_layer = nn.Linear(
            layer_sizes[3], 1
        )  ## change if we need classification or softmax

    def forward(self, x):
        x = self.mlp_net(x)

        out_x = self.last_layer(x)

        return out_x

In [8]:
epochs = 20
lr = 1e-4
batch_size = 16


In [7]:
with open("data/dev_random_proj.pt", "rb") as f:
    dev = torch.load(f)

dev_labels= pd.read_parquet("data/dev_20221130.parquet.gzip", columns=["Mean_BMI", "Under5_Mortality_Rate"])

In [9]:
def collator_fn(data):
    x, y_df = data
    x_inp = x.to(device)
    y_bmi = torch.tensor(y_df["Mean_BMI"].values, dtype=torch.float32, device=device)
    y_cmr = torch.tensor(
        y_df["Under5_Mortality_Rate"].values, dtype=torch.float32, device=device
    )
    return x_inp, y_bmi, y_cmr

tmp = DataLoader((dev,dev_labels), batch_size=batch_size, collate_fn=collator_fn)

In [10]:
tmp[0]

TypeError: 'DataLoader' object is not subscriptable

In [14]:
print("Model loading")
model = MultiTaskNet().to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

Model loading


In [15]:
dev.shape

torch.Size([12100, 2048])

In [11]:
def masked_mse(output, target):
    mse_loss = nn.MSELoss()
    mask = torch.isnan(target)
    target = torch.where(mask, 0.0, target)
    output = torch.where(mask, 0.0, output)
    return mse_loss(target, output)

In [36]:
def r2_loss(output, target):
    
    
    target_mean = torch.nanmean(target)
    ss_tot = torch.nansum(((target - target_mean)) ** 2)
    ss_res = torch.nansum(((target - output)) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

In [12]:
a = torch.tensor([1.,2.,3.,4.,10.])
b = torch.tensor([1.,2.,float('nan'),4.,5.])

In [13]:
masked_mse(a, b)

tensor(5.)

In [3]:
dev_labels = pd.read_parquet(
    "data/dev_20221130.parquet.gzip", columns=["Mean_BMI", "Under5_Mortality_Rate"]
)

In [4]:
dev_labels

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean_BMI,Under5_Mortality_Rate
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PH,101293,,0.00
PE,98708,24.91,7.89
GH,34990,,
ML,81759,19.89,8.49
LS,78953,23.90,12.20
...,...,...,...
ZM,118281,24.89,12.12
BF,6497,18.85,24.42
PE,96202,,
CO,16927,24.20,7.41
