In [43]:
import datasets
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from torch.utils.tensorboard import SummaryWriter

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE IS ... ", device)

DEVICE IS ...  cuda


In [45]:
class MultiTaskNet(nn.Module):
    def __init__(self, embed_dim=11348, layer_sizes=[2048, 500, 2048, 500]):
        super().__init__()

        self.embedding_dim = embed_dim

        self.mlp_net = nn.Sequential(
            nn.Linear(layer_sizes[0], layer_sizes[1]),  ## 96x64
            nn.ReLU(),
            nn.Linear(layer_sizes[1], layer_sizes[2]),  ## 64x1
            nn.ReLU(),
            nn.Linear(layer_sizes[2], layer_sizes[3]),
        )

        self.last_layer = nn.Linear(
            layer_sizes[3], 1
        )  ## change if we need classification or softmax

    def forward(self, x):
        x = self.mlp_net(x)

        out_x = self.last_layer(x)

        return out_x

In [46]:
epochs = 20
lr = 1e-4
batch_size = 16


In [47]:
with open("data/test_random_proj.pt", "rb") as f:
    dev = torch.load(f)

dev_labels= pd.read_parquet("data/test_20221130.parquet.gzip", columns=["Mean_BMI", "Under5_Mortality_Rate"])

In [48]:
def collator_fn(data):
    x, y_df = data
    x_inp = x.to(device)
    y_bmi = torch.tensor(y_df["Mean_BMI"].values, dtype=torch.float32, device=device)
    y_cmr = torch.tensor(
        y_df["Under5_Mortality_Rate"].values, dtype=torch.float32, device=device
    )
    return x_inp, y_bmi, y_cmr

# dataloader = DataLoader((dev,dev_labels), batch_size=1, collate_fn=collator_fn)

In [49]:
dataloader = DataLoader(TensorDataset(*collator_fn((dev,dev_labels))), batch_size=1)

In [50]:
for idx, batch in enumerate(dataloader):
    x, y_bmi, y_cmr = batch
    break

In [51]:
y_bmi.shape

torch.Size([1])

In [96]:
def masked_mse(output, target):
    mse_loss = nn.MSELoss()
    mask = torch.isnan(target)
    target = torch.where(mask, 0.0, target)
    output = torch.where(mask, 0.0, output)
    return mse_loss(target, output)

def r2_loss(output, target):
    target_mean = torch.nanmean(target)
    ss_tot = torch.nansum(((target - target_mean)) ** 2)
    ss_res = torch.nansum(((target - output)) ** 2)
    r2 = 1 - ss_res / ss_tot

    mask = torch.isnan(target)
    return torch.where(mask, 0.0, r2)

In [83]:
print("Model loading")
model = MultiTaskNet().to(device)
loss_fn = masked_mse
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

Model loading


In [84]:
model.load_state_dict(torch.load("outputs/best_bmi.pth")["model_state_dict"])

<All keys matched successfully>

In [92]:
def evaluate_model(model, dataloader):
    mse_loss = []
    r2_losses = []

    for idx, batch in enumerate(dataloader):
        x, y_bmi, y_cmr = batch
        with torch.no_grad():
            outs = model(x).squeeze()
            loss = loss_fn(outs, y_bmi)
            r2_val_loss = r2_loss(outs, y_bmi)

            mse_loss.append(loss.item())
            r2_losses.append(r2_val_loss.item())
    # print(np.sum())
    mse_loss_avg = np.array(mse_loss).mean()
    r2_losses_avg = np.array(r2_losses).mean()

    return mse_loss_avg, r2_losses_avg

In [97]:
evaluate_model(model, dataloader)

tensor([23.7900], device='cuda:0') tensor(23.7900, device='cuda:0')
tensor([24.7500], device='cuda:0') tensor(24.7500, device='cuda:0')
tensor([23.3600], device='cuda:0') tensor(23.3600, device='cuda:0')
tensor([26.7700], device='cuda:0') tensor(26.7700, device='cuda:0')
tensor([27.7900], device='cuda:0') tensor(27.7900, device='cuda:0')
tensor([25.3000], device='cuda:0') tensor(25.3000, device='cuda:0')
tensor([24.9900], device='cuda:0') tensor(24.9900, device='cuda:0')
tensor([26.6000], device='cuda:0') tensor(26.6000, device='cuda:0')
tensor([35.7700], device='cuda:0') tensor(35.7700, device='cuda:0')
tensor([26.5300], device='cuda:0') tensor(26.5300, device='cuda:0')
tensor([24.3200], device='cuda:0') tensor(24.3200, device='cuda:0')
tensor([23.9400], device='cuda:0') tensor(23.9400, device='cuda:0')
tensor([24.0300], device='cuda:0') tensor(24.0300, device='cuda:0')
tensor([24.8500], device='cuda:0') tensor(24.8500, device='cuda:0')
tensor([26.8400], device='cuda:0') tensor(26.840

KeyboardInterrupt: 

In [61]:
with open("data/drop_cols.pickle", "rb") as f:
    drop = pickle.load(f)

In [62]:
test_df = pd.read_parquet("data/test_20221130.parquet.gzip")

In [63]:
test_df.drop(drop, errors='ignore', inplace=True, axis = 1)

In [64]:
df = test_df.iloc[27]

In [65]:
df

DHSCLUST_x                                        627.000000
DHSYEAR_x                                        2017.000000
LATNUM_x                                           41.312695
LONGNUM_x                                          19.823230
ET_water_median@CAS/IGSNRR/PML/V2&timestamped      -0.802911
                                                    ...     
motorized_only_travel_time_50000                   20.489632
walking_only_friction_surface_100000                0.033939
walking_only_travel_time_100000                   187.340560
motorized_only_friction_surface_100000              0.016490
motorized_only_travel_time_100000                  40.371296
Name: (AL, 1076), Length: 11333, dtype: float64

In [66]:
df_mean_std = pd.read_parquet("data/train_means_std.parquet.gzip")

In [67]:
df_norm = (df-df_mean_std["means"])/df_mean_std["stds"]

In [68]:
df_mean_std["stds"][11305]

0.013536716843997987

In [35]:
df[11305]

0.0

In [36]:
df_norm[11305]

-0.15821440220480487

In [37]:
df_norm.fillna(value=0, inplace=True)
x_inp = torch.tensor(df_norm.values, dtype=torch.float32)

In [38]:
x_inp[11305]

tensor(-0.1582)

In [39]:
with open("data/rand_proj.pt", "rb") as f:
    rand_proj = torch.load(f)


In [40]:
torch.argmin(x_inp)

tensor(9283)

In [41]:
x_proj = x_inp @ rand_proj

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x11342 and 11333x2048)

In [42]:
x_proj

NameError: name 'x_proj' is not defined

In [60]:
rand_proj

tensor([[ 0.0118,  0.0084, -0.0104,  ...,  0.0082, -0.0079, -0.0010],
        [-0.0092,  0.0034, -0.0022,  ..., -0.0015, -0.0187,  0.0070],
        [ 0.0083,  0.0114, -0.0076,  ...,  0.0016, -0.0196,  0.0105],
        ...,
        [ 0.0082,  0.0071,  0.0028,  ...,  0.0110, -0.0117,  0.0114],
        [ 0.0026, -0.0003,  0.0116,  ..., -0.0170,  0.0168,  0.0071],
        [-0.0044, -0.0087,  0.0202,  ..., -0.0058, -0.0088, -0.0183]])

In [81]:
dev_labels.iloc[27]

Mean_BMI                 NaN
Under5_Mortality_Rate    0.0
Name: (AL, 1076), dtype: float64