In [23]:
import os
import pandas as pd
import torch
from prediction_utils.pytorch_utils.models import TorchModel
from prediction_utils.pytorch_utils.datasets import ArrayLoaderGenerator

In [24]:
# Root directory of the cohort extract; every other path is derived from it.
data_path = '/share/pi/nigam/projects/spfohl/cohorts/admissions/starr_20200523/'

# Parquet file holding the cohort table.
cohort_path = os.path.join(
    data_path,
    'cohort',
    'cohort.parquet',
)

# Parquet file holding the stored model outputs for one experiment run.
predictions_path = os.path.join(
    data_path,
    'experiments',
    'baseline_tuning_fold_1_10',
    'performance',
    'hospital_mortality',
    '0.yaml',
    '1',
    'output_df.parquet',
)

In [27]:
# Load both tables into memory; the paths come from the configuration cell.
cohort_df = pd.read_parquet(path=cohort_path)
pred_df = pd.read_parquet(path=predictions_path)

In [9]:
# Display the loaded predictions table (pandas truncates the rendered rows).
pred_df

Unnamed: 0,phase,outputs,pred_probs,labels,row_id
0,val,-1.094916,0.079028,0,16
1,val,-2.456556,0.003679,0,21
2,val,-3.585434,0.000268,0,24
3,val,-0.885466,0.112826,0,35
4,val,-1.538378,0.027455,0,37
...,...,...,...,...,...
37737,test,-3.919549,0.000137,0,198593
37738,test,-2.550915,0.002893,0,198620
37739,test,-2.312639,0.005135,0,198626
37740,test,-2.023413,0.011006,0,198636


In [21]:
class LinearLayer1D(torch.nn.Module):
    """
    A single affine layer with an optional element-wise log on its input.

    Args:
        in_features: input width passed to ``torch.nn.Linear``.
        out_features: output width passed to ``torch.nn.Linear``.
        apply_log_transform: when True, ``forward`` applies ``torch.log`` to
            its input before the affine map.
    """

    def __init__(self, in_features, out_features, apply_log_transform=False):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)
        self.apply_log_transform = apply_log_transform

    def forward(self, x):
        """
        Apply the (optional) log transform and then the affine layer.

        A 1-D input is reshaped to a column (one feature per row) before it
        reaches the linear layer; inputs of any other rank are passed as-is.
        """
        features = torch.log(x) if self.apply_log_transform else x
        # Promote a flat vector of N values to shape (N, 1).
        if features.dim() == 1:
            features = features.unsqueeze(1)
        return self.linear(features)
    
class LinearModel1D(TorchModel):
    """
    TorchModel wrapper whose network is a single ``LinearLayer1D``.
    """

    def get_default_config(self):
        """
        Return the parent's default config with this model's defaults merged in.

        Keys set here take precedence over identically named keys coming
        from the parent class.
        """
        config_dict = super().get_default_config()
        update_dict = {
            "input_dim": 1,
            "output_dim": 2,
            "sparse": False,  # not read in this class; presumably consumed by TorchModel -- TODO confirm
            "apply_log_transform": True
        }

        return {**config_dict, **update_dict}

    def init_model(self):
        """
        Build the ``LinearLayer1D`` described by ``self.config_dict``.

        Returns:
            The constructed ``LinearLayer1D`` module.
        """
        model = LinearLayer1D(
            in_features=self.config_dict['input_dim'],
            out_features=self.config_dict['output_dim'],
            apply_log_transform=self.config_dict['apply_log_transform']
        )
        # The module was previously built and dropped (implicit None return);
        # hand it back so the caller can actually use it.
        # NOTE(review): TorchModel presumably stores this return value -- confirm.
        return model

In [28]:
# Display-only: expose the 0..N-1 row position as a `row_id_pred_probs`
# column. The result is not assigned, so `cohort_df` itself is unchanged.
(
    cohort_df
    .reset_index(drop=True)
    .rename_axis('row_id_pred_probs')
    .reset_index()
)

Unnamed: 0,row_id_pred_probs,person_id,admit_date,discharge_date,hospital_mortality,month_mortality,LOS_days,LOS_7,readmission_30,age_in_years,age_group,race_eth,gender_concept_name,prediction_id,fold_id
0,0,29935972,2012-04-13,2012-04-14,0,0,1,0,0,90,[75-91),White,FEMALE,-4914281712421165508,3
1,1,29954767,2014-02-07,2014-02-15,0,0,8,1,0,90,[75-91),White,MALE,6734749193582821703,8
2,2,29973672,2010-02-20,2010-02-23,0,0,3,0,1,90,[75-91),Other,FEMALE,-8953631066410342509,test
3,3,29977637,2011-10-30,2011-11-02,0,0,3,0,0,90,[75-91),White,FEMALE,4205901604256989764,4
4,4,29977678,2013-06-09,2013-06-13,0,0,4,0,0,90,[75-91),White,FEMALE,-6822238745805029022,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198639,198639,43763197,2020-01-10,2020-01-12,0,0,2,0,0,89,[75-91),White,MALE,-1507086093933441997,9
198640,198640,43763272,2020-01-24,2020-01-26,0,0,2,0,0,89,[75-91),Asian,MALE,-938368361124472352,1
198641,198641,43996048,2009-08-26,2009-08-30,0,0,4,0,0,89,[75-91),White,FEMALE,8994658897675187992,1
198642,198642,44247408,2020-03-25,2020-03-31,0,0,6,0,0,89,[75-91),Other,FEMALE,-3006987821701518356,9


In [17]:
# NOTE(review): `train_df` is assigned in a later cell of this notebook
# (the `pred_df.query('phase == "val"')` cell), so this display cell only
# works after that cell has run; on a fresh top-to-bottom run it would
# raise NameError. Consider moving it below the assignment.
train_df

Unnamed: 0,phase,outputs,pred_probs,labels,row_id
0,val,-1.094916,0.079028,0,16
1,val,-2.456556,0.003679,0,21
2,val,-3.585434,0.000268,0,24
3,val,-0.885466,0.112826,0,35
4,val,-1.538378,0.027455,0,37
...,...,...,...,...,...
17873,val,-1.423776,0.037314,0,198601
17874,val,-1.970004,0.011250,0,198627
17875,val,-3.218438,0.000618,0,198629
17876,val,-2.217583,0.006336,0,198635


In [19]:
# Split the stored predictions by phase.
# NOTE(review): the names look reversed on purpose -- the layer is fed the
# "val" phase predictions, while `val_df` selects the "train" phase. Confirm
# this is intended and that `pred_df` actually contains "train" rows.
train_df = pred_df.query('phase == "val"')
val_df = pred_df.query('phase == "train"')

layer = LinearLayer1D(1, 2, apply_log_transform=True)

# Convert through NumPy rather than passing the Series to the legacy
# `torch.FloatTensor` constructor: a Series is treated as a generic sequence
# there, which relies on positional integer lookups and can fail once the
# index no longer starts at 0 (e.g. after filtering).
train_probs = torch.tensor(train_df.pred_probs.to_numpy(), dtype=torch.float32)
pred = layer(train_probs)

tensor([[0.2870, 0.2976],
        [0.9336, 1.8193],
        [1.4854, 3.1182],
        ...,
        [1.3097, 2.7045],
        [0.8190, 1.5496],
        [0.4075, 0.5812]], grad_fn=<AddmmBackward>)

In [53]:
# class LogProbModel(LightningModule):

#     def __init__(self, needs_log=True):
#         super().__init__()
#         self.layer = torch.nn.Linear(1, 2, bias=True)
#         self.needs_log = needs_log

#     def forward(self, x):
#         return self.layer(x)

#     def training_step(self, batch, batch_idx):
#         x, y = batch
#         x = x.unsqueeze(1)
#         if self.needs_log:
#             x = torch.log(x)
#         y_hat = self.forward(x)
#         loss = F.cross_entropy(y_hat, y)
#         return {'loss': loss}


# train_df = pred_df.query('phase == "val"')
# val_df = pred_df.query('phase == "train"')
# train_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(train_df.pred_probs), torch.LongTensor(train_df.labels))
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
# val_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(val_df.pred_probs), torch.LongTensor(val_df.labels))
# val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

GPU available: False, used: False
No environment variable for node rank defined. Set as 0.

  | Name  | Type   | Params
-----------------------------
0 | layer | Linear | 4     


1