In [1]:
import sys
# Put the parent directory at the front of the module search path so imports
# are resolved there first. Presumably this is where the project packages
# imported later in the notebook (common, model, dataLoader) live -- confirm.
# The path is relative, so it depends on the kernel's working directory.
sys.path.insert(0, '../')

In [2]:
pip install pytorch-pretrained-bert

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [3]:
from common.common import create_folder
from common.pytorch import load_model
import pytorch_pretrained_bert as Bert
from model.utils import age_vocab
from common.common import load_obj
from dataLoader.NumericalTrain import NTLoader
from torch.utils.data import DataLoader
import pandas as pd
from model.NT import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from model.optimiser import adam
import sklearn.metrics as skm
import numpy as np
import torch
import time
import torch.nn as nn
import os

In [4]:
class BertConfig(Bert.modeling.BertConfig):
    """BERT configuration populated from a plain settings mapping.

    Forwards the standard BERT hyper-parameters to the parent
    ``Bert.modeling.BertConfig`` and then attaches two extra attributes,
    ``seg_vocab_size`` and ``age_vocab_size``, read from the same mapping.

    Args:
        config: mapping supporting ``[]`` and ``.get``. ``'hidden_size'`` is
            looked up with ``[]`` and therefore must be present; every other
            key is read with ``.get`` and is passed on as ``None`` when absent.

    Raises:
        KeyError: if ``'hidden_size'`` is missing from ``config``.
    """

    def __init__(self, config):
        parent_kwargs = {
            'vocab_size_or_config_json_file': config.get('vocab_size'),
            'hidden_size': config['hidden_size'],
            'num_hidden_layers': config.get('num_hidden_layers'),
            'num_attention_heads': config.get('num_attention_heads'),
            'intermediate_size': config.get('intermediate_size'),
            'hidden_act': config.get('hidden_act'),
            'hidden_dropout_prob': config.get('hidden_dropout_prob'),
            'attention_probs_dropout_prob': config.get('attention_probs_dropout_prob'),
            # NOTE(review): the lookup key is singular ('max_position_embedding')
            # while the parent argument is plural -- confirm that the dict
            # passed in uses the singular spelling, otherwise this is None.
            'max_position_embeddings': config.get('max_position_embedding'),
            'initializer_range': config.get('initializer_range'),
        }
        super().__init__(**parent_kwargs)

        # Extra vocabulary sizes that are not part of the parent constructor.
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')
        
class TrainConfig(object):
    """Attribute-style view over a training settings mapping.

    Each known setting is copied from ``config`` onto the instance under the
    same name; a setting that is absent from ``config`` becomes ``None``.

    Args:
        config: mapping supporting ``.get`` (e.g. a dict of training options).
    """

    def __init__(self, config):
        # Listed in the order the attributes are created on the instance.
        setting_names = (
            'batch_size',
            'use_cuda',
            'max_len_seq',
            'train_loader_workers',
            'test_loader_workers',
            'device',
            'output_dir',
            'output_name',
            'best_name',
        )
        for setting in setting_names:
            setattr(self, setting, config.get(setting))

In [5]:
# File locations used by this notebook.
# NOTE(review): 'model_name' ends in '.csv', which is unusual for a saved
# model -- confirm the intended extension.
file_config = dict(
    data='',                      # formatted data
    model_path='model',           # where to save the model
    model_name='SepsisBERT.csv',  # model name
    file_name='',                 # log path
)

# create_folder is a project helper; presumably it makes sure the model
# directory exists before anything is saved into it -- confirm.
create_folder(file_config['model_path'])

In [6]:
# Notebook-wide settings; 'max_seq_len' is reused below for the training config.
global_params = dict(
    max_seq_len=280,
    max_age=110,
    month=1,
    age_symbol=None,
    min_visit=5,
    gradient_accumulation_steps=1,
)

# Optimiser settings.
optim_param = dict(
    lr=3e-5,
    warmup_proportion=0.1,
    weight_decay=0.01,
)

# Training/runtime settings; configured for CPU execution.
train_params = dict(
    batch_size=256,
    use_cuda=False,
    max_len_seq=global_params['max_seq_len'],
    device='cpu',
)

In [7]:
# Load the preprocessed dataset (path is relative to the notebook's folder).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, i.e. the CSV
# carries a saved index; consider index_col=0 if nothing downstream needs it.
data = pd.read_csv(filepath_or_buffer='../preprocess/final_dataset/final_dataset.csv')

In [8]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,HR,Temp,O2Sat,Resp,SepsisLabel,PatientID
0,"{6: 101.0, 10: 87.0, 14: 81.0, 18: 87.0, 22: 9...","{6: 36.6, 10: 36.0, 14: 36.0, 18: 36.7, 22: 36...","{6: 100.0, 10: 100.0, 14: 100.0, 18: 100.0, 22...","{6: 20.0, 10: 20.0, 14: 20.0, 18: 20.0, 22: 18...","{6: 0, 10: 0, 14: 0, 18: 0, 22: 0, 26: 0, 30: ...",p116812
1,"{0: 84.5, 1: 87.0, 2: 88.0, 3: 87.0, 4: 92.0, ...","{0: 34.3, 1: 34.6, 2: 35.5, 3: 36.2, 4: 36.65,...","{0: 96.0, 1: 96.0, 2: 95.0, 3: 98.0, 4: 98.5, ...","{0: 30.0, 1: 18.0, 2: 19.0, 3: 18.0, 4: 18.0, ...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...",p109932
2,"{0: 80.0, 1: 76.0, 2: 80.0, 3: 78.0, 4: 74.0, ...","{0: 36.5, 1: 36.25, 2: 36.25, 3: 36.1, 4: 36.0...","{0: 100.0, 1: 100.0, 2: 100.0, 3: 100.0, 4: 10...","{0: 13.5, 1: 12.0, 2: 12.0, 3: 12.0, 4: 12.5, ...","{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: ...",p014977
3,"{4: 83.0, 8: 81.0, 13: 85.0, 17: 92.0, 22: 87....","{4: 36.11, 8: 35.83, 13: 36.0, 17: 37.28, 22: ...","{4: 99.0, 8: 96.0, 13: 97.0, 17: 93.0, 22: 97....","{4: 16.0, 8: 17.0, 13: 19.0, 17: 17.0, 22: 15....","{4: 0, 8: 0, 13: 0, 17: 0, 22: 0, 28: 0, 39: 0}",p000902
4,"{0: 102.5, 5: 100.0, 8: 102.0, 17: 108.0, 18: ...","{0: 36.89, 5: 37.06, 8: 36.17, 17: 36.17, 18: ...","{0: 96.0, 5: 98.0, 8: 95.0, 17: 89.0, 18: 100....","{0: 16.5, 5: 14.5, 8: 16.0, 17: 18.0, 18: 15.0...","{0: 0, 5: 0, 8: 0, 17: 0, 18: 0, 20: 0, 25: 0,...",p009098


In [9]:
# Read the stored test-set entries and drop the '.psv' text from each one so
# they can be compared against the 'PatientID' column.
# NOTE(review): allow_pickle=True unpickles objects from the file; only load
# .npy files from a trusted source.
test_patient_ids = [
    file_name.replace('.psv', '')
    for file_name in np.load("../preprocess/final_dataset/test_set.npy", allow_pickle=True)
]

In [10]:
# Split by patient rather than by random row: rows whose PatientID is listed
# in test_patient_ids form the test set, all remaining rows form the training set.
# (Disabled alternative: train_test_split(data, test_size=0.2, random_state=42))
in_test_set = data['PatientID'].isin(test_patient_ids)

test_df = data[in_test_set]
train_df = data[~in_test_set]

In [11]:
# Training data: wrap the training frame in an NTLoader dataset and batch it.
Dset = NTLoader(train_df, max_len=global_params['max_seq_len'])
trainload = DataLoader(
    Dset,
    batch_size=train_params['batch_size'],
    shuffle=True,
    num_workers=3,
)

# Test data: same construction over the held-out frame.
# NOTE(review): 'Dset' is rebound here, so after this cell it refers to the
# test dataset only. The test loader also shuffles, which makes the batch
# order differ between runs -- confirm whether shuffle=False is intended.
Dset = NTLoader(test_df, max_len=global_params['max_seq_len'])
testload = DataLoader(
    Dset,
    batch_size=train_params['batch_size'],
    shuffle=True,
    num_workers=3,
)

In [12]:
# Sanity check: print the tensor shapes of the first few test batches.
# Because testload is created with shuffle=True, the batches seen here can
# differ from run to run.
num_batches_to_inspect = 3
for i, (features, labels) in enumerate(testload):
    if i < num_batches_to_inspect:
        print(f"Batch {i+1}")
        print("Features shape:", features.shape)
        print("Labels shape:", labels.shape)
    else:
        # Enough batches inspected; stop iterating the loader.
        break

Original label sequence: {2: 0, 5: 0, 6: 0, 7: 0, 8: 0, 10: 0, 11: 0, 15: 1}
Processed label sequence before tensor conversion: [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Original label sequence: {3: 0,