In [39]:
from transformers import AutoTokenizer, AutoModel, BertTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
import random
from torch.utils.data import DataLoader, Dataset, random_split
from pymongo import MongoClient


In [4]:
print(torch.__version__)
print(torch.cuda.is_available())
if torch.cuda.is_available():
   device = torch.device("cuda")
else:
   device = "cpu"

print(device)

2.5.1+cpu
False
cpu


In [5]:
client = MongoClient("mongodb://localhost:27017/")
db = client["MIMIC"]
readmitted = db["readmitted_concated"]
no_readmitted = db["no_readmitted_concated"]

In [6]:
records_admitted = list(readmitted.find({}))
records_noreadmited = list(no_readmitted.find({}))
print(len(records_admitted))
print(len(records_noreadmited))

3408
9035


In [7]:
def create_balaced_record(record1, record2, percent1, percent2):
    
    size1 = int(len(record1)* percent1)
    size2 = int(len(record2)*percent2)
    sample1 = random.sample(record1, size1)
    sample2 = random.sample(record2, size2)
    combined_list = sample1+sample2
    random.shuffle(combined_list)
    
    return combined_list

In [8]:
data = create_balaced_record(records_admitted, records_noreadmited, 1,0.2)


Clinical bert

In [30]:
model = AutoModelForSequenceClassification.from_pretrained("medicalai/ClinicalBERT", num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
base_model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
#Load and preprocess data
class admission(Dataset):
    def __init__(self, data, tokenizer, max_length=512, stride=25):
       self.data = data
       self.tokenizer = tokenizer
       self.max_length = max_length
       self.stride = stride
       
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        record = self.data[idx]
        text = record["concatenated_notes"]
        label = record["readmission"] 
        tokens = self.tokenizer(text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        
        # Return a dictionary containing tokens and label
        return {
            "input_ids": tokens['input_ids'].squeeze(0),  # Removing batch dimension
            "attention_mask": tokens['attention_mask'].squeeze(0),  # Removing batch dimension
            "label": label
        }

In [15]:
dataset = admission(data, tokenizer)

In [16]:
batch_size = 10
dataloader = DataLoader(dataset = dataset, batch_size=batch_size, shuffle=True)

In [60]:
#model define

class CustomModel(nn.Module):
    def __init__(self, input_size, base_model):
        super(CustomModel, self).__init__()
        self.model = base_model
        self.classifier = nn.Linear(self.model.config.hidden_size , 2)
    
    def forward(self, input_ids, attention_mask=None):
        outputs = self.model(input_ids, attention_mask= attention_mask)
        last_hidden_state = outputs.last_hidden_state
        #pooled_output = outputs.pooler_output
        cls_token_output = last_hidden_state[:, 0, :]
        return self.classifier(cls_token_output)

In [61]:
custom_model = CustomModel(input_size=512, base_model=base_model)

In [62]:
#loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

In [63]:
train_size = int(0.8*len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [64]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [65]:
#Training the model
def training(model, dataloader, taset, criterion, optimizer, epochs=20):
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        for batch in train_dataloader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["label"]

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_dataloader)}")


In [66]:
# Evaluation loop
model.eval() 
correct = 0
total = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        # Forward pass
        outputs = custom_model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.3452


In [67]:
#save

torch.save(model, "00_34_bert_custom")

In [1]:

#This is a new command line