In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [2]:
file_path = './metadata_info/MR_data.csv'
df = pd.read_csv(file_path)
df = df.fillna(0)

training_df = df[df['Label'] != 'Unknown'][['Label', 'EchoTime', 'RepetitionTime', 'InversionTime', 'Rows', 'Columns', 'Image Plane',
                                            'Manufacturer', 'PixelSpacing', 'FlipAngle', 'SliceThickness']]

training_df = training_df[df['Label'] != 'Localizer']



  df = pd.read_csv(file_path)
  training_df = training_df[df['Label'] != 'Localizer']


In [3]:
#training_df['Text'] = training_df.apply(lambda row: f'The Echo Time is {num2words(round(row["EchoTime"], 2))} ms. The Repetition Time is {num2words(round(row["RepetitionTime"], 2))} ms.', axis=1)

training_df['Text'] = training_df.apply(lambda row: f'The Echo Time is {round(row["EchoTime"], 2)} ms. The Repetition Time is {round(row["RepetitionTime"], 2)} ms.', axis=1)


In [4]:
unique_labels = training_df['Label'].unique()
label_mapping = {label: i for i, label in enumerate(unique_labels)}
training_df['Label_ID'] = training_df['Label'].replace(label_mapping)

print(label_mapping)

texts = training_df.Text.values
labels = training_df.Label_ID.values

{'T1': 0, 'T2': 1, 'DWI': 2, 'ADC': 3, 'T1 CE': 4, 'T2 FLAIR': 5, 'T2 FS': 6, 'DTI': 7, 'PD': 8}


In [5]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

# Set the seed for reproducibility
torch.manual_seed(42)

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=9)  # Adjust `num_labels` based on your task

# Tokenize the texts and convert labels to torch tensors
encoded_inputs = tokenizer(texts.tolist(), truncation=True, padding=True, return_tensors="pt")
labels = torch.tensor(labels)

# Create a custom PyTorch dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
    
    def __len__(self):
        return len(self.labels)

dataset = CustomDataset(encoded_inputs, labels)

# Split the dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Define the data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 5  # Adjust the number of epochs as needed

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        #print(labels.shape)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    train_loss /= len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            val_loss += loss.item()
            
            predicted_labels = torch.argmax(logits, dim=1)
            val_correct += (predicted_labels == labels).sum().item()
    
    val_loss /= len(val_loader)
    val_accuracy = val_correct / len(val_dataset)
    
    # Print training and validation metrics
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/5
Train Loss: 1.2165
Validation Loss: 0.7977
Validation Accuracy: 0.7296

Epoch 2/5
Train Loss: 0.6986
Validation Loss: 0.5676
Validation Accuracy: 0.8100

Epoch 3/5
Train Loss: 0.5448
Validation Loss: 0.4718
Validation Accuracy: 0.8309

Epoch 4/5
Train Loss: 0.4560
Validation Loss: 0.4514
Validation Accuracy: 0.8434

Epoch 5/5
Train Loss: 0.3825
Validation Loss: 0.4529
Validation Accuracy: 0.8549



In [7]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

Downloading vocab.txt: 0.00B [00:00, ?B/s]

In [8]:
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  index = random.randint(0, len(texts)-1)
  table = np.array([tokenizer.tokenize(texts[index]), 
                    tokenizer.convert_tokens_to_ids(tokenizer.tokenize(texts[index]))]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒════════════╤═════════════╕
│ Tokens     │   Token IDs │
╞════════════╪═════════════╡
│ the        │        1996 │
├────────────┼─────────────┤
│ echo       │        9052 │
├────────────┼─────────────┤
│ time       │        2051 │
├────────────┼─────────────┤
│ is         │        2003 │
├────────────┼─────────────┤
│ 107        │       10550 │
├────────────┼─────────────┤
│ .          │        1012 │
├────────────┼─────────────┤
│ 0          │        1014 │
├────────────┼─────────────┤
│ ms         │        5796 │
├────────────┼─────────────┤
│ .          │        1012 │
├────────────┼─────────────┤
│ the        │        1996 │
├────────────┼─────────────┤
│ repetition │       23318 │
├────────────┼─────────────┤
│ time       │        2051 │
├────────────┼─────────────┤
│ is         │        2003 │
├────────────┼─────────────┤
│ 1300       │       19527 │
├────────────┼─────────────┤
│ .          │        1012 │
├────────────┼─────────────┤
│ 0          │        1014 │
├────────────┼