In [None]:
import numpy as np
import os
import pandas as pd
import tiktoken
import torch
from torch.utils.data import DataLoader, Dataset

## Import dataset

In [None]:
# Directory (relative to the current working directory) that holds the labeled CSV
# read below and that receives the train/validation/test split files.
data_dir = '../../data'

In [None]:
# Read the labeled reports CSV from the data directory into a DataFrame.
labeled_csv_path = os.path.join(data_dir, "google_gemma-2-2b-it_model_labeled_20240925.csv")
df = pd.read_csv(labeled_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the column names available in the loaded frame.
df.columns

In [None]:
# Columns kept from the loaded frame: the identifier and report-text columns,
# followed by the five abnormality label columns.
cols = (
    ['case_identifier', 'findings', 'conclusions_and_recommendations']
    + ['pulmonary edema', 'consolidation', 'pleural effusion', 'pneumothorax', 'cardiomegaly']
)

In [None]:
# Keep only the selected columns (raises KeyError if any of them is missing).
df = df.loc[:, cols]

In [None]:
# Preview the frame again after the column selection.
df.head()

## Get dataset statistics

In [None]:
# Names of the label columns, one per abnormality.
abnormalities = [
    'pulmonary edema',
    'consolidation',
    'pleural effusion',
    'pneumothorax',
    'cardiomegaly',
]

In [None]:
# Coerce the label columns to integers the same way RadiologyDataset does
# (unparseable or missing entries count as 0), so these statistics describe
# the labels that are actually used for training rather than the raw cells.
label_frame = df[abnormalities].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Calculate the sum (count) for each abnormality
abnormality_counts = label_frame.sum()

# Display the results
print(abnormality_counts)

# Calculate the percentage of reports with each abnormality
total_reports = len(df)
abnormality_percentages = (abnormality_counts / total_reports) * 100

# Display the percentages
print("\nPercentage of reports with each abnormality:")
print(abnormality_percentages)

## Create data loaders

- Note that the report texts have different lengths; if we want to combine multiple training examples in a batch, we have to either
  1. truncate all texts to the length of the shortest text in the dataset or batch
  2. pad all texts to the length of the longest text in the dataset or batch

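- We choose option 2 and pad all texts to the length of the longest text in the training set (validation and test texts that are longer than that are truncated to the same length)
- For that, we use `<|endoftext|>` as a padding token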

In [None]:
def random_split(df, train_frac, validation_frac, random_state=123):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Args:
        df: DataFrame to split. It is not modified; a shuffled copy is used.
        train_frac: Fraction of rows (between 0 and 1) for the training partition.
        validation_frac: Fraction of rows (between 0 and 1) for the validation
            partition.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.
            Defaults to 123, the value that was previously hard-coded.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``. The train and validation
        sizes are ``int(len(df) * frac)`` (rounded down); the test partition
        receives every remaining row.

    Raises:
        ValueError: If a fraction lies outside [0, 1] or the two fractions
            add up to more than 1.
    """
    for name, frac in (("train_frac", train_frac), ("validation_frac", validation_frac)):
        if not 0 <= frac <= 1:
            raise ValueError(f"{name} must be between 0 and 1, got {frac}")
    # Small tolerance so sums such as 0.9 + 0.1 are not rejected by float rounding
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac + validation_frac must not exceed 1, got "
            f"{train_frac} + {validation_frac}"
        )

    # Shuffle the entire DataFrame
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate split indices
    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    validation_end = train_end + int(n_rows * validation_frac)

    # Split the DataFrame by position
    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]

    return train_df, validation_df, test_df

# 70% train and 10% validation; the test split is the remainder (about 20%).
train_df, validation_df, test_df = random_split(df, 0.7, 0.1)

# Write each split to its own CSV in the data directory.
for split_filename, split_frame in (
    ("train.csv", train_df),
    ("validation.csv", validation_df),
    ("test.csv", test_df),
):
    split_frame.to_csv(os.path.join(data_dir, split_filename), index=None)

In [None]:
# Load tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Print the token id(s) of the end-of-text marker, which this notebook uses as the padding token.
end_of_text = "<|endoftext|>"
print(tokenizer.encode(end_of_text, allowed_special={end_of_text}))

In [None]:
class RadiologyDataset(Dataset):
    """Map-style dataset of tokenized report text with multi-label targets.

    Each item is a pair ``(token_ids, labels)``: ``token_ids`` is a
    ``torch.long`` tensor of length ``max_length`` and ``labels`` is a
    ``torch.float`` tensor with one entry per name in ``self.abnormalities``.

    Args:
        csv_file: Path of a CSV file with a ``conclusions_and_recommendations``
            text column and one column per abnormality.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            token ids.
        max_length: Fixed sequence length. Longer sequences are truncated and
            shorter ones padded. ``None`` uses the longest encoded text found
            in this CSV.
        pad_token_id: Token id appended as padding.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.abnormalities = ['pulmonary edema', 'consolidation', 'pleural effusion', 'pneumothorax', 'cardiomegaly']

        # Convert abnormality columns to numeric type; unparseable or missing values become 0
        for abnormality in self.abnormalities:
            self.data[abnormality] = pd.to_numeric(self.data[abnormality], errors='coerce').fillna(0).astype(int)

        # Cache the label matrix once so __getitem__ avoids a DataFrame lookup per item
        self.labels = self.data[self.abnormalities].to_numpy(dtype=np.float32)

        # read_csv turns empty cells into NaN (and numeric-looking cells into
        # numbers); the tokenizer needs strings, so normalise before encoding.
        texts = [
            "" if pd.isna(text) else str(text)
            for text in self.data["conclusions_and_recommendations"]
        ]

        # Pre-tokenize texts
        self.encoded_texts = [tokenizer.encode(text) for text in texts]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad every sequence up to self.max_length
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, labels)`` tensors for the example at ``index``."""
        return (
            torch.tensor(self.encoded_texts[index], dtype=torch.long),
            torch.tensor(self.labels[index], dtype=torch.float)
        )

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text, or 0 if there are none."""
        return max((len(encoded_text) for encoded_text in self.encoded_texts), default=0)

In [None]:
# Build the training set; max_length=None lets the dataset take its sequence
# length from the longest encoded text in train.csv.
train_csv_path = os.path.join(data_dir, "train.csv")
train_dataset = RadiologyDataset(train_csv_path, tokenizer, max_length=None)

# Sequence length that every training example has been padded to
print(train_dataset.max_length)

In [None]:
# Validation and test sets reuse the training sequence length, so their texts
# are truncated or padded to exactly train_dataset.max_length tokens.
val_dataset, test_dataset = (
    RadiologyDataset(
        csv_file=os.path.join(data_dir, split_filename),
        max_length=train_dataset.max_length,
        tokenizer=tokenizer,
    )
    for split_filename in ("validation.csv", "test.csv")
)

In [None]:
num_workers = 0
batch_size = 8

# Fix torch's global seed before the loaders are built
torch.manual_seed(123)

# Settings shared by all three loaders
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)

# Only the training loader shuffles and drops a trailing incomplete batch
train_loader = DataLoader(dataset=train_dataset, shuffle=True, drop_last=True, **loader_kwargs)

# Evaluation loaders keep every example, including a smaller final batch
val_loader = DataLoader(dataset=val_dataset, drop_last=False, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, drop_last=False, **loader_kwargs)

- As a verification step, we iterate through the training data loader and check the batch dimensions: each batch should contain 8 training examples, and each example should consist of `train_dataset.max_length` tokens (525 for this dataset)

In [None]:
print("Train loader:")

# Walk the whole loader once so every batch is collated and its shape checked.
# The names start as None so an empty loader is reported instead of raising
# NameError at the prints below.
input_batch = target_batch = None
for input_batch, target_batch in train_loader:
    # train_loader drops the last incomplete batch, so every batch is full-sized
    assert input_batch.shape == (batch_size, train_dataset.max_length), (
        f"unexpected input batch shape {tuple(input_batch.shape)}"
    )
    assert target_batch.shape[0] == input_batch.shape[0], (
        "inputs and labels disagree on batch size"
    )

if input_batch is None:
    print("Train loader yielded no batches")
else:
    # Shapes of the last batch seen
    print("Input batch dimensions:", input_batch.shape)
    print("Label batch dimensions:", target_batch.shape)

- Lastly, let's print the total number of batches in each data loader

In [None]:
# Report how many batches each loader yields per pass.
for split_label, loader in (
    ("training", train_loader),
    ("validation", val_loader),
    ("test", test_loader),
):
    print(f"{len(loader)} {split_label} batches")

# Define GPT model