# Medical Question Pair Classification

This project builds a sequence classification model that analyzes pairs of medical questions and determines their relationship. The aim is to classify whether two medical questions are related or not.

In [1]:
# Fetch the dataset archive with the Kaggle CLI (presumably needs Kaggle API credentials configured — verify before a fresh run).
!kaggle datasets download -d thedevastator/medical-question-pair-classification

Dataset URL: https://www.kaggle.com/datasets/thedevastator/medical-question-pair-classification
License(s): CC0-1.0
Downloading medical-question-pair-classification.zip to /content
  0% 0.00/173k [00:00<?, ?B/s]
100% 173k/173k [00:00<00:00, 93.0MB/s]


In [2]:
# Location of the archive written by the Kaggle download (the CLI output reports /content as the target directory).
data_path = "/content/medical-question-pair-classification.zip"

In [3]:
! unzip {data_path}

Archive:  /content/medical-question-pair-classification.zip
  inflating: train.csv               


In [4]:
import pandas as pd

# Load the extracted table; later cells read its question_1, question_2 and label columns.
data = pd.read_csv("train.csv")

In [5]:
# Hold out 20% of the rows, then divide that hold-out into validation and test.
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    data,
    test_size=0.2,
    random_state=1999,
    shuffle=True,
)

# test_size here is a fraction of temp_df, not of the full table:
# roughly 18% of all rows become validation and roughly 2% become test.
# NOTE(review): confirm such a small test split is intended.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.1,
    random_state=1999,
)

In [6]:
# Replace the shuffled original row labels with a fresh 0..n-1 index on every split.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [7]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused later to load the classification model.
model_name = "bert-base-uncased"
# Tokenizer matching that checkpoint (downloaded from the Hugging Face Hub on first use).
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
# Default sequence length used for padding/truncation: the tokenizer's declared maximum input length.
max_len = tokenizer.model_max_length

class MedPairDataset(Dataset):
    """Map-style dataset that tokenizes one medical question pair per item.

    Args:
        data: Table supporting ``len()`` and ``.iloc[idx]`` whose rows carry
            the ``question_1``, ``question_2`` and ``label`` fields.
        tokenizer: Hugging Face tokenizer. It is called with both questions
            so they are encoded together as a single paired sequence.
        max_len: Length every encoded pair is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len=max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of question pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            tensors of length ``max_len`` plus a scalar ``labels`` tensor.
        """
        row = self.data.iloc[idx]
        q1 = row['question_1']
        q2 = row['question_2']
        label = row['label']

        # Call the tokenizer directly with `padding=` / `truncation=`: they
        # replace the deprecated `encode_plus(..., pad_to_max_length=True,
        # truncation_strategy=...)` spelling and request the same encoding.
        encoded = self.tokenizer(
            q1,
            q2,
            add_special_tokens=True,
            truncation='longest_first',
            padding='max_length',
            max_length=self.max_len,
        )

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'token_type_ids': torch.tensor(encoded['token_type_ids']),
            # Class indices must be integer (long) tensors for the model's loss.
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [9]:
# Wrap each split in a MedPairDataset (built in train, test, validation order).
train_dataset, test_dataset, val_dataset = (
    MedPairDataset(frame, tokenizer) for frame in (train_df, test_df, val_df)
)

In [10]:
# Inspect the first encoded training example (note the zero padding up to max_len).
train_dataset[0]



{'input_ids': tensor([  101,  2339,  2003,  2026,  2558,  2668,  2601,  2829,  1010,  3030,
          2044,  1018, 17850,  2015,  1012,  2003,  2023,  1037,  2558,  1029,
           102,  2054,  2515,  2829, 12436, 24965, 11889,  2812,  1029,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [11]:
# Take the first item via the iterator protocol and keep it for the decoding check in the next cell.
z = next(iter(train_dataset))
z

{'input_ids': tensor([  101,  2339,  2003,  2026,  2558,  2668,  2601,  2829,  1010,  3030,
          2044,  1018, 17850,  2015,  1012,  2003,  2023,  1037,  2558,  1029,
           102,  2054,  2515,  2829, 12436, 24965, 11889,  2812,  1029,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [12]:
# Decode the ids back to text to confirm the "[CLS] q1 [SEP] q2 [SEP]" layout followed by [PAD] tokens.
tokenizer.decode(z['input_ids'])

'[CLS] why is my period blood dark brown, stopped after 4 hrs. is this a period? [SEP] what does brown vaginal discharge mean? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [13]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained encoder with a 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
from torch.utils.data import DataLoader

batch_size = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)


In [16]:
from transformers import get_linear_schedule_with_warmup

def validate(model, val_loader, device=None):
    """Evaluate ``model`` on ``val_loader`` and report mean loss and accuracy.

    Args:
        model: Module whose forward pass accepts ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels`` and returns
            an object exposing ``loss`` and ``logits``.
        val_loader: DataLoader yielding dict batches with those four keys.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a notebook-level ``device`` global.

    Returns:
        tuple ``(avg_val_loss, accuracy)``: the mean of the per-batch losses
        and the fraction of correct predictions over ``len(val_loader.dataset)``.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    if len(val_loader) == 0:
        raise ValueError("val_loader is empty; nothing to validate")

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_val_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
            total_val_loss += outputs.loss.item()

            _, preds = torch.max(outputs.logits, dim=1)
            # Accumulate as a plain int so the counter is never a device tensor.
            correct_predictions += (preds == labels).sum().item()

    avg_val_loss = total_val_loss / len(val_loader)
    accuracy = correct_predictions / len(val_loader.dataset)

    print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy}")
    return avg_val_loss, accuracy

# Length of the fine-tuning run: the scheduler is stepped once per batch.
epochs = 3
total_steps = len(train_loader) * epochs

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Linear learning-rate schedule across the whole run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Defined here, but the loop below back-propagates the loss returned by the model itself.
criterion = torch.nn.CrossEntropyLoss().to(device)

# Fine-tune, validating at the end of every epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_loader):
        # Move the whole batch (input_ids, attention_mask, token_type_ids, labels) to the device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        total_loss += loss.item()

        optimizer.step()
        scheduler.step()

        # Log every tenth step.
        if not step % 10:
            print(f"Epoch {epoch + 1}/{epochs} | Step {step}/{len(train_loader)} | Loss: {loss.item()}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} | Average Loss: {avg_loss}")

    # Check held-out performance for this epoch.
    validate(model, val_loader)




Epoch 1/3 | Step 0/77 | Loss: 0.8956274390220642
Epoch 1/3 | Step 10/77 | Loss: 0.6639115810394287
Epoch 1/3 | Step 20/77 | Loss: 0.6108710169792175
Epoch 1/3 | Step 30/77 | Loss: 0.5420915484428406
Epoch 1/3 | Step 40/77 | Loss: 0.5713611245155334
Epoch 1/3 | Step 50/77 | Loss: 0.6189216375350952
Epoch 1/3 | Step 60/77 | Loss: 0.40967053174972534
Epoch 1/3 | Step 70/77 | Loss: 0.7877730131149292
Epoch 1/3 | Average Loss: 0.5992301783778451
Validation Loss: 0.570234884818395, Accuracy: 0.6867030965391621
Epoch 2/3 | Step 0/77 | Loss: 0.6194705367088318
Epoch 2/3 | Step 10/77 | Loss: 0.5416010022163391
Epoch 2/3 | Step 20/77 | Loss: 0.3054751753807068
Epoch 2/3 | Step 30/77 | Loss: 0.5479623675346375
Epoch 2/3 | Step 40/77 | Loss: 0.40943193435668945
Epoch 2/3 | Step 50/77 | Loss: 0.6185083389282227
Epoch 2/3 | Step 60/77 | Loss: 0.4225403964519501
Epoch 2/3 | Step 70/77 | Loss: 0.6155210733413696
Epoch 2/3 | Average Loss: 0.48233215220562825
Validation Loss: 0.4656411020292176, Accurac

In [17]:
# Persist the fine-tuned model and the tokenizer files in separate directories under /content/saved_model.
model.save_pretrained('/content/saved_model/MedModel')
tokenizer.save_pretrained('/content/saved_model/MedTokenizer')


('/content/saved_model/MedTokenizer/tokenizer_config.json',
 '/content/saved_model/MedTokenizer/special_tokens_map.json',
 '/content/saved_model/MedTokenizer/vocab.txt',
 '/content/saved_model/MedTokenizer/added_tokens.json',
 '/content/saved_model/MedTokenizer/tokenizer.json')

In [44]:
import random

# Function to predict on a random sample from the test set
def predict_random_sample(model, tokenizer, test_dataset):
    """Print the true and predicted label for one randomly chosen question pair.

    Args:
        model: Sequence classifier whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` and returns an object
            exposing ``logits``.
        tokenizer: Hugging Face tokenizer used to encode the pair.
        test_dataset: Dataset exposing ``data`` (a table with ``question_1``,
            ``question_2`` and ``label`` columns) and ``max_len``, as
            ``MedPairDataset`` does. The row is read from this object, so the
            sampled index and the row always refer to the same data.

    Raises:
        ValueError: If ``test_dataset`` is empty.
    """
    if len(test_dataset) == 0:
        raise ValueError("test_dataset is empty; nothing to sample")

    # Choose a random index from the test set
    idx = random.randint(0, len(test_dataset) - 1)

    # Read the row from the dataset that was passed in, not from a notebook global
    row = test_dataset.data.iloc[idx]
    q1 = row['question_1']
    q2 = row['question_2']
    true_label = row['label']

    # Encode exactly as the dataset does; `padding='max_length'` replaces the
    # deprecated `pad_to_max_length=True`.
    inputs_encoded = tokenizer(
        q1,
        q2,
        truncation=True,
        max_length=test_dataset.max_len,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='pt',
    )

    # Run on whatever device the model already lives on
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    input_ids = inputs_encoded['input_ids'].to(target_device)
    attention_mask = inputs_encoded['attention_mask'].to(target_device)
    token_type_ids = inputs_encoded['token_type_ids'].to(target_device)

    # Perform prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # Get predicted label
    _, predicted = torch.max(outputs.logits, dim=1)
    predicted_label = predicted.item()

    # Print the results
    print(f"Question 1: {q1}")
    print(f"Question 2: {q2}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")

# Example usage: print one randomly chosen test pair with its true and predicted labels
predict_random_sample(model, tokenizer, test_dataset)


Question 1: I believe I have the Flu and I have been vomiting, green and yellow. I also have a fever. I am 27.?
Question 2: What could be causing additional symptoms like green and yellow vomiting when one has the flu?
True Label: 1
Predicted Label: 1
