In [None]:
from google.colab import files
uploaded = files.upload()


Saving arabic_medical_reports_100.csv to arabic_medical_reports_100.csv


In [None]:
import pandas as pd

df = pd.read_csv("arabic_medical_reports_100.csv")
df.head()


Unnamed: 0,id,text,summary,labels
1,المريض يعاني من صداع شديد و دوخة بعد الوقوف ال...,صداع و دوخة,صداع,دوخة
2,المريض يشكو من ألم في الصدر و خفقان شديد في ال...,ألم صدر و خفقان,ألم الصدر,خفقان
3,المريض يعاني من سعال جاف و ارتفاع في درجة الحر...,سعال و حمى,سعال,حمى
4,المريض يعاني من طفح جلدي و حكة شديدة.,طفح جلدي و حكة,طفح جلدي,حكة
5,المريض يعاني من غثيان صباحي و قيء مستمر.,غثيان و قيء,غثيان,قيء


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split the labels by comma into a list
df['label_list'] = df['labels'].apply(lambda x: x.split(','))

#  MultiLabelBinarizer to create multi-hot encoding for labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['label_list'])

print("All label classes:", mlb.classes_)
print("Sample multi-hot label vector:", y[0])


All label classes: ['آلام البطن' 'ألم الصدر' 'إسهال' 'حرقان التبول' 'حكة' 'حمى' 'خفقان'
 'دوخة' 'سعال' 'صداع' 'ضغط الدم' 'ضيق التنفس' 'طفح جلدي' 'غثيان'
 'فقدان التوازن' 'قيء' 'كحة']
Sample multi-hot label vector: [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]


In [None]:
from transformers import AutoTokenizer

model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the text with padding and truncation
X = tokenizer(list(df['text']), padding=True, truncation=True, max_length=128, return_tensors="pt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class ArabicMedicalDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

dataset = ArabicMedicalDataset(X, y)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch

class MultiLabelClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(MultiLabelClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Take [CLS] token representation
        cls_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_output)
        return logits

model = MultiLabelClassifier(model_name, num_labels=len(mlb.classes_))


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [None]:
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = Adam(model.parameters(), lr=2e-5)
criterion = BCEWithLogitsLoss()


In [None]:
model.train()

for epoch in range(5):
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss/len(dataloader):.4f}")


Epoch 1 - Loss: 0.2255
Epoch 2 - Loss: 0.2192
Epoch 3 - Loss: 0.2090
Epoch 4 - Loss: 0.2060
Epoch 5 - Loss: 0.1950


In [None]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), y, test_size=0.2, random_state=42
)

# Tokenize both sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

# Create datasets
train_dataset = ArabicMedicalDataset(train_encodings, train_labels)
val_dataset = ArabicMedicalDataset(val_encodings, val_labels)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

num_epochs = 5
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask)
            logits = outputs.cpu().numpy()
            print("Logits range:", logits.min(), logits.max())
            preds = sigmoid(logits) >= 0.2
            all_preds.append(preds)
            all_labels.append(labels)

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    f1 = f1_score(all_labels, all_preds, average='micro')
    precision = precision_score(all_labels, all_preds, average='micro')
    recall = recall_score(all_labels, all_preds, average='micro')

    print(f"Epoch {epoch+1} — Train Loss: {avg_train_loss:.4f} | Val F1: {f1:.4f} | Val Precision: {precision:.4f} | Val Recall: {recall:.4f}")

    model.train()


Logits range: -2.5064442 0.85034347
Logits range: -2.463109 0.8740977
Epoch 1 — Train Loss: 0.1852 | Val F1: 0.7368 | Val Precision: 0.6087 | Val Recall: 0.9333
Logits range: -2.6750093 0.84568906
Logits range: -2.5272853 0.89577657
Epoch 2 — Train Loss: 0.1806 | Val F1: 0.8000 | Val Precision: 0.7000 | Val Recall: 0.9333
Logits range: -2.61692 0.8386087
Logits range: -2.5021293 0.95839334
Epoch 3 — Train Loss: 0.1760 | Val F1: 0.8485 | Val Precision: 0.7778 | Val Recall: 0.9333
Logits range: -2.5994868 0.8922781
Logits range: -2.5626516 1.1795735
Epoch 4 — Train Loss: 0.1703 | Val F1: 0.8000 | Val Precision: 0.7000 | Val Recall: 0.9333
Logits range: -2.7025204 1.1562017
Logits range: -2.5996935 1.2251763
Epoch 5 — Train Loss: 0.1646 | Val F1: 0.8235 | Val Precision: 0.7368 | Val Recall: 0.9333


In [None]:
model.eval()
with torch.no_grad():
    sample = val_dataset[0]
    input_ids = sample['input_ids'].unsqueeze(0).to(device)
    attention_mask = sample['attention_mask'].unsqueeze(0).to(device)
    output = model(input_ids, attention_mask)
    print("Raw logits:", output)
    print("Sigmoid probs:", torch.sigmoid(output))


Raw logits: tensor([[-2.0652, -1.8494, -1.6834, -1.8260, -1.7914, -1.1413, -1.9101, -1.9012,
         -1.7366, -1.3019, -1.9190, -2.0563, -1.5714, -2.0254, -1.7717, -2.1165,
         -1.6047]])
Sigmoid probs: tensor([[0.1125, 0.1359, 0.1566, 0.1387, 0.1429, 0.2421, 0.1290, 0.1300, 0.1497,
         0.2138, 0.1280, 0.1134, 0.1720, 0.1166, 0.1453, 0.1075, 0.1673]])


In [1]:
from google.colab import files
uploaded = files.upload()


Saving ner_dataset.txt to ner_dataset.txt


In [2]:
def read_ner_data(file_path):
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if current_sentence:
                    sentences.append(current_sentence)
                    labels.append(current_labels)
                    current_sentence = []
                    current_labels = []
            else:
                token, tag = line.split()
                current_sentence.append(token)
                current_labels.append(tag)

    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

# Use the exact filename you uploaded
sentences, tags = read_ner_data("ner_dataset.txt")
print(sentences[0])
print(tags[0])


['تعاني', 'المريضة', 'من', 'صداع', 'شديد', 'و', 'حمى', '.']
['O', 'O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'O', 'B-SYMPTOM', 'O']


In [3]:
from transformers import AutoTokenizer


model_checkpoint = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Unique tags
unique_tags = list(set(tag for sent in tags for tag in sent))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

print(tag2id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

{'O': 0, 'B-SYMPTOM': 1, 'I-MEDICATION': 2, 'I-SYMPTOM': 3, 'B-MEDICATION': 4}


In [4]:
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = []
    aligned_labels = []

    for words, tags in zip(sentences, labels):
        tokenized = tokenizer(words, is_split_into_words=True, truncation=True, padding='max_length', max_length=128)
        word_ids = tokenized.word_ids()

        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignored in loss calculation
            elif word_idx != previous_word_idx:
                label_ids.append(tag2id[tags[word_idx]])
            else:
                label_ids.append(tag2id[tags[word_idx]] if tags[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx

        tokenized_inputs.append(tokenized)
        aligned_labels.append(label_ids)

    return tokenized_inputs, aligned_labels

tokenized_data, label_ids = tokenize_and_align_labels(sentences, tags)

# Example:
print(tokenized_data[0].tokens())
print(label_ids[0])


['[CLS]', 'تعاني', 'المر', '##يض', '##ة', 'من', 'صداع', 'شديد', 'و', 'حمى', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

In [5]:
!pip install datasets transformers seqeval
from torch.utils.data import Dataset
import torch


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=3a4ed6b77757d0497e26f771d85db067f91cb46b9ff8a6e842c04bf2190d0538
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [7]:
from sklearn.model_selection import train_test_split

# Prepare zipped list
data = list(zip(input_encodings["input_ids"], input_encodings["attention_mask"], label_ids))

# Now split
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Unpack
train_input_ids, train_attention_masks, train_labels = zip(*train_data)
val_input_ids, val_attention_masks, val_labels = zip(*val_data)

# Rebuild encoding dicts
train_inputs = {"input_ids": list(train_input_ids), "attention_mask": list(train_attention_masks)}
val_inputs = {"input_ids": list(val_input_ids), "attention_mask": list(val_attention_masks)}


In [8]:
train_dataset = NERDataset(train_inputs, train_labels)
val_dataset = NERDataset(val_inputs, val_labels)


In [9]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)


In [22]:
label2id = {'O': 0, 'B-SYMPTOM': 1, 'I-SYMPTOM': 2}
id2label = {v: k for k, v in label2id.items()}

# Example label sequence:
labels = ['O', 'O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'O', 'B-SYMPTOM', 'O']

label_ids = [label2id[label] for label in labels]
# label_ids => [0, 0, 0, 1, 2, 0, 1, 0]



In [24]:
num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv2", num_labels=num_labels)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Step 1: Define your labels and mappings
labels = ['O', 'B-SYMPTOM', 'I-SYMPTOM']
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

num_labels = len(labels)  # 3 in this case

# Step 2: Example tokens and labels
tokens = ['تعاني', 'المريضة', 'من', 'صداع', 'شديد', 'و', 'حمى', '.']
ner_labels = ['O', 'O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'O', 'B-SYMPTOM', 'O']

# Step 3: Encode labels as IDs
label_ids = [label2id[label] for label in ner_labels]
print("Label IDs:", label_ids)

# Step 4: Load tokenizer and encode tokens (example)
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
encoding = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)

# Step 5: Initialize model with correct number of labels
model = AutoModelForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv2", num_labels=num_labels)
model.to(device)  # Make sure device is set (e.g., device = torch.device("cuda" if torch.cuda.is_available() else "cpu"))



Label IDs: [0, 0, 0, 1, 2, 0, 1, 0]


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [26]:
import torch
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # list of label id lists, one per sentence

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Return input_ids, attention_mask, and labels for the idx-th example
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item



# For demo
list_of_token_lists = [
    ['تعاني', 'المريضة', 'من', 'صداع', 'شديد', 'و', 'حمى', '.'],
    ['مثال', 'ثاني', 'لجملة']
]

list_of_labels = [
    [0, 0, 0, 1, 2, 0, 1, 0],      # Labels for first sentence
    [0, 0, 0]                      # Labels for second sentence
]

# Tokenize batch
encodings = tokenizer(list_of_token_lists, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")

# Pad labels manually to match input_ids shape
max_len = encodings.input_ids.shape[1]
padded_labels = []
for label_seq in list_of_labels:
    padded = label_seq + [-100]*(max_len - len(label_seq))  # -100 to ignore in loss
    padded_labels.append(padded)

# Create dataset
dataset = NERDataset(encodings, padded_labels)

# Create dataloader
loader = DataLoader(dataset, batch_size=2, shuffle=True)




In [27]:
from transformers import AutoModelForTokenClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForTokenClassification.from_pretrained("aubmindlab/bert-base-arabertv2", num_labels=num_labels)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.train()

epochs = 3

for epoch in range(epochs):
    total_loss = 0
    for batch in loader:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{epochs} — Loss: {avg_loss:.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/3 — Loss: 1.1451
Epoch 2/3 — Loss: 0.7579
Epoch 3/3 — Loss: 0.6654


In [30]:
model.save_pretrained("my_ner_model")
tokenizer.save_pretrained("my_ner_model")


('my_ner_model/tokenizer_config.json',
 'my_ner_model/special_tokens_map.json',
 'my_ner_model/vocab.txt',
 'my_ner_model/added_tokens.json',
 'my_ner_model/tokenizer.json')