In [None]:
import string
import numpy as np
import os
import xml.etree.ElementTree as ET
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from transformers import DataCollatorWithPadding
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
import torch.optim as optim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    text = root.find('TEXT').text.strip()
    tags = {tag.tag: tag.attrib['met'] for tag in root.find('TAGS')}
    return text, tags

xml_dir = 'part1'
data = []

for file_name in os.listdir(xml_dir):
    if file_name.endswith('.xml'):
        file_path = os.path.join(xml_dir, file_name)
        text, tags = parse_xml(file_path)
        filtered_tags = {key: tags[key] for key in ['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']}
        filtered_tags['text'] = text
        data.append(filtered_tags)

df = pd.DataFrame(data)

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation.replace('-', '')))
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df['clean_text'] = df['text'].apply(preprocess_text)

df['ABDOMINAL'] = df['ABDOMINAL'].apply(lambda x: 1 if x == 'met' else 0)
df['CREATININE'] = df['CREATININE'].apply(lambda x: 1 if x == 'met' else 0)
df['MAJOR-DIABETES'] = df['MAJOR-DIABETES'].apply(lambda x: 1 if x == 'met' else 0)

labels = df[['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']].values
texts = df['clean_text'].tolist()

tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

class MedicalDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def train_and_evaluate(train_loader, val_loader, model, optimizer, scheduler, device, epochs=15, accumulation_steps=4):
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()  # Move optimizer.zero_grad() outside the batch loop

        for step, batch in enumerate(train_loader):
            # Move the batch to the device
            labels = batch.pop('labels').to(device)
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss = loss / accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss}")

    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            labels = batch.pop('labels').to(device)
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch, labels=labels)
            logits = outputs.logits

            predictions.append(logits.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    pred_labels = (predictions > 0.5).astype(int)

    return classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']), multilabel_confusion_matrix(true_labels, pred_labels)

model = BertForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=3)
optimizer = optim.AdamW(model.parameters(), lr=1e-4, eps=1e-8)
device = torch.device('cuda')

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

y = np.array([np.argmax(label) for label in labels])

fold_results = []

for fold, (train_val_index, test_index) in enumerate(kf.split(texts, y)):
    print(f"Fold {fold + 1}")

    train_val_texts = [texts[i] for i in train_val_index]
    test_texts = [texts[i] for i in test_index]
    train_val_labels = labels[train_val_index]
    test_labels = labels[test_index]

    train_texts, val_texts, train_labels, val_labels = train_test_split(train_val_texts, train_val_labels, test_size=0.1111, random_state=42)  # 0.1111 * 90% = 10%

    train_encoded_texts = tokenize_texts(train_texts)
    val_encoded_texts = tokenize_texts(val_texts)
    test_encoded_texts = tokenize_texts(test_texts)

    train_dataset = MedicalDataset(train_encoded_texts, train_labels)
    val_dataset = MedicalDataset(val_encoded_texts, val_labels)
    test_dataset = MedicalDataset(test_encoded_texts, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=data_collator)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=data_collator)

    total_steps = len(train_loader) * 15  # 15 epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=total_steps//10, num_training_steps=total_steps)

    fold_report, fold_confusion_matrices = train_and_evaluate(train_loader, val_loader, model, optimizer, scheduler, device)
    fold_results.append((fold_report, fold_confusion_matrices))

for i, (report, conf_matrices) in enumerate(fold_results):
    print(f"Results for Fold {i+1}:")
    print("Classification Report:")
    print(report)
    print("Confusion Matrices:")
    for j, label in enumerate(['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']):
        print(f"Confusion Matrix for {label}:")
        print(conf_matrices[j])

model_save_path = 'path_to_save_final_model_after_cv'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = batch['labels'].cpu().numpy()
        predictions.append(logits)
        true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_labels = (predictions > 0.5).astype(int)

conf_matrices = multilabel_confusion_matrix(true_labels, pred_labels)
print("Confusion Matrices:")
for i, label in enumerate(['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES']):
    print(f"Confusion Matrix for {label}:")
    print(conf_matrices[i])

class_report = classification_report(true_labels, pred_labels, target_names=['ABDOMINAL', 'CREATININE', 'MAJOR-DIABETES'])
print("Classification Report:")
print(class_report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1
Epoch 1, Loss: 0.7092295289039612
Epoch 2, Loss: 0.7017975330352784
Epoch 3, Loss: 0.6906179904937744
Epoch 4, Loss: 0.6873279929161071
Epoch 5, Loss: 0.6771642804145813
Epoch 6, Loss: 0.6661722421646118
Epoch 7, Loss: 0.6790554761886597
Epoch 8, Loss: 0.6740540385246276
Epoch 9, Loss: 0.6779130220413208
Epoch 10, Loss: 0.6717692852020264
Epoch 11, Loss: 0.668136715888977
Epoch 12, Loss: 0.6599303126335144
Epoch 13, Loss: 0.6545925855636596
Epoch 14, Loss: 0.6372753620147705
Epoch 15, Loss: 0.6854901075363159


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 2
Epoch 1, Loss: 0.7104501366615296
Epoch 2, Loss: 0.7075736284255981
Epoch 3, Loss: 0.6671812176704407
Epoch 4, Loss: 0.626364803314209
Epoch 5, Loss: 0.6669377326965332
Epoch 6, Loss: 0.6697378635406495
Epoch 7, Loss: 0.6561361670494079
Epoch 8, Loss: 0.653447151184082
Epoch 9, Loss: 0.6504805684089661
Epoch 10, Loss: 0.6447255134582519
Epoch 11, Loss: 0.6333563208580018
Epoch 12, Loss: 0.6092201709747315
Epoch 13, Loss: 0.5969486951828002
Epoch 14, Loss: 0.5722700595855713
Epoch 15, Loss: 0.5383786916732788


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 3
Epoch 1, Loss: 0.5360174626111984
Epoch 2, Loss: 0.5495795011520386
Epoch 3, Loss: 0.522450178861618
Epoch 4, Loss: 0.5040276348590851
Epoch 5, Loss: 0.5074393004179001
Epoch 6, Loss: 0.4318307985862096
Epoch 7, Loss: 0.4140099436044693
Epoch 8, Loss: 0.3683725992838542
Epoch 9, Loss: 0.3414212415615718
Epoch 10, Loss: 0.3228698670864105
Epoch 11, Loss: 0.3809761752684911
Epoch 12, Loss: 0.36160710205634433
Epoch 13, Loss: 0.30785974860191345
Epoch 14, Loss: 0.2847176988919576
Epoch 15, Loss: 0.24717796593904495


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 4
Epoch 1, Loss: 0.25693770001331967
Epoch 2, Loss: 0.24828250954548517
Epoch 3, Loss: 0.2858055805166562
Epoch 4, Loss: 0.23309014240900675
Epoch 5, Loss: 0.20068746556838354
Epoch 6, Loss: 0.20001914476354918
Epoch 7, Loss: 0.18780794367194176
Epoch 8, Loss: 0.16733994086583456
Epoch 9, Loss: 0.1312048820157846
Epoch 10, Loss: 0.13803814103206
Epoch 11, Loss: 0.11248199890057246
Epoch 12, Loss: 0.08949428796768188
Epoch 13, Loss: 0.0902277280886968
Epoch 14, Loss: 0.08706822929282983
Epoch 15, Loss: 0.06804790844519933


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 5
Epoch 1, Loss: 0.10769724721709888
Epoch 2, Loss: 0.09427037090063095
Epoch 3, Loss: 0.0930891577154398
Epoch 4, Loss: 0.0996816127250592
Epoch 5, Loss: 0.06807181052863598
Epoch 6, Loss: 0.06324427450696628
Epoch 7, Loss: 0.060907372583945595
Epoch 8, Loss: 0.05160104048748811
Epoch 9, Loss: 0.052936747670173645
Epoch 10, Loss: 0.04765116237103939
Epoch 11, Loss: 0.039340246468782425
Epoch 12, Loss: 0.0382203304519256
Epoch 13, Loss: 0.03854519004623095
Epoch 14, Loss: 0.030375546775758266
Epoch 15, Loss: 0.03343358946343263


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 6
Epoch 1, Loss: 0.034571109960476555
Epoch 2, Loss: 0.03505562866727511
Epoch 3, Loss: 0.02915427243957917
Epoch 4, Loss: 0.02633424320568641
Epoch 5, Loss: 0.025124576563636463
Epoch 6, Loss: 0.023259863257408142
Epoch 7, Loss: 0.02465758038063844
Epoch 8, Loss: 0.02229643225048979
Epoch 9, Loss: 0.020733152826627094
Epoch 10, Loss: 0.01826291826243202
Epoch 11, Loss: 0.019709311425685883
Epoch 12, Loss: 0.016259723498175543
Epoch 13, Loss: 0.016595342506965
Epoch 14, Loss: 0.015800297415504854
Epoch 15, Loss: 0.014221892847369114


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 7
Epoch 1, Loss: 0.014075218699872494
Epoch 2, Loss: 0.018580389985193808
Epoch 3, Loss: 0.016661721592148144
Epoch 4, Loss: 0.014441504143178463
Epoch 5, Loss: 0.014814620216687521
Epoch 6, Loss: 0.013317916542291641
Epoch 7, Loss: 0.012601127692808708
Epoch 8, Loss: 0.011708266412218412
Epoch 9, Loss: 0.011876260706533989
Epoch 10, Loss: 0.011067370030408105
Epoch 11, Loss: 0.011220042050505677
Epoch 12, Loss: 0.01088319206610322
Epoch 13, Loss: 0.009822180649886528
Epoch 14, Loss: 0.010553533832232157
Epoch 15, Loss: 0.010270477272570133


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 8
Epoch 1, Loss: 0.00920264127974709
Epoch 2, Loss: 0.009536595782265067
Epoch 3, Loss: 0.010421010044713816
Epoch 4, Loss: 0.009923359223951897
Epoch 5, Loss: 0.008859753143042326
Epoch 6, Loss: 0.009047092404216528
Epoch 7, Loss: 0.008513231916973988
Epoch 8, Loss: 0.008375801534081498
Epoch 9, Loss: 0.00857234001159668
Epoch 10, Loss: 0.008522826402137676
Epoch 11, Loss: 0.008481454104185104
Epoch 12, Loss: 0.008181511424481869
Epoch 13, Loss: 0.0076333026712139445
Epoch 14, Loss: 0.007942482751483718
Epoch 15, Loss: 0.007149172713980079


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 9
Epoch 1, Loss: 0.007421638894205292
Epoch 2, Loss: 0.007528009979675214
Epoch 3, Loss: 0.007176764930287997
Epoch 4, Loss: 0.007227759808301926
Epoch 5, Loss: 0.007145404427622755
Epoch 6, Loss: 0.0072113102457175655
Epoch 7, Loss: 0.008512490196153522
Epoch 8, Loss: 0.006819186654562752
Epoch 9, Loss: 0.006982405204325914
Epoch 10, Loss: 0.0066612293012440205
Epoch 11, Loss: 0.00630668131634593
Epoch 12, Loss: 0.0064338739806165295
Epoch 13, Loss: 0.006450063005710642
Epoch 14, Loss: 0.006306412785003583
Epoch 15, Loss: 0.0068092006258666515


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 10
Epoch 1, Loss: 0.005971172358840704
Epoch 2, Loss: 0.00618156767450273
Epoch 3, Loss: 0.005897634895518422
Epoch 4, Loss: 0.005803555327778061
Epoch 5, Loss: 0.006406250720222791
Epoch 6, Loss: 0.006001800609131654
Epoch 7, Loss: 0.006273890224595864
Epoch 8, Loss: 0.0067635606198261184
Epoch 9, Loss: 0.006133409605051081
Epoch 10, Loss: 0.00545814687696596
Epoch 11, Loss: 0.005751791410148144
Epoch 12, Loss: 0.005824444505075614
Epoch 13, Loss: 0.006568614936744173
Epoch 14, Loss: 0.005411131074652076
Epoch 15, Loss: 0.0058590020053088665


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for Fold 1:
Classification Report:
                precision    recall  f1-score   support

     ABDOMINAL       0.00      0.00      0.00         7
    CREATININE       0.00      0.00      0.00         8
MAJOR-DIABETES       0.00      0.00      0.00        11

     micro avg       0.00      0.00      0.00        26
     macro avg       0.00      0.00      0.00        26
  weighted avg       0.00      0.00      0.00        26
   samples avg       0.00      0.00      0.00        26

Confusion Matrices:
Confusion Matrix for ABDOMINAL:
[[14  0]
 [ 7  0]]
Confusion Matrix for CREATININE:
[[13  0]
 [ 8  0]]
Confusion Matrix for MAJOR-DIABETES:
[[10  0]
 [11  0]]
Results for Fold 2:
Classification Report:
                precision    recall  f1-score   support

     ABDOMINAL       0.50      0.11      0.18         9
    CREATININE       0.00      0.00      0.00        10
MAJOR-DIABETES       0.62      0.45      0.53        11

     micro avg       0.60      0.20      0.30        30
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
ls

part1.zip  [0m[01;34msample_data[0m/


In [None]:
!unzip part1.zip -d .

Archive:  part1.zip
   creating: ./part1/
  inflating: ./part1/100.xml         
  inflating: ./part1/101.xml         
  inflating: ./part1/102.xml         
  inflating: ./part1/103.xml         
  inflating: ./part1/104.xml         
  inflating: ./part1/105.xml         
  inflating: ./part1/106.xml         
  inflating: ./part1/107.xml         
  inflating: ./part1/109.xml         
  inflating: ./part1/110.xml         
  inflating: ./part1/111.xml         
  inflating: ./part1/112.xml         
  inflating: ./part1/113.xml         
  inflating: ./part1/114.xml         
  inflating: ./part1/116.xml         
  inflating: ./part1/117.xml         
  inflating: ./part1/121.xml         
  inflating: ./part1/122.xml         
  inflating: ./part1/123.xml         
  inflating: ./part1/124.xml         
  inflating: ./part1/125.xml         
  inflating: ./part1/126.xml         
  inflating: ./part1/127.xml         
  inflating: ./part1/128.xml         
  inflating: ./part1/129.xml         
  inflat

In [None]:
ls

[0m[01;34mpart1[0m/  part1.zip  [01;34mpath_to_save_final_model_after_cv[0m/  [01;34msample_data[0m/


In [None]:
zip path_to_save_final_model_after_cv/

SyntaxError: invalid syntax (<ipython-input-4-8e1b91c98971>, line 1)

In [None]:
!zip out.zip path_to_save_final_model_after_cv/

  adding: path_to_save_final_model_after_cv/ (stored 0%)


In [None]:
ls

out.zip  [0m[01;34mpart1[0m/  part1.zip  [01;34mpath_to_save_final_model_after_cv[0m/  [01;34msample_data[0m/


In [None]:
cd path_to_save_final_model_after_cv/

/content/path_to_save_final_model_after_cv


In [None]:
ls -l


total 423344
-rw-r--r-- 1 root root       884 Jun 26 20:43 config.json
-rw-r--r-- 1 root root 433273844 Jun 26 20:43 model.safetensors
-rw-r--r-- 1 root root       125 Jun 26 20:43 special_tokens_map.json
-rw-r--r-- 1 root root      1243 Jun 26 20:43 tokenizer_config.json
-rw-r--r-- 1 root root    213450 Jun 26 20:43 vocab.txt


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
ls

config.json  model.safetensors  special_tokens_map.json  tokenizer_config.json  vocab.txt


In [None]:
!cp model.safetensors ../drive/MyDrive/