In [1]:
# CLONE REPOSITORY
!git clone --branch main https://github.com/rolysr/medical-knowledge-discoverer

Cloning into 'medical-knowledge-discoverer'...
remote: Enumerating objects: 442, done.[K
remote: Counting objects: 100% (103/103), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 442 (delta 50), reused 71 (delta 32), pack-reused 339[K
Receiving objects: 100% (442/442), 1.35 MiB | 21.59 MiB/s, done.
Resolving deltas: 100% (197/197), done.


In [2]:
# MOVE TO PROJECT
%cd medical-knowledge-discoverer/

/content/medical-knowledge-discoverer


In [3]:
# INSTALLS
%pip install simplet5 fasttext
!python -m spacy download es_core_news_sm en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simplet5
  Downloading simplet5-0.1.4.tar.gz (7.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from simplet5)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers==4.16.2 (from simplet5)
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.5.10 (from s

2023-06-06 03:23:02.645427: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-06 03:23:05.084042: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-06 03:23:05.084492: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [4]:
# IMPORTS
import os
from pathlib import Path
from simplet5 import SimpleT5
from rich.progress import track
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# FROM PROJECT
from utils.anntools import Collection


# MODELS
from models.T5.t5 import T5
from models.NER.ner import NER


INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [5]:
# T5 MODEL
t5 = T5()

# OUTPUT
output_path = Path('./output')
os.makedirs(output_path, exist_ok=True)

# TRAIN PATH
train_path = Path('./datasets/train')
csv_train_file = './models/T5/re_train.csv'

# GENERATE TRAIN DATA
test_collection = Collection().load_dir(train_path)
train_dataset = T5.generate_t5_input_output_format(test_collection)
MAX_INPUT_TOKENS = max([len(data[0]) for data in train_dataset])
MAX_OUTPUT_TOKENS = max([len(data[1]) for data in train_dataset])
t5.generate_csv(train_dataset, csv_train_file)

In [6]:
# TRAIN MODEL
# TRAIN A MODEL FROM SCRATCH AND SAVE EACH EPOCH IN DIFFERENT FILES
model = SimpleT5()

t5.generate_csv(train_dataset, str(csv_train_file))
df = t5.load_csv(str(csv_train_file))
train_df, test_df = train_test_split(df, test_size=0.1)

model.from_pretrained(model_type="t5", model_name="t5-base")

print('Training...')
model.train(train_df=train_df,
            eval_df=test_df, 
            source_max_token_len=MAX_INPUT_TOKENS + 50, 
            target_max_token_len=MAX_OUTPUT_TOKENS + 8, 
            batch_size=8,
            max_epochs=4,
            use_gpu=True,
            outputdir=output_path
)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training...


INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [8]:
# # SELECT MODEL
!ls "./output"

NotImplementedError: ignored

In [11]:
# SELECT MODEL
trained_model = './output/simplet5-epoch-3-train-loss-0.0747-val-loss-0.0715'

In [12]:
# LOAD TRAINED MODEL
model = SimpleT5()
model.load_model('t5', trained_model, use_gpu=True)
t5.model = model

In [13]:
# NER MODEL
ner = NER()

# TRAINING NER MODEL
train_collection = Collection().load_dir(train_path)
ner.train(train_collection)




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None, 10)]   0           []                               
                                                                                                  
 time_distributed (TimeDistribu  (None, None, 10, 10  2540       ['input_2[0][0]']                
 ted)                           )                                                                 
                                                                                                  
 input_1 (InputLayer)           [(None, None, 3993)  0           []                               
                                ]                                                                 
                                                                                              

Output()

In [24]:
from utils.anntools import Relation

def relation(rel: Relation):
    return (rel.origin, rel.destination, rel.label)

In [41]:
# EVALUATION
def eval(test_collection: Collection, ner_collection: Collection, model):
    
    CORRECT, MISSING, SPURIOUS, INCORRECT = 0, 0, 0, 0

    for sentences in track([x for x in zip(test_collection.sentences, ner_collection.sentences)]):
        test_sentence, ner_sentence = sentences
        
        test = {}
        for test_relation in test_sentence.relations:
            origin = test_relation.from_phrase
            origin_text = origin.text.lower()
            destination = test_relation.to_phrase
            destination_text = destination.text.lower()

            input_text = T5.get_marked_sentence_t5_input_format(test_sentence.text, origin_text, origin.label, destination_text, destination.label)
            output_text = T5.get_t5_output_format(origin_text, origin.label, destination_text, destination.label, test_relation.label)
            
            test[relation(test_relation)] = output_text

        results= {}
        for ner_relation in ner_sentence.relations:
            origin = ner_relation.from_phrase
            origin_text = origin.text.lower()
            destination = ner_relation.to_phrase
            destination_text = destination.text.lower()

            #making the pair
            input_text = T5.get_marked_sentence_t5_input_format(ner_sentence.text, origin_text, origin.label, destination_text, destination.label)

            results[relation(ner_relation)] = model.predict(input_text)[0]
        
        
        for i in test.copy():
            if results.get(i) is not None:
                if results[i].split("'")[1] == test[i]:
                    CORRECT += 1
                    results.pop(i)
                    test.pop(i)
                else:
                    INCORRECT += 1
                    results.pop(i)
                    test.pop(i)
        
        SPURIOUS += len(results)
        MISSING += len(test)


    return CORRECT, MISSING, SPURIOUS, INCORRECT

In [42]:
# RE EVALUATION
test_path = Path('./datasets/test/scenario1-main')
csv_test_file = Path('models/T5/re_test.csv')
test_collection = Collection().load_dir(test_path)

# EVALUATE NER
ner_collection = ner.run(test_collection)

print(len(test_collection), len(ner_collection))
CORRECT, MISSING, SPURIOUS, INCORRECT = eval(test_collection, ner_collection, model)



Output()

200 200


In [43]:
# SHOW RESULTS
precision = CORRECT / (CORRECT + MISSING + INCORRECT) if (CORRECT + MISSING + INCORRECT) > 0 else 0
recall = CORRECT / (CORRECT + SPURIOUS + INCORRECT) if (CORRECT + SPURIOUS + INCORRECT) > 0 else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("Precision:", precision)
print('Recall:', recall)
print('F1 score:', f1)

Precision: 0.6327014218009479
Recall: 0.6327014218009479
F1 score: 0.6327014218009479


In [44]:
print(CORRECT, MISSING, SPURIOUS, INCORRECT)

534 0 0 310
