In [None]:
import json
import os
from datetime import date
from medcat.cat import CAT
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT
from tokenizers import ByteLevelBPETokenizer

# Set parameters

In [None]:
# Paths and names used by every training cell below.

# relative path to working_with_cogstack folder
_rel_path = os.path.join("..", "..", "..")
# absolute path to working_with_cogstack folder
base_path = os.path.abspath(_rel_path)

# Load mct export
ann_dir = os.path.join(base_path, "data", "medcattrainer_export")
mct_export_name = ""  # name of your mct export (file name only, e.g. "my_export.json")
# os.path.join adds the path separator that plain string concatenation would drop
mctrainer_export_path = os.path.join(ann_dir, mct_export_name)

# Load model
model_dir = os.path.join(base_path, "models", "modelpack")
modelpack = ''  # name of modelpack
model_pack_path = os.path.join(model_dir, modelpack)

# will be used to date the trained model (YYYYMMDD)
today = date.today().strftime("%Y%m%d")
# output_modelpack = os.path.join(model_dir, f"{today}_trained_model")

# Initialise meta_ann models: the meta models live in the unzipped modelpack folder,
# so drop a trailing '.zip' if the modelpack was given as an archive name
if model_pack_path.endswith('.zip'):
    base_dir_meta_models = model_pack_path[:-len('.zip')]
else:
    base_dir_meta_models = model_pack_path

# Collect the meta models contained in the modelpack.
# These meta-annotation tasks should correspond to the ones labelled in the mct export.
# Only direct sub-folders named 'meta_<task>' are considered, because the training cells
# below load each model from os.path.join(base_dir_meta_models, 'meta_' + <task>).
# Names are sorted so the training order is the same on every run.
meta_model_names = []
if os.path.isdir(base_dir_meta_models):
    meta_model_names = sorted(
        entry[len('meta_'):]
        for entry in os.listdir(base_dir_meta_models)
        if entry.startswith('meta_')
        and os.path.isdir(os.path.join(base_dir_meta_models, entry))
    )

Before you run the next section, please double-check that the meta-annotation names in the model match those specified in the mct export.

# For LSTM model

In [None]:
# Train one LSTM meta model per meta-annotation task found in the modelpack.
for meta_model in meta_model_names:
    # Folder of this meta model inside the modelpack
    meta_model_dir = os.path.join(base_dir_meta_models, "meta_" + meta_model)

    # Rebuild the byte-level BPE tokenizer from the files stored with the meta model
    vocab_file = os.path.join(meta_model_dir, 'bbpe-vocab.json')
    merges_file = os.path.join(meta_model_dir, 'bbpe-merges.txt')
    tokenizer = TokenizerWrapperBPE(ByteLevelBPETokenizer(vocab=vocab_file,
                                                          merges=merges_file,
                                                          lowercase=True))

    # load and sort out the config
    config_file = os.path.join(meta_model_dir, "config.json")
    with open(config_file, 'r') as jfile:
        config_dict = json.load(jfile)
    config = ConfigMetaCAT()
    # Reset the config attributes from the saved file: each top-level entry is copied
    # onto the fresh config from its 'py/state' -> '__dict__' payload.
    # TODO: Talk to Mart about how his new config style has affected this and best practise going forward
    for key, value in config_dict.items():
        setattr(config, key, value['py/state']['__dict__'])

    # Where to save the meta_model and results.
    # Ideally this should replace the meta_models inside the modelpack
    save_dir_path = "test_meta_" + meta_model

    # Initialise and train meta_model
    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)
    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

    # Save results; make sure the folder exists and close the file once written
    os.makedirs(save_dir_path, exist_ok=True)
    results_path = os.path.join(save_dir_path, 'meta_' + meta_model + '_results.json')
    with open(results_path, 'w') as results_file:
        json.dump(results, results_file)

# For BERT model

In [None]:
# Train one BERT meta model per meta-annotation task found in the modelpack.
for meta_model in meta_model_names:
    # Folder of this meta model inside the modelpack
    meta_model_dir = os.path.join(base_dir_meta_models, "meta_" + meta_model)

    # load and sort out the config
    config_file = os.path.join(meta_model_dir, "config.json")
    with open(config_file, 'r') as jfile:
        config_dict = json.load(jfile)
    config = ConfigMetaCAT()
    # Each top-level entry of the saved config is copied onto the fresh config
    # from its 'py/state' -> '__dict__' payload.
    for key, value in config_dict.items():
        setattr(config, key, value['py/state']['__dict__'])

    # change model name if training BERT for the first time
    config['model']['model_name'] = 'bert'

    # change input_size as well
    config['model']['input_size'] = 768

    # The tokenizer is loaded from the meta model folder for the configured model variant
    tokenizer = TokenizerWrapperBERT.load(meta_model_dir, config['model']['model_variant'])

    # Where to save the meta_model and results.
    # Ideally this should replace the meta_models inside the modelpack
    save_dir_path = "test_meta_" + meta_model

    # Initialise and train meta_model
    mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)
    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

    # Save results; make sure the folder exists and close the file once written
    os.makedirs(save_dir_path, exist_ok=True)
    results_path = os.path.join(save_dir_path, 'meta_' + meta_model + '_results.json')
    with open(results_path, 'w') as results_file:
        json.dump(results, results_file)

## If you don't have the model packs and are training from scratch

In [None]:
# Train a single BERT meta model from a default config (no existing modelpack needed).
config = ConfigMetaCAT()
# make sure to change the following parameters:
# config['model']['nclasses']
# config['general']['category_name']

# change model name if training BERT for the first time
config['model']['model_name'] = 'bert'
# NOTE(review): the modelpack-based BERT cell also sets config['model']['input_size'] = 768;
# confirm whether that is required here as well.

# An empty path is passed as the tokenizer location; the variant comes from the config
tokenizer = TokenizerWrapperBERT.load("", config['model']['model_variant'])

# Where to save the meta_model and results.
# Ideally this should replace the meta_models inside the modelpack
save_dir_path = "test_meta"

# Initialise and train meta_model
mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)
results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

# Save results; make sure the folder exists and close the file once written
os.makedirs(save_dir_path, exist_ok=True)
with open(os.path.join(save_dir_path, 'meta_results.json'), 'w') as results_file:
    json.dump(results, results_file)

## If using 2 phase learning with training

### Phase 1

In [None]:
# Follow the same steps as in the training cells above up to (and including) defining save_dir_path.
# This cell relies on `config`, `tokenizer`, `save_dir_path` and `mctrainer_export_path`
# already being defined by those earlier cells.

######################################################################################################
# 2 phase learning (used for imbalanced datasets) - trains the models twice:
#                    phase 1: trains for minority class(es) by undersampling data
#                    phase 2: trains for all classes
# parameter values for config.model.phase_number:
# 1: Phase 1 - Train model on undersampled data
# 2: Phase 2 - Continue training on full data
# 0: None (no 2 phase learning)

# NOTE: Make sure to use class weights in favour of minority classes with 2 phase learning
#####################################################################################################

# Select phase 1 (training on undersampled data)
config.model.phase_number = 1

# specify the class that will define the desired sample size for the undersampling process
# if this is left empty, the class with the lowest samples will be chosen
# example shown for Status classification task
config.model['category_undersample'] = 'Other'


# For class weights, two options are shown below.
# Option 1: use explicitly specified class weights
config['train']['class_weights'] = [0.3,0.7]

# Option 2: calculate class weights based on class distribution
# NOTE(review): both options are set in this cell; which one takes precedence is decided
# inside MetaCAT training - confirm, and keep only the option you intend to use.
config['train']['compute_class_weights'] = True

# NOTE: when using class weights, it is recommended to define the category to index mapping to ensure the weights are assigned to the right class
# presumably the position of each weight in class_weights corresponds to these ids - verify
config['general']['category_value2id'] = {"Other":1,"Confirmed":0}


# Initialise and train meta_model (phase 1); the results are saved after phase 2
mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)
results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

### Phase 2

In [None]:
# Perform 2nd round of training.
# `mc` was created in the Phase 1 cell with this same `config` object, so the phase
# number is switched on that object before training again.
config.model.phase_number = 2
# Train phase 2
results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)

# Save results; make sure the folder exists and close the file once written.
# NOTE(review): `meta_model` is the loop variable left over from the per-model training
# cells above; it is undefined if those loops never ran - confirm it names the model trained here.
os.makedirs(save_dir_path, exist_ok=True)
results_path = os.path.join(save_dir_path, 'meta_' + meta_model + '_results.json')
with open(results_path, 'w') as results_file:
    json.dump(results, results_file)

## Oversampling data

You can also oversample data to help mitigate class imbalance. <br> Use this code to generate synthetic data using an LLM - [link](https://gist.github.com/shubham-s-agarwal/401ef8bf6cbbd66fa0c76a8fbfc1f6c4)

In [None]:
# To run the training with original + synthetic data
# Follow the same steps as in the training cells above until initializing the metacat model
# (`config`, `tokenizer`, `save_dir_path` and `mctrainer_export_path` must already be defined).

# Initialise and train meta_model
mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)

# the format expected is one entry per synthetic sample:
# [[['text','of','the','document'], [index of medical entity], "label"],
#  [['text','of','the','document'], [index of medical entity], "label"]]

synthetic_data_export = [[],[],[]]  # placeholder - replace with your synthetic samples

results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path, data_oversampled=synthetic_data_export)

# Save results; make sure the folder exists and close the file once written
os.makedirs(save_dir_path, exist_ok=True)
with open(os.path.join(save_dir_path, 'meta_results.json'), 'w') as results_file:
    json.dump(results, results_file)