In [1]:
import os
os.chdir('../')

In [2]:
%pwd

'/home/milad/projects/medical-nlp-pipeline'

In [124]:
from datasets import load_dataset, Split
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertForSequenceClassification
from transformers import BertForSequenceClassification
import torch


In [202]:
import mlflow
from urllib.parse import urlparse

In [6]:
def label_prep(row):
    label_dict = {'Medical Necessity': 0, 'Experimental/Investigational': 1, 'Urgent Care': 2}
    label_type = label_dict[row['Type']]
    return {'labels': label_type}

In [7]:
dataset = load_dataset(path='csv',data_files='artifacts/data_ingestion/ca-independent-medical-review/Independent_Medical_Reviews_Custom.csv',split=Split.TRAIN)
dataset = dataset.map(label_prep)
shuffled_dataset = dataset.shuffle(seed=42)
split_dataset = shuffled_dataset.train_test_split(test_size=0.2)

In [38]:
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [8]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['Findings', 'Type', 'labels'],
        num_rows: 15380
    })
    test: Dataset({
        features: ['Findings', 'Type', 'labels'],
        num_rows: 3845
    })
})

In [33]:
split_dataset['train'].features

{'Findings': Value(dtype='string', id=None),
 'Type': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None)}

In [180]:
# mapping integer labels to string labels and vv
id2label = {0: 'Medical Necessity', 1: 'Experimental/Investigational', 2: 'Urgent Care'}
label2id = {'Medical Necessity': 0, 'Experimental/Investigational': 1, 'Urgent Care': 2}

In [181]:
torch.manual_seed(42)
bert_cls = DistilBertForSequenceClassification.from_pretrained(
'distilbert-base-uncased', num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [183]:
auto_tokenizer = AutoTokenizer.from_pretrained(
'distilbert-base-uncased'
)
def tokenize(row):
    return auto_tokenizer(row['Findings'],
                        truncation=True,
                        padding='max_length',
                        max_length=30)

In [184]:
tokenized_train_dataset = train_dataset.map(
tokenize, batched=True
)
tokenized_test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/3845 [00:00<?, ? examples/s]

Map: 100%|██████████| 3845/3845 [00:00<00:00, 4111.77 examples/s]


In [185]:
print(tokenized_train_dataset[0])

{'Findings': 'Nature of Statutory Criteria/Case Summary: A male enrollee has requested Zepatier for treatment of his hepatitis C virus. Findings: The physician reviewer found that according to the recent joint guidelines issued by the American Association for the Study of Liver Diseases (AASLD) and the Infectious Diseases Society of America (IDSA), all patients with chronic hepatitis C should be treated except those with limited life expectancy due to non-liver-related conditions. After failure of NS5A inhibitors such as Harvoni, AASLD/IDSA guidelines state that patients who are high priority for treatment should undergo testing for resistance-associated variants before re-treatment. No NS5A or NS3 inhibitor resistance testing results have been provided in this case. If there is no NS5A resistance, Zepatier is not recommended.  When resistance to NS5A inhibitors is present, the AASLD/IDSA guidelines do not recommend use of Zepatier therapy unless there is also NS3 resistance.  If both 

In [186]:
tokenized_train_dataset.set_format(
type='torch',
columns=['input_ids', 'attention_mask', 'labels'])

tokenized_test_dataset.set_format(
type='torch',
columns=['input_ids', 'attention_mask', 'labels']
)

In [187]:
from transformers import Trainer
trainer = Trainer(model=bert_cls,
train_dataset=tokenized_train_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [188]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir='output',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    evaluation_strategy='steps',
    eval_steps=300,
    logging_steps=300,
    gradient_accumulation_steps=8,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [189]:
import numpy as np
def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [190]:
trainer = Trainer(model=bert_cls,
                args=training_args,
                train_dataset=tokenized_train_dataset,
                eval_dataset=tokenized_test_dataset,
                compute_metrics=compute_metrics)

In [191]:
trainer.train()

 31%|███▏      | 300/960 [01:17<02:46,  3.97it/s]

{'loss': 0.3637, 'grad_norm': 1.3411959409713745, 'learning_rate': 3.4375e-05, 'epoch': 0.62}


                                                 
 31%|███▏      | 301/960 [01:21<14:34,  1.33s/it] 

{'eval_loss': 0.2644086480140686, 'eval_accuracy': 0.8925877763328999, 'eval_runtime': 3.623, 'eval_samples_per_second': 1061.282, 'eval_steps_per_second': 132.764, 'epoch': 0.62}


 62%|██████▎   | 600/960 [02:40<01:10,  5.12it/s]

{'loss': 0.234, 'grad_norm': 2.3393137454986572, 'learning_rate': 1.8750000000000002e-05, 'epoch': 1.25}


                                                 
 63%|██████▎   | 601/960 [02:44<07:37,  1.28s/it] 

{'eval_loss': 0.25135377049446106, 'eval_accuracy': 0.893368010403121, 'eval_runtime': 3.6055, 'eval_samples_per_second': 1066.436, 'eval_steps_per_second': 133.409, 'epoch': 1.25}


 94%|█████████▍| 900/960 [03:59<00:14,  4.23it/s]

{'loss': 0.2076, 'grad_norm': 2.560081720352173, 'learning_rate': 3.125e-06, 'epoch': 1.87}


                                                 
 94%|█████████▍| 900/960 [04:02<00:14,  4.23it/s] 

{'eval_loss': 0.23597051203250885, 'eval_accuracy': 0.9076723016905072, 'eval_runtime': 3.5624, 'eval_samples_per_second': 1079.322, 'eval_steps_per_second': 135.021, 'epoch': 1.87}


100%|██████████| 960/960 [04:18<00:00,  3.72it/s]

{'train_runtime': 258.3223, 'train_samples_per_second': 119.076, 'train_steps_per_second': 3.716, 'train_loss': 0.2630216340223948, 'epoch': 2.0}





TrainOutput(global_step=960, training_loss=0.2630216340223948, metrics={'train_runtime': 258.3223, 'train_samples_per_second': 119.076, 'train_steps_per_second': 3.716, 'total_flos': 238445569843200.0, 'train_loss': 0.2630216340223948, 'epoch': 1.9973992197659298})

In [192]:
trainer.evaluate()

  0%|          | 0/481 [00:00<?, ?it/s]

100%|██████████| 481/481 [00:03<00:00, 128.63it/s]


{'eval_loss': 0.23864923417568207,
 'eval_accuracy': 0.9063719115734721,
 'eval_runtime': 3.752,
 'eval_samples_per_second': 1024.775,
 'eval_steps_per_second': 128.197,
 'epoch': 1.9973992197659298}

In [193]:
trainer.save_model('artifacts/training/distilbert')

In [225]:
from transformers import AutoModelForSequenceClassification
loaded_model = (AutoModelForSequenceClassification.from_pretrained('artifacts/training/bert-base-uncased'))
loaded_model.device

device(type='cpu')

In [195]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
loaded_model.to(device)
loaded_model.device

device(type='cuda', index=0)

In [227]:
sentence = '''Nature of Statutory Criteria/Case Summary:  An enrollee has requested a vascular surgery evaluation for treatment of her bilateral lower extremity varicose veins.  
Findings:  The physician reviewer found that based on the available documentation, the requested vascular surgery evaluation is medically necessary to manage this patient’s compression hose, 
to rule out other causes for pain other than varicose veins, and to interpret diagnostic imaging in the presence of varicose veins. Further, the requested evaluation is consistent with the recommended management and treatment guidelines in the current medical literature. 
Thus, the requested evaluation with a vascular surgeon is supported as medically necessary for evaluation of this patient’s painful varicose veins.  
Final Result: The reviewer determined that the requested services are medically necessary for evaluation of the patient’s medical condition. 
Therefore, the Health Plan’s denial should be overturned.  Credentials/Qualifications: The reviewer is board certified in surgery with sub-specialty certification in vascular surgery and is actively practicing. 
The reviewer is an expert in the treatment of the enrollee’s medical condition and knowledgeable about the proposed treatment through recent or current actual clinical experience treating those with the same or a similar medical condition.'''
tokens = auto_tokenizer(sentence, return_tensors='pt')

In [228]:
tokens.to(loaded_model.device)

{'input_ids': tensor([[  101,  3267,  1997, 15201,  9181,  1013,  2553, 12654,  1024,  2019,
         25612,  4402,  2038,  7303,  1037, 21449,  5970,  9312,  2005,  3949,
          1997,  2014, 17758,  2896,  4654,  7913, 16383, 13075, 11261,  3366,
          9607,  1012,  9556,  1024,  1996,  7522, 12027,  2179,  2008,  2241,
          2006,  1996,  2800, 12653,  1010,  1996,  7303, 21449,  5970,  9312,
          2003,  2966,  2135,  4072,  2000,  6133,  2023,  5776,  1521,  1055,
         13379, 21290,  1010,  2000,  3627,  2041,  2060,  5320,  2005,  3255,
          2060,  2084, 13075, 11261,  3366,  9607,  1010,  1998,  2000, 17841,
         16474, 12126,  1999,  1996,  3739,  1997, 13075, 11261,  3366,  9607,
          1012,  2582,  1010,  1996,  7303,  9312,  2003,  8335,  2007,  1996,
          6749,  2968,  1998,  3949, 11594,  1999,  1996,  2783,  2966,  3906,
          1012,  2947,  1010,  1996,  7303,  9312,  2007,  1037, 21449,  9431,
          2003,  3569,  2004,  2966,  

In [229]:
loaded_model.eval()
logits = loaded_model(input_ids=tokens['input_ids'],
attention_mask=tokens['attention_mask'])
logits

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.9107, -1.9434, -3.1365]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [230]:
logits.logits.argmax(dim=1)

tensor([0])

In [231]:
predicted_class_idx = logits.logits.argmax(-1).item()
print("Predicted class:", loaded_model.config.id2label[predicted_class_idx])

Predicted class: Medical Necessity


In [215]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    nlp_trained_model_path: Path
    nlp_updated_base_model_path: Path
    training_data: Path
    # mlflow_uri: str
    all_params: dict
    params_batch_size: int
    params_epochs: int
    params_learning_rate: float
    params_model_name: str

In [216]:
from medical_nlp.constants import *
from medical_nlp.utils.common import read_yaml, create_directories
from dotenv import load_dotenv

load_dotenv()

# MLFLOW_TRACKING_URI = os.environ["MLFLOW_TRACKING_URI"]
# MLFLOW_TRACKING_USERNAME = os.environ["MLFLOW_TRACKING_USERNAME"]
# MLFLOW_TRACKING_PASSWORD = os.environ["MLFLOW_TRACKING_PASSWORD"]

False

In [218]:
class configurationManager:
    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        
        create_directories([self.config.artifacts_root])
    
    def get_training_config(self) -> TrainingConfig:
        model_training = self.config.model_training
        prepare_base_model = self.config.prepare_base_model
        training_data = os.path.join(self.config.data_ingestion.root_dir, 'ca-independent-medical-review/')
        
        create_directories([model_training.root_dir])
        
        training_config = TrainingConfig(
            root_dir= model_training.root_dir,
            nlp_trained_model_path= model_training.nlp_trained_model_path,
            nlp_updated_base_model_path= prepare_base_model.nlp_updated_base_model_path,
            training_data= training_data,
            # mlflow_uri = MLFLOW_TRACKING_URI,
            all_params = self.params,
            params_batch_size= self.params.BATCH_SIZE,
            params_epochs= self.params.EPOCHS,
            params_learning_rate= self.params.LEARNING_RATE,
            params_model_name=self.params.MODEL_NAME
        )
        
        return training_config

In [236]:
class ModelTrainerHF(object):
    def __init__(self, config:TrainingConfig):
        self.config = config
        self.dataset = self.load_dataset()
        self.tokenizer, self.model = self.load_model()
        # self.set_loaders()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.set_seed()
        
    def load_model(self):
        # mapping integer labels to string labels and vv
        id2label = {0: 'Medical Necessity', 1: 'Experimental/Investigational', 2: 'Urgent Care'}
        label2id = {'Medical Necessity': 0, 'Experimental/Investigational': 1, 'Urgent Care': 2}
        tokenizer = AutoTokenizer.from_pretrained(self.config.params_model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
                    self.config.params_model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
                )
        print(model.classifier)
        return tokenizer, model
    
    def _label_prep(self, row):
        label_dict = {'Medical Necessity': 0, 'Experimental/Investigational': 1, 'Urgent Care': 2}
        label_type = label_dict[row['Type']]
        return {'labels': label_type}
    
    def _tokenize(self, row):
        return self.tokenizer(row['Findings'],
                            truncation=True,
                            padding='max_length',
                            max_length=50)
                        
    
    def load_dataset(self):
        dataset = dataset = load_dataset(path='csv',data_files=self.config.training_data + 'Independent_Medical_Reviews_Custom.csv',split=Split.TRAIN)
        return dataset
    
    def set_seed(self, seed=42):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)
    
    def set_loaders(self):
        self.dataset = self.dataset.map(self._label_prep)
        self.dataset = self.dataset.shuffle(seed=42)
        self.dataset = self.dataset.train_test_split(test_size=0.2)
        train_dataset = self.dataset['train']
        test_dataset = self.dataset['test']
        tokenized_train_dataset = train_dataset.map(self._tokenize, batched=True)
        tokenized_test_dataset = test_dataset.map(self._tokenize, batched=True)

        tokenized_train_dataset.set_format(
                            type='torch',
                            columns=['input_ids', 'attention_mask', 'labels'])

        tokenized_test_dataset.set_format(
                            type='torch',
                            columns=['input_ids', 'attention_mask', 'labels'])
        
        return tokenized_train_dataset, tokenized_test_dataset
    
    def load_trainer(self):
        
        # training_args = TrainingArguments(
        #                 output_dir='output',
        #                 num_train_epochs=2,
        #                 per_device_train_batch_size=4,
        #                 per_device_eval_batch_size=8,
        #                 evaluation_strategy='steps',
        #                 eval_steps=300,
        #                 logging_steps=300,
        #                 gradient_accumulation_steps=8,
        #             )
        tokenized_train_dataset, tokenized_test_dataset = self.set_loaders()
        args = TrainingArguments(
                self.config.nlp_trained_model_path + self.config.params_model_name,
                # save_strategy="epoch",
                evaluation_strategy="epoch",
                learning_rate=self.config.params_learning_rate,
                per_device_train_batch_size=self.config.params_batch_size,
                per_device_eval_batch_size=self.config.params_batch_size*2,
                num_train_epochs=self.config.params_epochs,
                load_best_model_at_end=True,
                metric_for_best_model="accuracy",
                logging_dir='logs',
                remove_unused_columns=False,
                gradient_accumulation_steps=8,
            )
        args.report_to = []
        
        def _compute_metrics(eval_pred):
            predictions = eval_pred.predictions
            labels = eval_pred.label_ids
            predictions = np.argmax(predictions, axis=1)
            return {"accuracy": (predictions == labels).mean()}
        
        # trainer = Trainer(
        #                 self.model,
        #                 args, 
        #                 train_dataset=self.dataset['train'],
        #                 eval_dataset=self.dataset['test'],
        #                 compute_metrics=_compute_metrics,
        #                 tokenizer=self.tokenizer,
        #             )
        
        trainer = Trainer(model=self.model,
                args=args,
                train_dataset=tokenized_train_dataset,
                eval_dataset=tokenized_test_dataset,
                compute_metrics=_compute_metrics)
        
        return trainer
    
    def train(self):
        trainer = self.load_trainer()
        trainer.train()
        self.save_checkpoint(trainer)
    
    
    def evaluation(self):
        self.load_checkpoint()
        trainer = self.load_trainer()
        outputs = trainer.evaluate()
        print(outputs)
        # y_true = outputs.label_ids
        # y_pred = outputs.predictions.argmax(1)
        # labels = self.dataset['train'].features['label'].names
        # cm = confusion_matrix(y_true, y_pred)
        # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        # disp.plot(xticks_rotation=45)
            
    def save_checkpoint(self, trainer):
        trainer.save_model(self.config.nlp_trained_model_path + self.config.params_model_name)
        
    def load_checkpoint(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(self.config.nlp_trained_model_path + self.config.params_model_name)
        self.model.to(self.device)
    
    
    def predict(self, text):
        self.load_checkpoint()
        tokenized_text = self.tokenizer(text, return_tensors='pt')
        tokenized_text.to(self.device)
        outputs = self.model(**tokenized_text)
        logits = outputs.logits
        # model predicts one of the 1000 ImageNet classes
        predicted_class_idx = logits.argmax(-1).item()
        
        return self.model.config.id2label[predicted_class_idx]
    
    def log_into_mlflow(self):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        
        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics({'train_loss': np.mean(self.losses),'val_loss': np.mean(self.val_losses), 'train_accuracy': np.mean(self.accuracy), 'val_accuracy': np.mean(self.val_accuracy)})
        
            # Model registry does not work with file store
            if tracking_url_type_store != "file":

                # Register the model
                # There are other ways to use the Model Registry, which depends on the use case,
                # please refer to the doc for more information:
                # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                mlflow.pytorch.log_model(self.model, "model", registered_model_name=self.config.params_model_name)
            else:
                mlflow.pytorch.log_model(self.model, "model")

In [246]:
try:
    config = configurationManager()
    training_config = config.get_training_config()
    training = ModelTrainerHF(config=training_config)
    # training.train()
    # training.evaluation()
    prediction = training.predict('i need help with my heart burn')
    print(prediction)
    # training.log_into_mlflow()

except Exception as e:
    raise e

[2024-05-07 15:04:44,217: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-07 15:04:44,220: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-07 15:04:44,221: INFO: common: created directory at: artifacts]
[2024-05-07 15:04:44,222: INFO: common: created directory at: artifacts/training]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=3, bias=True)
Urgent Care
