In [1]:
%pwd

'/home/milad/projects/medical-nlp-pipeline/research'

In [2]:
import os
os.chdir('../')

In [3]:
%pwd

'/home/milad/projects/medical-nlp-pipeline'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    nlp_trained_model_path: Path
    nlp_updated_base_model_path: Path
    training_data: Path
    # mlflow_uri: str
    all_params: dict
    params_batch_size: int
    params_epochs: int
    params_learning_rate: float
    params_model_name: str

In [6]:
from medical_nlp.constants import *
from medical_nlp.utils.common import read_yaml, create_directories
from dotenv import load_dotenv

load_dotenv()

# MLFLOW_TRACKING_URI = os.environ["MLFLOW_TRACKING_URI"]
# MLFLOW_TRACKING_USERNAME = os.environ["MLFLOW_TRACKING_USERNAME"]
# MLFLOW_TRACKING_PASSWORD = os.environ["MLFLOW_TRACKING_PASSWORD"]

False

In [8]:
class configurationManager:
    def __init__(self,
                 config_file_path=CONFIG_FILE_PATH,
                 params_file_path=PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        
        create_directories([self.config.artifacts_root])
    
    def get_training_config(self) -> TrainingConfig:
        model_training = self.config.model_training
        prepare_base_model = self.config.prepare_base_model
        training_data = os.path.join(self.config.data_ingestion.root_dir, 'ca-independent-medical-review/')
        
        create_directories([model_training.root_dir])
        
        training_config = TrainingConfig(
            root_dir= model_training.root_dir,
            nlp_trained_model_path= model_training.nlp_trained_model_path,
            nlp_updated_base_model_path= prepare_base_model.nlp_updated_base_model_path,
            training_data= training_data,
            # mlflow_uri = MLFLOW_TRACKING_URI,
            all_params = self.params,
            params_batch_size= self.params.BATCH_SIZE,
            params_epochs= self.params.EPOCHS,
            params_learning_rate= self.params.LEARNING_RATE,
            params_model_name=self.params.MODEL_NAME
        )
        
        return training_config

In [9]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from datasets import load_dataset, Split
from transformers import AutoTokenizer, AutoModel
import numpy as np
import mlflow
from urllib.parse import urlparse

[2024-05-06 07:49:04,628: INFO: config: PyTorch version 2.2.0 available.]


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, ff_units, n_outputs, dropout=0.3):
        super().__init__()
        self.d_model = bert_model.config.dim
        self.n_outputs = n_outputs
        self.encoder = bert_model
        self.mlp = nn.Sequential(
            nn.Linear(self.d_model, ff_units),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_units, n_outputs)
        )
    def encode(self, source, source_mask=None):
        states = self.encoder(
        input_ids=source, attention_mask=source_mask)[0]
        cls_state = states[:, 0]
        return cls_state
    def forward(self, X):
        source_mask = (X > 0)
        # Featurizer
        cls_state = self.encode(X, source_mask)
        # Classifier
        out = self.mlp(cls_state)
        return out

In [18]:
class ModelTrainer(object):
    def __init__(self, config:TrainingConfig, loss_fn=None, optimizer=None):
        self.config = config
        self.model = self.load_model()
        self.loss_fn = loss_fn if loss_fn else nn.BCEWithLogitsLoss()
        self.optimizer = optimizer if optimizer else optim.Adam(self.model.parameters(), lr=self.config.params_learning_rate)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)
        
        self.train_loader, self.val_loader = self.set_loaders()
        
        self.losses = []
        self.val_losses = []
        self.accuracy = []
        self.val_accuracy = []
        self.total_epoches = 0
        
        self.train_step_fn = self._make_train_step_fn()
        self.val_step_fn = self._make_val_step_fn()
        
    def load_model(self):
        # return torch.load(self.config.nlp_updated_base_model_path)
        bert_model = AutoModel.from_pretrained(self.config.params_model_name)
        return BERTClassifier(bert_model, 128, n_outputs=1)
    
    def set_seed(self, seed=42):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)
    
    def label_prep(self, row):
        label_dict = {'Medical Necessity': 0, 'Experimental/Investigational': 1, 'Urgent Care': 2}
        label_type = label_dict[row['Type']]
        return {'labels': label_type}

    
    def set_loaders(self):
        dataset = load_dataset(path='csv',data_files=self.config.training_data + 'Independent_Medical_Reviews_Custom.csv',split=Split.TRAIN)
        dataset = dataset.map(self.label_prep)
        shuffled_dataset = dataset.shuffle(seed=42)
        split_dataset = shuffled_dataset.train_test_split(test_size=0.2)
        train_dataset = split_dataset['train']
        test_dataset = split_dataset['test']
        auto_tokenizer = AutoTokenizer.from_pretrained(self.config.params_model_name)
        tokenizer_kwargs = dict(truncation=True,
                                padding=True,
                                max_length=30,
                                add_special_tokens=True)
        
        train_dataset_float = train_dataset.map(
            lambda row: {'labels': [float(row['labels'])]}
        )
        test_dataset_float = test_dataset.map(
            lambda row: {'labels': [float(row['labels'])]}
        )
        train_tensor_dataset = self._tokenize_dataset(train_dataset_float,
                                                'Findings',
                                                'labels',
                                                auto_tokenizer,
                                                **tokenizer_kwargs)
        test_tensor_dataset = self._tokenize_dataset(test_dataset_float,
                                                'Findings',
                                                'labels',
                                                auto_tokenizer,
                                                **tokenizer_kwargs)
        generator = torch.Generator()
        train_loader = DataLoader(
            train_tensor_dataset, batch_size=4,
            shuffle=True, generator=generator
        )
        test_loader = DataLoader(test_tensor_dataset, batch_size=8)
        
        return train_loader, test_loader
    
    # higher order function to be set and built globally and constructed the inner fuction without knowning x and y before hand
    def _make_train_step_fn(self):
        # single batch operation
        def perform_train_step_fn(x,y):
            # set the train mode
            self.model.train()
            
            # step 1: compute model output
            yhat = self.model(x)
            
            # step 2: compute the loss  
            loss= self.loss_fn(yhat, y)
            
            # step 2': compute accuracy 
            yhat = torch.argmax(yhat,1)
            total_correct = (yhat ==y).sum().item()
            total = y.shape[0]
            acc = total_correct/total
            
            # step 3: compute the gradient
            loss.backward()
            
            #step4: update parameters
            self.optimizer.step()
            self.optimizer.zero_grad()
            
            #step 5: return the loss
            return loss.item() , acc
        return perform_train_step_fn
    
    def _make_val_step_fn(self):
        # single batch operation
        def perform_val_step_fn(x,y):
            # set the model in val mode
            self.model.eval()
            
            #step 1: compute the prediction
            yhat = self.model(x)
            
            #step 2: compute the loss
            loss = self.loss_fn(yhat, y)
            # step 2': compute accuracy 
            yhat = torch.argmax(yhat,1)
            total_correct = (yhat ==y).sum().item()
            total = y.shape[0]
            acc = total_correct/total
            
            return loss.item(), acc
        return perform_val_step_fn
    
    def _mini_batch(self, validation=False):
        # one epoch operation 
        if validation:
            data_loader = self.val_loader
            step_fn = self.val_step_fn
            
        else: 
            data_loader = self.train_loader
            step_fn = self.train_step_fn
            
        if data_loader is None:
            return None
        
        mini_batch_losses = []
        mini_batch_accs = []
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)
            
            mini_batch_loss, mini_batch_acc = step_fn(x_batch,y_batch)
            
            mini_batch_losses.append(mini_batch_loss)
            mini_batch_accs.append(mini_batch_acc)
        
        loss = np.mean(mini_batch_losses)
        acc = np.mean(mini_batch_accs)
        return loss, acc
    
    def train(self, seed=42):
        self.set_seed(seed)
        
        for epoch in range(self.config.params_epochs):
            self.total_epoches +=1
            
            # perform training on mini batches within 1 epoch
            loss, acc = self._mini_batch(validation=False)
            self.losses.append(loss)
            self.accuracy.append(acc)
            # now calc validation
            with torch.no_grad():
                val_loss, val_acc = self._mini_batch(validation=True)
                self.val_losses.append(val_loss)
                self.val_accuracy.append(val_acc)
                
            print(
                f'\nEpoch: {epoch+1} \tTraining Loss: {loss:.4f} \tValidation Loss: {val_loss:.4f}'
            )
            print(
                f'\t\tTraining Accuracy: {100 * acc:.2f}%\t Validation Accuracy: {100 * val_acc:.2f}%'
            )
        self.save_checkpoint()
            
    def save_checkpoint(self):
        checkpoint = {'epoch': self.total_epoches,
                      'model_state_dict': self.model.state_dict(),
                      'optimizer_state_dict': self.optimizer.state_dict(),
                      'loss': self.losses,
                      'accuracy': self.accuracy,
                      'val_loss': self.val_losses,
                      'val_accuracy': self.val_accuracy
                      }
        torch.save(checkpoint, self.config.nlp_trained_model_path)
        
    def load_checkpoint(self):
        checkpoint = torch.load(self.config.nlp_trained_model_path)
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        self.total_epoches = checkpoint["epoch"]
        self.losses = checkpoint["loss"]
        self.accuracy = checkpoint['accuracy']
        self.val_accuracy = checkpoint['val_accuracy']
        self.val_losses = checkpoint["val_loss"]
        self.model.train() # always use train for resuming traning
    
    
    # HF’s Dataset to Tokenized TensorDataset
    def _tokenize_dataset(self, hf_dataset, sentence_field,
        label_field, tokenizer, **kwargs):
        sentences = hf_dataset[sentence_field]
        token_ids = tokenizer(
        sentences, return_tensors='pt', **kwargs
        )['input_ids']
        labels = torch.as_tensor(hf_dataset[label_field])
        dataset = TensorDataset(token_ids, labels)
        return dataset
    
    
    def predict(self, text):
        self.load_checkpoint()
        self.model.eval()
        auto_tokenizer = AutoTokenizer.from_pretrained(self.config.params_model_name)
        tokenizer_kwargs = dict(truncation=True,
                                padding=True,
                                max_length=30,
                                add_special_tokens=True)
        tokenize_text = self._tokenize_dataset(text,
                                                'sentence',
                                                'labels',
                                                auto_tokenizer,
                                                **tokenizer_kwargs)
        x_tensor = torch.as_tensor(tokenize_text).float()
        y_hat_tensor = self.model(x_tensor.to(self.device))
        
        # set it back to the train mode
        self.model.train()
        labels = {0: 'Medical Necessity', 1: 'Experimental/Investigational', 2: 'Urgent Care'}
        
        return labels[np.argmax(y_hat_tensor.detach().cpu().numpy())]
    
    def log_into_mlflow(self):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        
        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics({'train_loss': np.mean(self.losses),'val_loss': np.mean(self.val_losses), 'train_accuracy': np.mean(self.accuracy), 'val_accuracy': np.mean(self.val_accuracy)})
        
            # Model registry does not work with file store
            if tracking_url_type_store != "file":

                # Register the model
                # There are other ways to use the Model Registry, which depends on the use case,
                # please refer to the doc for more information:
                # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                mlflow.pytorch.log_model(self.model, "model", registered_model_name="nlp18Model")
            else:
                mlflow.pytorch.log_model(self.model, "model")

In [19]:
try:
    config = configurationManager()
    training_config = config.get_training_config()
    training = ModelTrainer(config=training_config)
    training.train()
    # training.log_into_mlflow()

except Exception as e:
    raise e

[2024-05-06 07:52:01,558: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-06 07:52:01,560: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-06 07:52:01,561: INFO: common: created directory at: artifacts]
[2024-05-06 07:52:01,562: INFO: common: created directory at: artifacts/training]


Map: 100%|██████████| 19225/19225 [00:00<00:00, 19475.01 examples/s]
Map: 100%|██████████| 15380/15380 [00:01<00:00, 10644.00 examples/s]
Map: 100%|██████████| 3845/3845 [00:00<00:00, 11381.82 examples/s]



Epoch: 1 	Training Loss: 0.1892 	Validation Loss: 0.0086
		Training Accuracy: 282.63%	 Validation Accuracy: 564.24%

Epoch: 2 	Training Loss: -0.1376 	Validation Loss: -0.3190
		Training Accuracy: 282.63%	 Validation Accuracy: 564.24%
