In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Exoplanet Chatbot\\research'

In [3]:
# Step up to the project root only while the working directory is still the
# research/ folder. An unconditional relative chdir climbs one more level every
# time this cell is re-run, which breaks the relative paths used further down.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'e:\\Projects for portfolio\\Exoplanet Chatbot'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluatorConfig:
    """Immutable set of filesystem locations used by the model-evaluation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. The class performs no validation or
    conversion of the values it is given.
    """

    # Working directory of the evaluation stage.
    root_dir: Path
    # CSV file the evaluation rows are sampled from.
    data_path: Path
    # Location handed to the model loader.
    model_path: Path
    # Location handed to the tokenizer loader.
    tokenizer_path: Path
    # Destination CSV for the computed metric table.
    metric_file_name: Path
    # Destination CSV for the sampled evaluation rows.
    evaluation_data_path: Path

In [6]:
# Configuration manager
from exoplanet_chatbot.constants import *
from exoplanet_chatbot.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the configuration YAML.
            params_filepath: path of the parameters YAML.
        """
        # Keep the parsed files on the instance; their keys are read with
        # attribute syntax below (e.g. ``self.config.artifacts_root``).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluator_config(self) -> ModelEvaluatorConfig:
        """Build the evaluation-stage config from the ``model_evaluation`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ModelEvaluatorConfig whose fields are copied one-to-one from the
            same-named keys of the ``model_evaluation`` section.
        """
        section = self.config.model_evaluation

        create_directories([section.root_dir])

        # Each dataclass field is populated from the identically named key.
        field_names = (
            "root_dir",
            "data_path",
            "model_path",
            "tokenizer_path",
            "metric_file_name",
            "evaluation_data_path",
        )
        settings = {name: getattr(section, name) for name in field_names}
        return ModelEvaluatorConfig(**settings)

In [14]:
# Model Evaluation

import pandas as pd
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, load_metric


class ModelEvaluator:
    """Scores a causal language model on sampled prompt/response pairs.

    The evaluator samples rows from a CSV, generates a response for every
    prompt, and writes ROUGE and BLEU scores to a CSV file. All paths come
    from the config object passed to the constructor.
    """

    # ROUGE result keys that are reported, paired with the label written to the metrics CSV.
    _ROUGE_VARIANTS = (
        ("rouge1", "ROUGE-1"),
        ("rouge2", "ROUGE-2"),
        ("rougeL", "ROUGE-L"),
        ("rougeLsum", "ROUGE-Lsum"),
    )

    def __init__(self, config: "ModelEvaluatorConfig"):
        """Store the evaluation config; nothing is loaded until ``evaluate`` runs."""
        self.config = config

    def sampler(self, data_path, n=1, random_state=42):
        """Draw a reproducible sample of evaluation rows from a CSV file.

        Args:
            data_path: CSV file to read.
            n: number of rows to draw (default 1, the previous fixed value).
            random_state: seed passed to ``DataFrame.sample`` (default 42).

        Returns:
            A new DataFrame holding the sampled rows, with the ``output``
            column renamed to ``expected_response``.
        """
        finetune_dataset = pd.read_csv(data_path)
        evaluation_sample = finetune_dataset.sample(n=n, random_state=random_state)

        # rename() returns a new frame, so nothing is mutated in place.
        return evaluation_sample.rename(columns={'output': 'expected_response'})

    def generate_response(self, prompt, model, tokenizer, max_length=350):
        """Generate text for a single prompt and return it decoded.

        Args:
            prompt: text to feed to the model.
            model: object exposing ``generate``; inputs are moved to its
                ``device`` attribute when present, otherwise to CPU.
            tokenizer: callable tokenizer that also exposes ``decode``.
            max_length: ``max_length`` argument forwarded to ``generate``.

        Returns:
            The first generated sequence decoded with special tokens removed.
        """
        # Follow the model's device instead of assuming CPU, so the same code
        # works wherever the caller placed the model.
        target_device = getattr(model, "device", "cpu")
        inputs = tokenizer(prompt, return_tensors="pt").to(target_device)

        generate_kwargs = {"max_length": max_length}
        # Forward the attention mask when the tokenizer produced one: evaluate()
        # aliases the pad token to EOS, so the model cannot infer it reliably.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask

        with torch.no_grad():
            outputs = model.generate(inputs["input_ids"], **generate_kwargs)

        # NOTE(review): the whole returned sequence is decoded. If the model
        # echoes the prompt in its output, the prompt text is part of the
        # result and lowers ROUGE precision - confirm whether that is intended.
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    @classmethod
    def _build_metric_table(cls, rouge_result, bleu_score):
        """Arrange the ROUGE 'mid' scores and the BLEU score into one DataFrame."""
        mid_scores = [rouge_result[key].mid for key, _ in cls._ROUGE_VARIANTS]
        labels = [label for _, label in cls._ROUGE_VARIANTS]

        # BLEU has no precision/recall/F1 breakdown here and ROUGE has no single
        # score, so the non-applicable cells are left empty (None).
        results = {
            "Metric": labels + ["BLEU"],
            "Precision": [score.precision for score in mid_scores] + [None],
            "Recall": [score.recall for score in mid_scores] + [None],
            "F1": [score.fmeasure for score in mid_scores] + [None],
            "Score": [None] * len(mid_scores) + [bleu_score],
        }
        return pd.DataFrame(results)

    def evaluate(self):
        """Run the full evaluation and persist its outputs.

        Side effects:
            * writes the sampled rows to ``config.evaluation_data_path``;
            * prints the raw ROUGE and BLEU results;
            * writes the metric table to ``config.metric_file_name``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Sample the evaluation rows and save them before any column is dropped.
        evaluation_sample = self.sampler(self.config.data_path)
        evaluation_sample.to_csv(self.config.evaluation_data_path)

        # Copy the column subset so the assignment below writes to an
        # independent frame rather than a slice of the sampled one.
        evaluation_sample = evaluation_sample[['prompt', 'expected_response']].copy()

        # Load the model and tokenizer, then place the model on the chosen device.
        model = AutoModelForCausalLM.from_pretrained(self.config.model_path, torch_dtype=torch.float16)
        model.to(device)

        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        # Generate one response per prompt.
        evaluation_sample['generated_response'] = evaluation_sample['prompt'].apply(
            lambda prompt: self.generate_response(prompt, model, tokenizer)
        )

        # Drop the model reference first; otherwise collecting garbage and
        # emptying the CUDA cache cannot release its memory.
        del model
        gc.collect()
        torch.cuda.empty_cache()

        # Convert the evaluation sample to a dataset.
        evaluation_dataset = Dataset.from_pandas(evaluation_sample)

        # Load evaluation metrics.
        rouge_metric = load_metric("rouge")
        bleu_metric = load_metric("bleu")

        # Extract references and predictions.
        references = evaluation_dataset["expected_response"]
        predictions_texts = evaluation_dataset["generated_response"]

        # ROUGE is computed on the raw strings.
        rouge_result = rouge_metric.compute(predictions=predictions_texts, references=references)
        print("ROUGE Score:", rouge_result)

        # BLEU is computed on whitespace tokens, with one reference per prediction.
        bleu_result = bleu_metric.compute(
            predictions=[pred.split() for pred in predictions_texts],
            references=[[ref.split()] for ref in references],
        )
        print("BLEU Score:", bleu_result)

        metric_results = self._build_metric_table(rouge_result, bleu_result['bleu'])
        metric_results.to_csv(self.config.metric_file_name, index=False)

In [15]:
#Pipeline
# Run the evaluation stage end to end. No try/except wrapper is used: a handler
# that only re-raises adds nothing except an extra traceback frame, so any
# failure is left to surface directly in the cell output.
config = ConfigurationManager()
model_evaluator_config = config.get_model_evaluator_config() # Paths for the evaluation stage
model_training = ModelEvaluator(config=model_evaluator_config) # Despite the variable name, this is the evaluator
model_training.evaluate()

[2024-06-17 20:28:21,706: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-17 20:28:21,709: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-17 20:28:21,711: INFO: common: created directory at: artifacts]
[2024-06-17 20:28:21,711: INFO: common: created directory at: artifacts/model_evaluation]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


[2024-06-17 20:40:46,290: INFO: rouge_scorer: Using default tokenizer.]
ROUGE Score: {'rouge1': AggregateScore(low=Score(precision=0.046511627906976744, recall=1.0, fmeasure=0.08888888888888888), mid=Score(precision=0.046511627906976744, recall=1.0, fmeasure=0.08888888888888888), high=Score(precision=0.046511627906976744, recall=1.0, fmeasure=0.08888888888888888)), 'rouge2': AggregateScore(low=Score(precision=0.04205607476635514, recall=1.0, fmeasure=0.08071748878923767), mid=Score(precision=0.04205607476635514, recall=1.0, fmeasure=0.08071748878923767), high=Score(precision=0.04205607476635514, recall=1.0, fmeasure=0.08071748878923767)), 'rougeL': AggregateScore(low=Score(precision=0.046511627906976744, recall=1.0, fmeasure=0.08888888888888888), mid=Score(precision=0.046511627906976744, recall=1.0, fmeasure=0.08888888888888888), high=Score(precision=0.046511627906976744, recall=1.0, fmeasure=0.08888888888888888)), 'rougeLsum': AggregateScore(low=Score(precision=0.046511627906976744, r