In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp drive/MyDrive/HW4_Train/train.json .

In [14]:
import json

# فایل اصلی که شامل داده‌های اولیه است
input_file = 'train.json'
# فایل خروجی که داده‌های تبدیل شده را ذخیره می‌کند
output_file = 'evaluation_data.json'

def convert_data(input_file, output_file, start_line=10000, end_line=10050):
    data = []

    # خواندن داده‌ها از فایل اصلی
    with open(input_file, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i < start_line:
                continue
            if i > end_line:
                break
            try:
                item = json.loads(line.strip())
                # استخراج فیلدهای مورد نظر
                new_item = {
                    "question": item["question"],
                    "exp": item["exp"]
                }
                data.append(new_item)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON at line {i}: {e}")

    # ذخیره داده‌های تبدیل شده در فایل جدید به فرمت JSON
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

# اجرای تابع تبدیل داده‌ها
convert_data(input_file, output_file)
print(f"Data converted and saved to {output_file}")
output_file = 'evaluation_data_train.json'
convert_data(input_file, output_file,start_line=100, end_line=150)
print(f"Data converted and saved to {output_file}")

Data converted and saved to evaluation_data.json
Data converted and saved to evaluation_data_train.json


In [6]:
%pip install transformers
%pip install datasets
!pip install transformers datasets sacrebleu rouge-score

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rou

In [9]:
import json
import torch
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess fine-tuning data
def load_finetune_data(json_file,limit = 2000):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file][:limit]
    return data

def preprocess_finetune_data(data):
    inputs = [f"پرسش: {item['question']}" for item in data]
    targets = [item['exp'] for item in data]
    return inputs, targets

# Step 2: Tokenize inputs and targets
def tokenize_data(tokenizer, inputs, targets, max_length=512):
    encodings = tokenizer(list(map(str, inputs)), truncation=True, padding=True, max_length=max_length)
    labels = tokenizer(list(map(str, targets)), truncation=True, padding=True, max_length=max_length).input_ids

    # Replace padding token id's in the labels by -100 to ignore padding in the loss
    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels]

    encodings['labels'] = labels
    return encodings

# Step 3: Fine-tune the T5 model
def finetune_model(model, tokenizer, train_dataset, eval_dataset, output_dir="./results"):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=6,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=1000,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()

# Execute steps for fine-tuning
json_file = 'train.json'
finetune_data = load_finetune_data(json_file,limit=3000)
inputs, targets = preprocess_finetune_data(finetune_data)

# Split data into training and evaluation sets
train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(inputs, targets, test_size=0.1)

# Load T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize inputs and targets
train_encodings = tokenize_data(tokenizer, train_inputs, train_targets)
eval_encodings = tokenize_data(tokenizer, eval_inputs, eval_targets)

# Create datasets using the `datasets` library
train_dataset = Dataset.from_dict(train_encodings)
eval_dataset = Dataset.from_dict(eval_encodings)

# Fine-tune the model
finetune_model(model, tokenizer, train_dataset, eval_dataset)

# Save the fine-tuned model
model.save_pretrained("./finetuned_model_6e")
tokenizer.save_pretrained("./finetuned_model_6e")
model.save_pretrained("./drive/MyDrive/finetuned_model_6e")
tokenizer.save_pretrained("./drive/MyDrive/finetuned_model_6e")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss
1,4.7014,4.509769
2,4.7049,4.277831
3,4.3883,4.167476
4,4.4243,4.105363
5,4.3946,4.077777
6,4.09,4.070305


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./drive/MyDrive/finetuned_model_6e/tokenizer_config.json',
 './drive/MyDrive/finetuned_model_6e/special_tokens_map.json',
 './drive/MyDrive/finetuned_model_6e/spiece.model',
 './drive/MyDrive/finetuned_model_6e/added_tokens.json')

In [11]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
# rouge = load_metric("rouge")
# meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_6e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
# base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
# base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
# finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
# finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
# print("Base Model ROUGE Score:", base_rouge_score)
# print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
# print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
# print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Skipping item 0 with empty texts: {'question': 'Retinoscopy in 5 year old is best done with:', 'exp': None}
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Skipping item 6 with empty texts: {'question': 'True about streptococcus:', 'exp': None}


Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

In [12]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_6e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
print("Base Model ROUGE Score:", base_rouge_score)
print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)

# Calculate precision, recall, and accuracy
def calculate_precision_recall_accuracy(predictions, references):
    precision = np.mean([1 if pred in ref else 0 for pred, ref in zip(predictions, references)])
    recall = np.mean([1 if ref in pred else 0 for pred, ref in zip(predictions, references)])
    accuracy = np.mean([1 if pred == ref else 0 for pred, ref in zip(predictions, references)])
    return precision, recall, accuracy

# Base model precision, recall, accuracy
base_precision, base_recall, base_accuracy = calculate_precision_recall_accuracy(base_predictions, base_references)
print("Base Model Precision:", base_precision)
print("Base Model Recall:", base_recall)
print("Base Model Accuracy:", base_accuracy)

# Fine-tuned model precision, recall, accuracy
finetuned_precision, finetuned_recall, finetuned_accuracy = calculate_precision_recall_accuracy(finetuned_predictions, finetuned_references)
print("Fine-tuned Model Precision:", finetuned_precision)
print("Fine-tuned Model Recall:", finetuned_recall)
print("Fine-tuned Model Accuracy:", finetuned_accuracy)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

The repository for meteor contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/meteor.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Skipping item 0 with empty texts: {'question': 'Retinoscopy in 5 year old is best done with:', 'exp': None}
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Skipping item 6 with empty texts: {'question': 'True about streptococcus:', 'exp': None}


Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

In [21]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data_train.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_6e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
print("Base Model ROUGE Score:", base_rouge_score)
print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)

# Calculate precision, recall, and accuracy
def calculate_precision_recall_accuracy(predictions, references):
    precision = np.mean([1 if pred in ref else 0 for pred, ref in zip(predictions, references)])
    recall = np.mean([1 if ref in pred else 0 for pred, ref in zip(predictions, references)])
    accuracy = np.mean([1 if pred == ref else 0 for pred, ref in zip(predictions, references)])
    return precision, recall, accuracy

# Base model precision, recall, accuracy
base_precision, base_recall, base_accuracy = calculate_precision_recall_accuracy(base_predictions, base_references)
print("Base Model Precision:", base_precision)
print("Base Model Recall:", base_recall)
print("Base Model Accuracy:", base_accuracy)

# Fine-tuned model precision, recall, accuracy
finetuned_precision, finetuned_recall, finetuned_accuracy = calculate_precision_recall_accuracy(finetuned_predictions, finetuned_references)
print("Fine-tuned Model Precision:", finetuned_precision)
print("Fine-tuned Model Recall:", finetuned_recall)
print("Fine-tuned Model Accuracy:", finetuned_accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Processed item 0
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Processed item 6
Processed item 7
Processed item 8
Processed item 9
Processed item 10
Skipping item 11 with empty texts: {'question': 'Relining of complete denture is not indicated when', 'exp': None}
Processed item 12
Processed item 13
Processed item 14
Processed item 15
Processed item 16
Processed item 17
Processed item 18
Processed item 19


Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Processed item 20
Skipping item 21 with empty texts: {'question': 'Secondary retention for a removable partial denture is provided by', 'exp': None}
Processed item 22
Processed item 23
Skipping item 24 with empty texts: {'question': 'Egg shell calcification is seen in all except –', 'exp': None}
Processed item 25
Processed item 26
Processed item 27
Processed item 28
Processed item 29
Processed item 30
Skipping item 31 with empty texts: {'question': 'A labourer involved with repair-work of sewers was admitted with fever, jaundice and renal failure. The most appropriate test to diagnose the infection in this patient is -', 'exp': None}
Processed item 32
Processed item 33
Processed item 34
Processed item 35
Skipping item 36 with empty texts: {'question': 'A 40-year old diabetic patient presents with proptosis of one eye and black eschar over palate. The likely organism is :', 'exp': None}
Processed item 37
Skipping item 38 with empty texts: {'question': 'Investigations in a clinically sus

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Processed item 20
Skipping item 21 with empty texts: {'question': 'Secondary retention for a removable partial denture is provided by', 'exp': None}
Processed item 22
Processed item 23
Skipping item 24 with empty texts: {'question': 'Egg shell calcification is seen in all except –', 'exp': None}
Processed item 25
Processed item 26
Processed item 27
Processed item 28
Processed item 29
Processed item 30
Skipping item 31 with empty texts: {'question': 'A labourer involved with repair-work of sewers was admitted with fever, jaundice and renal failure. The most appropriate test to diagnose the infection in this patient is -', 'exp': None}
Processed item 32
Processed item 33
Processed item 34
Processed item 35
Skipping item 36 with empty texts: {'question': 'A 40-year old diabetic patient presents with proptosis of one eye and black eschar over palate. The likely organism is :', 'exp': None}
Processed item 37
Skipping item 38 with empty texts: {'question': 'Investigations in a clinically sus

In [13]:
import json
import torch
from datasets import Dataset, load_metric
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess fine-tuning data
def load_finetune_data(json_file, limit=2000):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file][:limit]
    return data

def preprocess_finetune_data(data):
    inputs = [f"پرسش: {item['question']}" for item in data]
    targets = [item['exp'] for item in data]
    return inputs, targets

# Step 2: Tokenize inputs and targets
def tokenize_data(tokenizer, inputs, targets, max_length=512):
    encodings = tokenizer(list(map(str, inputs)), truncation=True, padding=True, max_length=max_length)
    labels = tokenizer(list(map(str, targets)), truncation=True, padding=True, max_length=max_length).input_ids

    # Replace padding token id's in the labels by -100 to ignore padding in the loss
    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels]

    encodings['labels'] = labels
    return encodings

# Step 3: Fine-tune the T5 model
def finetune_model(model, tokenizer, train_dataset, eval_dataset, output_dir="./results"):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,  # Increase the number of epochs
        per_device_train_batch_size=16,  # Increase batch size if GPU memory allows
        per_device_eval_batch_size=16,
        warmup_steps=500,  # Reduced warmup steps
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",  # Track loss for best model
        greater_is_better=False,
        fp16=True,  # Enable mixed precision training for faster performance on GPUs
        learning_rate=3e-4,  # Increase learning rate slightly
        lr_scheduler_type="cosine",  # Use a cosine scheduler for learning rate
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()

# Execute steps for fine-tuning
json_file = 'train.json'
finetune_data = load_finetune_data(json_file, limit=5000)  # Increase data limit for fine-tuning
inputs, targets = preprocess_finetune_data(finetune_data)

# Split data into training and evaluation sets
train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(inputs, targets, test_size=0.1, random_state=42)

# Load T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize inputs and targets
train_encodings = tokenize_data(tokenizer, train_inputs, train_targets)
eval_encodings = tokenize_data(tokenizer, eval_inputs, eval_targets)

# Create datasets using the datasets library
train_dataset = Dataset.from_dict(train_encodings)
eval_dataset = Dataset.from_dict(eval_encodings)

# Fine-tune the model
finetune_model(model, tokenizer, train_dataset, eval_dataset)

# Save the fine-tuned model
model.save_pretrained("./finetuned_model_10e")
tokenizer.save_pretrained("./finetuned_model_10e")
model.save_pretrained("./drive/MyDrive/finetuned_model_10e")
tokenizer.save_pretrained("./drive/MyDrive/finetuned_model_10e")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss
1,4.4162,4.113675
2,4.2219,3.916138
3,4.0226,3.819242
4,3.8969,3.763644
5,3.8296,3.71825
6,3.8208,3.691356
7,3.7093,3.678784
8,3.6776,3.669274
9,3.6808,3.668258
10,3.6965,3.668129


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./drive/MyDrive/finetuned_model_10e/tokenizer_config.json',
 './drive/MyDrive/finetuned_model_10e/special_tokens_map.json',
 './drive/MyDrive/finetuned_model_10e/spiece.model',
 './drive/MyDrive/finetuned_model_10e/added_tokens.json')

In [15]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_10e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
print("Base Model ROUGE Score:", base_rouge_score)
print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)

# Calculate precision, recall, and accuracy
def calculate_precision_recall_accuracy(predictions, references):
    precision = np.mean([1 if pred in ref else 0 for pred, ref in zip(predictions, references)])
    recall = np.mean([1 if ref in pred else 0 for pred, ref in zip(predictions, references)])
    accuracy = np.mean([1 if pred == ref else 0 for pred, ref in zip(predictions, references)])
    return precision, recall, accuracy

# Base model precision, recall, accuracy
base_precision, base_recall, base_accuracy = calculate_precision_recall_accuracy(base_predictions, base_references)
print("Base Model Precision:", base_precision)
print("Base Model Recall:", base_recall)
print("Base Model Accuracy:", base_accuracy)

# Fine-tuned model precision, recall, accuracy
finetuned_precision, finetuned_recall, finetuned_accuracy = calculate_precision_recall_accuracy(finetuned_predictions, finetuned_references)
print("Fine-tuned Model Precision:", finetuned_precision)
print("Fine-tuned Model Recall:", finetuned_recall)
print("Fine-tuned Model Accuracy:", finetuned_accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Skipping item 0 with empty texts: {'question': 'Retinoscopy in 5 year old is best done with:', 'exp': None}
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Skipping item 6 with empty texts: {'question': 'True about streptococcus:', 'exp': None}


Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

In [16]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data_train.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_10e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
print("Base Model ROUGE Score:", base_rouge_score)
print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)

# Calculate precision, recall, and accuracy
def calculate_precision_recall_accuracy(predictions, references):
    precision = np.mean([1 if pred in ref else 0 for pred, ref in zip(predictions, references)])
    recall = np.mean([1 if ref in pred else 0 for pred, ref in zip(predictions, references)])
    accuracy = np.mean([1 if pred == ref else 0 for pred, ref in zip(predictions, references)])
    return precision, recall, accuracy

# Base model precision, recall, accuracy
base_precision, base_recall, base_accuracy = calculate_precision_recall_accuracy(base_predictions, base_references)
print("Base Model Precision:", base_precision)
print("Base Model Recall:", base_recall)
print("Base Model Accuracy:", base_accuracy)

# Fine-tuned model precision, recall, accuracy
finetuned_precision, finetuned_recall, finetuned_accuracy = calculate_precision_recall_accuracy(finetuned_predictions, finetuned_references)
print("Fine-tuned Model Precision:", finetuned_precision)
print("Fine-tuned Model Recall:", finetuned_recall)
print("Fine-tuned Model Accuracy:", finetuned_accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Processed item 0
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Processed item 6
Processed item 7
Processed item 8
Processed item 9
Processed item 10
Skipping item 11 with empty texts: {'question': 'Relining of complete denture is not indicated when', 'exp': None}
Processed item 12
Processed item 13
Processed item 14
Processed item 15
Processed item 16
Processed item 17
Processed item 18
Processed item 19


Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Processed item 20
Skipping item 21 with empty texts: {'question': 'Secondary retention for a removable partial denture is provided by', 'exp': None}
Processed item 22
Processed item 23
Skipping item 24 with empty texts: {'question': 'Egg shell calcification is seen in all except –', 'exp': None}
Processed item 25
Processed item 26
Processed item 27
Processed item 28
Processed item 29
Processed item 30
Skipping item 31 with empty texts: {'question': 'A labourer involved with repair-work of sewers was admitted with fever, jaundice and renal failure. The most appropriate test to diagnose the infection in this patient is -', 'exp': None}
Processed item 32
Processed item 33
Processed item 34
Processed item 35
Skipping item 36 with empty texts: {'question': 'A 40-year old diabetic patient presents with proptosis of one eye and black eschar over palate. The likely organism is :', 'exp': None}
Processed item 37
Skipping item 38 with empty texts: {'question': 'Investigations in a clinically sus

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Processed item 20
Skipping item 21 with empty texts: {'question': 'Secondary retention for a removable partial denture is provided by', 'exp': None}
Processed item 22
Processed item 23
Skipping item 24 with empty texts: {'question': 'Egg shell calcification is seen in all except –', 'exp': None}
Processed item 25
Processed item 26
Processed item 27
Processed item 28
Processed item 29
Processed item 30
Skipping item 31 with empty texts: {'question': 'A labourer involved with repair-work of sewers was admitted with fever, jaundice and renal failure. The most appropriate test to diagnose the infection in this patient is -', 'exp': None}
Processed item 32
Processed item 33
Processed item 34
Processed item 35
Skipping item 36 with empty texts: {'question': 'A 40-year old diabetic patient presents with proptosis of one eye and black eschar over palate. The likely organism is :', 'exp': None}
Processed item 37
Skipping item 38 with empty texts: {'question': 'Investigations in a clinically sus

In [17]:
import json
import torch
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess fine-tuning data
def load_finetune_data(json_file, limit=5000):  # Increase data limit
    with open(json_file, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file][:limit]
    return data

def preprocess_finetune_data(data):
    inputs = [f"پرسش: {item['question']}" for item in data]
    targets = [item['exp'] for item in data]
    return inputs, targets

# Step 2: Tokenize inputs and targets
def tokenize_data(tokenizer, inputs, targets, max_length=512):
    encodings = tokenizer(list(map(str, inputs)), truncation=True, padding=True, max_length=max_length)
    labels = tokenizer(list(map(str, targets)), truncation=True, padding=True, max_length=max_length).input_ids

    # Replace padding token id's in the labels by -100 to ignore padding in the loss
    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels]

    encodings['labels'] = labels
    return encodings

# Step 3: Fine-tune the T5 model
def finetune_model(model, tokenizer, train_dataset, eval_dataset, output_dir="./results"):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,  # Increase the number of epochs
        per_device_train_batch_size=8,  # Keep batch size manageable
        per_device_eval_batch_size=8,
        warmup_steps=200,  # Increase warmup steps
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",  # Track validation loss for best model
        greater_is_better=False,
        fp16=True,  # Enable mixed precision training
        learning_rate=5e-5,  # Experiment with lower learning rates
        lr_scheduler_type="cosine",  # Use a cosine scheduler
        save_total_limit=3,  # Save only the last 3 checkpoints
        seed=42  # Ensure reproducibility
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()

# Execute steps for fine-tuning
json_file = 'train.json'
finetune_data = load_finetune_data(json_file, limit=5000)  # Increased data limit
inputs, targets = preprocess_finetune_data(finetune_data)

# Split data into training and evaluation sets
train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(inputs, targets, test_size=0.1, random_state=42)

# Load T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize inputs and targets
train_encodings = tokenize_data(tokenizer, train_inputs, train_targets)
eval_encodings = tokenize_data(tokenizer, eval_inputs, eval_targets)

# Create datasets using the datasets library
train_dataset = Dataset.from_dict(train_encodings)
eval_dataset = Dataset.from_dict(eval_encodings)

# Fine-tune the model
finetune_model(model, tokenizer, train_dataset, eval_dataset)

# Save the fine-tuned model
model.save_pretrained("./finetuned_model_11e")
tokenizer.save_pretrained("./finetuned_model_11e")
model.save_pretrained("./drive/MyDrive/finetuned_model_11e")
tokenizer.save_pretrained("./drive/MyDrive/finetuned_model_11e")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss
1,4.3938,4.140195
2,4.3024,4.02404
3,4.1603,3.965619
4,4.2927,3.933632
5,4.0343,3.904641
6,4.2534,3.886363
7,4.0845,3.879431
8,3.979,3.873139
9,4.0723,3.871222
10,4.1504,3.870848


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./drive/MyDrive/finetuned_model_11e/tokenizer_config.json',
 './drive/MyDrive/finetuned_model_11e/special_tokens_map.json',
 './drive/MyDrive/finetuned_model_11e/spiece.model',
 './drive/MyDrive/finetuned_model_11e/added_tokens.json')

In [19]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data_train.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_11e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
print("Base Model ROUGE Score:", base_rouge_score)
print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)

# Calculate precision, recall, and accuracy
def calculate_precision_recall_accuracy(predictions, references):
    precision = np.mean([1 if pred in ref else 0 for pred, ref in zip(predictions, references)])
    recall = np.mean([1 if ref in pred else 0 for pred, ref in zip(predictions, references)])
    accuracy = np.mean([1 if pred == ref else 0 for pred, ref in zip(predictions, references)])
    return precision, recall, accuracy

# Base model precision, recall, accuracy
base_precision, base_recall, base_accuracy = calculate_precision_recall_accuracy(base_predictions, base_references)
print("Base Model Precision:", base_precision)
print("Base Model Recall:", base_recall)
print("Base Model Accuracy:", base_accuracy)

# Fine-tuned model precision, recall, accuracy
finetuned_precision, finetuned_recall, finetuned_accuracy = calculate_precision_recall_accuracy(finetuned_predictions, finetuned_references)
print("Fine-tuned Model Precision:", finetuned_precision)
print("Fine-tuned Model Recall:", finetuned_recall)
print("Fine-tuned Model Accuracy:", finetuned_accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Processed item 0
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Processed item 6
Processed item 7
Processed item 8
Processed item 9
Processed item 10
Skipping item 11 with empty texts: {'question': 'Relining of complete denture is not indicated when', 'exp': None}
Processed item 12
Processed item 13
Processed item 14
Processed item 15
Processed item 16
Processed item 17
Processed item 18
Processed item 19


Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Processed item 20
Skipping item 21 with empty texts: {'question': 'Secondary retention for a removable partial denture is provided by', 'exp': None}
Processed item 22
Processed item 23
Skipping item 24 with empty texts: {'question': 'Egg shell calcification is seen in all except –', 'exp': None}
Processed item 25
Processed item 26
Processed item 27
Processed item 28
Processed item 29
Processed item 30
Skipping item 31 with empty texts: {'question': 'A labourer involved with repair-work of sewers was admitted with fever, jaundice and renal failure. The most appropriate test to diagnose the infection in this patient is -', 'exp': None}
Processed item 32
Processed item 33
Processed item 34
Processed item 35
Skipping item 36 with empty texts: {'question': 'A 40-year old diabetic patient presents with proptosis of one eye and black eschar over palate. The likely organism is :', 'exp': None}
Processed item 37
Skipping item 38 with empty texts: {'question': 'Investigations in a clinically sus

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


Processed item 20
Skipping item 21 with empty texts: {'question': 'Secondary retention for a removable partial denture is provided by', 'exp': None}
Processed item 22
Processed item 23
Skipping item 24 with empty texts: {'question': 'Egg shell calcification is seen in all except –', 'exp': None}
Processed item 25
Processed item 26
Processed item 27
Processed item 28
Processed item 29
Processed item 30
Skipping item 31 with empty texts: {'question': 'A labourer involved with repair-work of sewers was admitted with fever, jaundice and renal failure. The most appropriate test to diagnose the infection in this patient is -', 'exp': None}
Processed item 32
Processed item 33
Processed item 34
Processed item 35
Skipping item 36 with empty texts: {'question': 'A 40-year old diabetic patient presents with proptosis of one eye and black eschar over palate. The likely organism is :', 'exp': None}
Processed item 37
Skipping item 38 with empty texts: {'question': 'Investigations in a clinically sus

In [20]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric

# Step 1: Load data from JSON file
def load_data(json_file, limit=None):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if limit:
        data = data[:limit]
    return data

# Step 2: Preprocess texts (filter out None values and combine fields)
def preprocess_texts(data):
    filtered_data = [item for item in data if item['exp'] is not None and item['question'] is not None]
    texts = [f"{item['question']} {item['exp']}" for item in filtered_data]
    return filtered_data, texts

# Step 3: Compute TF-IDF for texts and user query
def compute_tfidf(texts, query):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + [query])
    return tfidf_matrix

# Step 4: Compute Cosine Similarity
def find_similar_texts(tfidf_matrix):
    query_vector = tfidf_matrix[-1]
    similarities = cosine_similarity(query_vector, tfidf_matrix[:-1])
    return similarities.flatten()

# Step 5: Display related texts
def get_related_texts(filtered_data, similarities, top_n=3):
    sorted_indices = np.argsort(similarities)[-top_n:][::-1]
    related_texts = [filtered_data[idx]['exp'] for idx in sorted_indices]
    return related_texts

# Step 6: Combine question and related texts to create input for the model
def create_input(query, related_texts):
    input_text = f"پرسش: {query} زمینه: {' '.join(related_texts)}"
    return input_text

# Step 7: Generate answer using the model
def generate_answer(model, tokenizer, input_text, max_length=150, num_beams=5, early_stopping=True, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0, no_repeat_ngram_size=2, length_penalty=2.0, do_sample=True):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        do_sample=do_sample
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Load evaluation data (limited to 50 items)
eval_file = 'evaluation_data.json'
eval_data = load_data(eval_file, limit=50)

# Initialize BLEU, ROUGE, and METEOR metrics
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

# Generate answers with the base model
base_model_path = "t5-small"
base_model = T5ForConditionalGeneration.from_pretrained(base_model_path)
base_tokenizer = T5Tokenizer.from_pretrained(base_model_path)

# Generate answers with the fine-tuned model
finetuned_model_path = "./finetuned_model_11e/"
finetuned_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path, local_files_only=True)
finetuned_tokenizer = T5Tokenizer.from_pretrained(finetuned_model_path, local_files_only=True)

# Function to evaluate model
def evaluate_model(model, tokenizer, eval_data):
    references = []
    predictions = []
    for idx, item in enumerate(eval_data):
        query = item['question']
        reference = item['exp']
        _, texts = preprocess_texts([item])
        if not texts:
            print(f"Skipping item {idx} with empty texts: {item}")
            continue
        tfidf_matrix = compute_tfidf(texts, query)
        if tfidf_matrix.shape[0] == 0:
            print(f"Empty TF-IDF matrix for query {idx}: {query}")
            continue
        similarities = find_similar_texts(tfidf_matrix)
        if len(similarities) == 0:
            print(f"No similarities found for query {idx}: {query}")
            continue
        related_texts = get_related_texts([item], similarities)
        input_text = create_input(query, related_texts)
        prediction = generate_answer(model, tokenizer, input_text)
        references.append(reference)
        predictions.append(prediction)
        print(f"Processed item {idx}")
    return references, predictions

# Evaluate base model
print("Evaluating base model...")
base_references, base_predictions = evaluate_model(base_model, base_tokenizer, eval_data)

# Evaluate fine-tuned model
print("Evaluating fine-tuned model...")
finetuned_references, finetuned_predictions = evaluate_model(finetuned_model, finetuned_tokenizer, eval_data)

# Calculate BLEU, ROUGE, and METEOR scores
base_bleu_score = bleu.compute(predictions=base_predictions, references=[[ref] for ref in base_references])
base_rouge_score = rouge.compute(predictions=base_predictions, references=base_references)
base_meteor_score = meteor.compute(predictions=base_predictions, references=base_references)

finetuned_bleu_score = bleu.compute(predictions=finetuned_predictions, references=[[ref] for ref in finetuned_references])
finetuned_rouge_score = rouge.compute(predictions=finetuned_predictions, references=finetuned_references)
finetuned_meteor_score = meteor.compute(predictions=finetuned_predictions, references=finetuned_references)

print("Base Model BLEU Score:", base_bleu_score)
print("Base Model ROUGE Score:", base_rouge_score)
print("Base Model METEOR Score:", base_meteor_score)

print("Fine-tuned Model BLEU Score:", finetuned_bleu_score)
print("Fine-tuned Model ROUGE Score:", finetuned_rouge_score)
print("Fine-tuned Model METEOR Score:", finetuned_meteor_score)

# Calculate precision, recall, and accuracy
def calculate_precision_recall_accuracy(predictions, references):
    precision = np.mean([1 if pred in ref else 0 for pred, ref in zip(predictions, references)])
    recall = np.mean([1 if ref in pred else 0 for pred, ref in zip(predictions, references)])
    accuracy = np.mean([1 if pred == ref else 0 for pred, ref in zip(predictions, references)])
    return precision, recall, accuracy

# Base model precision, recall, accuracy
base_precision, base_recall, base_accuracy = calculate_precision_recall_accuracy(base_predictions, base_references)
print("Base Model Precision:", base_precision)
print("Base Model Recall:", base_recall)
print("Base Model Accuracy:", base_accuracy)

# Fine-tuned model precision, recall, accuracy
finetuned_precision, finetuned_recall, finetuned_accuracy = calculate_precision_recall_accuracy(finetuned_predictions, finetuned_references)
print("Fine-tuned Model Precision:", finetuned_precision)
print("Fine-tuned Model Recall:", finetuned_recall)
print("Fine-tuned Model Accuracy:", finetuned_accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Evaluating base model...
Skipping item 0 with empty texts: {'question': 'Retinoscopy in 5 year old is best done with:', 'exp': None}
Processed item 1
Processed item 2
Processed item 3
Processed item 4
Processed item 5
Skipping item 6 with empty texts: {'question': 'True about streptococcus:', 'exp': None}


Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Processed item 7
Processed item 8
Processed item 9
Processed item 10
Processed item 11
Skipping item 12 with empty texts: {'question': 'Patient diagnosed to have malaria, smear shows all stages of schizonts 14-20 merozoites, yellowish - brown pigment. The type of malaria is -', 'exp': None}
Processed item 13
Processed item 14
Processed item 15
Skipping item 16 with empty texts: {'question': 'Crude birth rate denominator is -', 'exp': None}
Processed item 17
Processed item 18
Processed item 19
Skipping item 20 with empty texts: {'question': 'Intrauterine exposure of diethylstilboestrol is associated with -', 'exp': None}
Processed item 21
Processed item 22
Processed item 23
Processed item 24
Processed item 25
Processed item 26
Skipping item 27 with empty texts: {'question': 'Which of the following Antiepileptic acts by opening Potassium channel?', 'exp': None}
Processed item 28
Processed item 29
Processed item 30
Processed item 31
Processed item 32
Processed item 33
Processed item 34
Pr