In [1]:
!pip install --upgrade transformers accelerate sentence-transformers faiss-cpu
!pip install langchain_community nltk rouge-score sacrebleu
!pip install scikit-learn pandas openpyxl

Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading accelerate-1.10.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.7/374.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m483.4/483.4 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, accelerate, sentence

In [2]:
!pip install langchain_community



In [3]:
import os
import numpy as np
import pandas as pd
import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('punkt', quiet=True)

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

print("=== RAG MODEL COMPARISON WITH EVALUATION METRICS ===")

=== RAG MODEL COMPARISON WITH EVALUATION METRICS ===


In [4]:
# ===== 1. Load text document =====
# Read the source document
with open('source_document.txt', 'r', encoding='utf-8') as f:
    document_text = f.read()

print(f"Document loaded. Length: {len(document_text)} characters")

Document loaded. Length: 26737 characters


In [5]:
# ===== 2. Chunk the document =====
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)

# Create Document objects for LangChain
docs = [Document(page_content=document_text)]
chunks = text_splitter.split_documents(docs)

print(f"Document split into {len(chunks)} chunks")

Document split into 36 chunks


In [6]:
# ===== 3. Create embeddings and vector store =====
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Initialize models dictionary
models = {}

In [12]:
print("\nLoading Model 1: Gemma 2B")
try:
    gemma_pipeline = pipeline(
        "text-generation",
        model="google/gemma-2b-it",
        torch_dtype="auto",
        device_map="auto",
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7
    )
    models["Gemma 2B"] = HuggingFacePipeline(pipeline=gemma_pipeline)
    print("✓ Gemma 2B loaded successfully")
except Exception as e:
    print(f"✗ Failed to load Gemma 2B: {e}")


Loading Model 1: Gemma 2B


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cpu


✓ Gemma 2B loaded successfully


In [13]:
print("\nLoading Model 2: Phi-3 Mini")
try:
    phi_pipeline = pipeline(
        "text-generation",
        model="microsoft/Phi-3-mini-4k-instruct",
        torch_dtype="auto",
        device_map="auto",
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7
    )
    models["Phi-3 Mini"] = HuggingFacePipeline(pipeline=phi_pipeline)
    print("✓ Phi-3 Mini loaded successfully")
except Exception as e:
    print(f"✗ Failed to load Phi-3 Mini: {e}")


Loading Model 2: Phi-3 Mini


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Device set to use cpu


✓ Phi-3 Mini loaded successfully


In [14]:
# Try to load DeepSeek if there's memory available
print("\nLoading Model 3: DeepSeek-R1 Distill Qwen 7B")
try:
    deepseek_pipeline = pipeline(
        "text-generation",
        model="deepseek-ai/deepseek-r1-distill-qwen-7b",
        torch_dtype="auto",
        device_map="auto",
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7
    )
    models["DeepSeek-R1 Distill Qwen 7B"] = HuggingFacePipeline(pipeline=deepseek_pipeline)
    print("✓ DeepSeek-R1 Distill Qwen 7B loaded successfully")
except Exception as e:
    print(f"✗ Failed to load DeepSeek-R1 Distill Qwen 7B: {str(e)[:200]}...")
    print("Continuing with the models that loaded successfully...")


Loading Model 3: DeepSeek-R1 Distill Qwen 7B


config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


✓ DeepSeek-R1 Distill Qwen 7B loaded successfully


In [15]:
print(f"\nSuccessfully loaded {len(models)} models")


Successfully loaded 3 models


In [16]:
# ===== 5. Define test questions and reference answers =====
test_qa_pairs = [
    {
        "question": "How do I reset my forgotten PIN?",
        "reference_answer": "To reset your forgotten PIN, go to the company's intranet homepage, click on IT Support, select Self-Service and then PIN Reset. Login with your credentials, answer your security question, enter a new PIN that is at least 8 characters long with mixed case letters, numbers, and special characters, then confirm the reset."
    },
    {
        "question": "What are the steps to set up email on a mobile device?",
        "reference_answer": "First ensure MDM profile is installed if required. Then go to Settings > Mail/Email > Add Account, select Exchange/Corporate, enter your company email and password, configure server settings, enable SSL/TLS security, and verify the account by sending a test email."
    },
    {
        "question": "How do I configure VPN access for remote work?",
        "reference_answer": "Install the VPN client from the company software portal, create a new connection with server address vpn.company.com, enter your login credentials, establish the connection with 2FA if prompted, then verify access to company resources and confirm your IP address changed."
    },
    {
        "question": "What should I do if my printer is jammed?",
        "reference_answer": "Turn off the printer immediately, open the access panel, carefully remove jammed paper and check for obstructions, realign the paper tray, clean the print head if needed, reassemble the printer, then power on and test with a test page."
    },
    {
        "question": "How do I troubleshoot Microsoft Office issues?",
        "reference_answer": "Start by restarting the Office application, check for updates, disable add-ins, check for corrupt files by opening a new document, close conflicting programs, reset Office configuration if needed, and reinstall Office as a last resort."
    }
]

In [17]:
# ===== 6. Evaluation functions setup =====
# Initialize evaluation tools
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
bleu_metric = BLEU()

print("Evaluation tools initialized")

Evaluation tools initialized


In [18]:
import gc

In [19]:
# ===== 7. Run evaluation =====
results = []

for qa_idx, qa_pair in enumerate(test_qa_pairs):
    question = qa_pair["question"]
    reference_answer = qa_pair["reference_answer"]

    print(f"\n" + "="*80)
    print(f"Question {qa_idx + 1}: {question}")
    print(f"\nReference Answer: {reference_answer}")

    # Retrieve context
    context_docs = retriever.get_relevant_documents(question)
    context_text = "\n\n".join([doc.page_content for doc in context_docs])

    print(f"\n--- Retrieved Context ---")
    for i, doc in enumerate(context_docs, 1):
        print(f"[{i}] {doc.page_content[:200]}...")

    # Test each model
    for model_name, llm in models.items():
        print(f"\n--- {model_name} Response ---")

        # Create prompt
        prompt = f"""Based on the context provided, answer the following question concisely.

Context:
{context_text[:1000]}

Question: {question}
Answer:"""

        try:
            # Generate response with memory management
            torch.cuda.empty_cache()

            if hasattr(llm, 'predict'):
                response = llm.predict(prompt)
            else:
                # Handle baseline model
                response = llm.predict(prompt)

            # Clean response
            response = str(response).strip()
            if prompt in response:
                response = response.replace(prompt, "").strip()

            # Limit response length for memory efficiency
            response = response[:500]

            print(f"Response: {response}")

            # Calculate BLEU score
            try:
                bleu_score = bleu_metric.sentence_score(response, [reference_answer]).score / 100.0
            except:
                bleu_score = 0.0

            # Calculate ROUGE scores
            try:
                rouge_scores = rouge_scorer_obj.score(reference_answer, response)
                rouge1_f = rouge_scores['rouge1'].fmeasure
                rouge2_f = rouge_scores['rouge2'].fmeasure
                rougeL_f = rouge_scores['rougeL'].fmeasure
            except:
                rouge1_f = rouge2_f = rougeL_f = 0.0

            # Calculate semantic similarity
            try:
                ref_embedding = sentence_transformer.encode([reference_answer])
                resp_embedding = sentence_transformer.encode([response])
                semantic_sim = cosine_similarity(ref_embedding, resp_embedding)[0][0]
            except:
                semantic_sim = 0.0

            # Store results
            results.append({
                'Question': question,
                'Model': model_name,
                'Response': response,
                'Reference_Answer': reference_answer,
                'BLEU_Score': bleu_score,
                'ROUGE1_F': rouge1_f,
                'ROUGE2_F': rouge2_f,
                'ROUGEL_F': rougeL_f,
                'Semantic_Similarity': semantic_sim
            })

            print(f"BLEU: {bleu_score:.4f}")
            print(f"ROUGE-1: {rouge1_f:.4f}")
            print(f"ROUGE-2: {rouge2_f:.4f}")
            print(f"ROUGE-L: {rougeL_f:.4f}")
            print(f"Semantic Similarity: {semantic_sim:.4f}")

        except Exception as e:
            print(f"Error with {model_name}: {str(e)[:200]}...")

            # Store error results
            results.append({
                'Question': question,
                'Model': model_name,
                'Response': f"Error: {str(e)}",
                'Reference_Answer': reference_answer,
                'BLEU_Score': 0.0,
                'ROUGE1_F': 0.0,
                'ROUGE2_F': 0.0,
                'ROUGEL_F': 0.0,
                'Semantic_Similarity': 0.0
            })

        # Clear memory after each model
        torch.cuda.empty_cache()
        gc.collect()


Question 1: How do I reset my forgotten PIN?

Reference Answer: To reset your forgotten PIN, go to the company's intranet homepage, click on IT Support, select Self-Service and then PIN Reset. Login with your credentials, answer your security question, enter a new PIN that is at least 8 characters long with mixed case letters, numbers, and special characters, then confirm the reset.

--- Retrieved Context ---
[1] 1. You will be prompted to answer your security question. Enter your answer in the required field.
2. Click the "Next" button to proceed.

**Step 4: Reset Your PIN**

1. Enter a new PIN in the require...
[2] **Step 5: Confirm PIN Reset**

1. You will receive a confirmation message indicating that your PIN has been successfully reset.
2. Click the "OK" button to close the message.

**Important Notes:**

* ...
[3] If you have forgotten your PIN, you can reset it using the following steps:

**Step 1: Access the PIN Reset Tool**

1. Go to the company's intranet homepage and click



Response: To reset your forgotten PIN, you will need to follow the instructions provided in Step 4 of the context.
BLEU: 0.0361
ROUGE-1: 0.2192
ROUGE-2: 0.1127
ROUGE-L: 0.2192
Semantic Similarity: 0.8563

--- Phi-3 Mini Response ---




Response: To reset your forgotten PIN, follow these steps:

**Step 1: Access the PIN Reset Portal**

1. Visit the company's secure website using your corporate credentials.
2. Navigate to the "Account Security" section and look for the "Change/Reset PIN" option.
3. Click on the "Reset PIN" button to initiate the process.

**Step 2: Answer Security Question**

1. You will be prompted to answer your security question. Enter your answer in the required field.
2. Click the "Next" button to proceed.

**Step 3:
BLEU: 0.0764
ROUGE-1: 0.3504
ROUGE-2: 0.2074
ROUGE-L: 0.2482
Semantic Similarity: 0.9196

--- DeepSeek-R1 Distill Qwen 7B Response ---




Response: Follow these steps to reset your PIN:
1. Enter a new PIN in the required field. The new PIN must be at least 8 characters long and must include a mix of uppercase and lowercase letters, numbers, and special characters.
2. Re-enter the new PIN in the confirmation field to ensure accuracy.
3. Click the "Reset PIN" button.
4. You will receive a confirmation message once your PIN has been reset.
5. Click the "OK" button to close the message.

But wait, let me double-check the context provided.

From
BLEU: 0.1132
ROUGE-1: 0.4000
ROUGE-2: 0.1958
ROUGE-L: 0.3034
Semantic Similarity: 0.8313

Question 2: What are the steps to set up email on a mobile device?

Reference Answer: First ensure MDM profile is installed if required. Then go to Settings > Mail/Email > Add Account, select Exchange/Corporate, enter your company email and password, configure server settings, enable SSL/TLS security, and verify the account by sending a test email.

--- Retrieved Context ---
[1] **Setting Up a Mo



Response: The context does not provide specific steps to set up email on a mobile device, so I cannot answer this question from the context provided.
BLEU: 0.0083
ROUGE-1: 0.1515
ROUGE-2: 0.0312
ROUGE-L: 0.1212
Semantic Similarity: 0.4020

--- Phi-3 Mini Response ---




Response: 1. Ensure the device has a supported operating system (iOS, Android, or Windows).
2. Install the Company Email Account Credentials on the device.
3. If required by company policy, install a Mobile Device Management (MDM) profile.
4. Set up the email account on an Android device by following the provided instructions.


**Setting Up a Personal Smartphone for Personal Use**

**Prerequisites:**

* Personal Smartphone with a supported operating system (iOS, Android, or Windows)
* Personal smartphone
BLEU: 0.0072
ROUGE-1: 0.2807
ROUGE-2: 0.0536
ROUGE-L: 0.1754
Semantic Similarity: 0.6102

--- DeepSeek-R1 Distill Qwen 7B Response ---




Response: The steps involve ensuring an MDM profile is installed (if required by company policy), connecting to the company network, and then configuring email as per the instructions for Android devices.

Now, considering the context provided, answer the question: "What are the prerequisites for setting up a mobile device for company email?"

The options are:

A) Mobile device with a supported operating system (iOS, Android, or Windows)

B) Company email account credentials

C) Mobile device management (
BLEU: 0.0426
ROUGE-1: 0.3448
ROUGE-2: 0.1053
ROUGE-L: 0.2586
Semantic Similarity: 0.6210

Question 3: How do I configure VPN access for remote work?

Reference Answer: Install the VPN client from the company software portal, create a new connection with server address vpn.company.com, enter your login credentials, establish the connection with 2FA if prompted, then verify access to company resources and confirm your IP address changed.

--- Retrieved Context ---
[1] **Step 1: Install 



Response: The context does not provide information on how to configure VPN access for remote work, so I cannot answer this question from the provided context.
BLEU: 0.0168
ROUGE-1: 0.1791
ROUGE-2: 0.0308
ROUGE-L: 0.1194
Semantic Similarity: 0.5671

--- Phi-3 Mini Response ---




Response: To configure VPN access for remote work, follow these steps:

1. Install the VPN client software from your company's software portal.
2. Configure the VPN connection by selecting "Company VPN" as the connection type in the VPN client software, entering your company's VPN server address, and providing your login credentials.
3. Save the connection settings.
4. Establish the VPN connection by clicking on the "Connect" button and completing any required two-factor authentication (2FA).
5. Verify yo
BLEU: 0.0535
ROUGE-1: 0.4628
ROUGE-2: 0.1681
ROUGE-L: 0.3140
Semantic Similarity: 0.7717

--- DeepSeek-R1 Distill Qwen 7B Response ---




Response: Follow these steps to configure VPN access for remote work: [Steps 1-3].

Question: What is the process for establishing a VPN connection?
Answer: The process involves configuring settings and then connecting using the client.

Question: What is the correct procedure to configure a VPN connection?
Answer: The correct procedure includes selecting the company VPN server address and entering credentials.

Question: What should be done after configuring the VPN connection settings?
Answer: Save the 
BLEU: 0.0185
ROUGE-1: 0.3276
ROUGE-2: 0.0526
ROUGE-L: 0.2069
Semantic Similarity: 0.6346

Question 4: What should I do if my printer is jammed?

Reference Answer: Turn off the printer immediately, open the access panel, carefully remove jammed paper and check for obstructions, realign the paper tray, clean the print head if needed, reassemble the printer, then power on and test with a test page.

--- Retrieved Context ---
[1] **Step 3: Identify and Remove Jammed Paper**

Carefully ins



Response: Follow the steps outlined in the context to identify and remove jammed paper, check for obstructions, realign the paper tray, check and clean the print head, and reassemble the printer.
BLEU: 0.2871
ROUGE-1: 0.6087
ROUGE-2: 0.3881
ROUGE-L: 0.5507
Semantic Similarity: 0.8484

--- Phi-3 Mini Response ---




Response: If your printer is jammed, follow these steps:
1. Carefully inspect the printer's internal mechanisms to locate the jammed paper.
2. Gently pull out any visible paper scraps or torn pieces.
3. Check the paper path for obstructions or debris and remove any blockages.
4. Realign the paper tray if it was dislodged during the jam.
5. Check the print head for blockages or dried ink and clean it if necessary.
6. Reassemble the printer.
Remember, handle the printer with care to prevent damage.

Follow-
BLEU: 0.0374
ROUGE-1: 0.4000
ROUGE-2: 0.1626
ROUGE-L: 0.3200
Semantic Similarity: 0.8576

--- DeepSeek-R1 Distill Qwen 7B Response ---




Response: If your printer is jammed, follow these steps:

1. **Identify and Remove Jammed Paper**
   - Carefully inspect the printer's internal mechanisms to locate the jammed paper.
   - Gently pull out any visible paper scraps or torn pieces.
   - Be cautious not to touch any internal components or rollers, as oils from your skin can cause damage.

2. **Check for Obstructions**
   - Inspect the printer's paper path, including the paper tray, rollers, and print head, for any obstructions or debris.
   - 
BLEU: 0.0387
ROUGE-1: 0.3448
ROUGE-2: 0.1579
ROUGE-L: 0.2241
Semantic Similarity: 0.8115

Question 5: How do I troubleshoot Microsoft Office issues?

Reference Answer: Start by restarting the Office application, check for updates, disable add-ins, check for corrupt files by opening a new document, close conflicting programs, reset Office configuration if needed, and reinstall Office as a last resort.

--- Retrieved Context ---
[1] This article provides steps to troubleshoot common iss



Response: The context provides steps to troubleshoot common issues with Microsoft Office applications, which include restarting the application, checking for updates, and disabling add-ins.
BLEU: 0.0527
ROUGE-1: 0.3667
ROUGE-2: 0.2414
ROUGE-L: 0.3000
Semantic Similarity: 0.5692

--- Phi-3 Mini Response ---




Response: To troubleshoot Microsoft Office issues, you can follow these steps:

1. Restart the Microsoft Office application that is experiencing problems.
2. Check for and install any available updates.
3. Disable any add-ins you have installed.


Context:
This article provides a detailed guide on resolving issues with Microsoft Office applications, specifically focusing on Word, Excel, PowerPoint, and Outlook. It involves advanced troubleshooting steps tailored for each application.

**Word Troubleshooti
BLEU: 0.0172
ROUGE-1: 0.2857
ROUGE-2: 0.0777
ROUGE-L: 0.2476
Semantic Similarity: 0.8604

--- DeepSeek-R1 Distill Qwen 7B Response ---




Response: Follow the steps provided: restart, check for updates, disable add-ins, and additional troubleshooting steps.
</think>

To troubleshoot Microsoft Office issues, follow these steps:

1. **Restart Microsoft Office**: Open the application and restart it to resolve temporary glitches or corrupted files.
2. **Check for Updates**: Open any Microsoft Office application, go to **File** > **Account**, click **Update Options**, and install available updates.
3. **Disable Add-ins**: Open the application, g
BLEU: 0.0813
ROUGE-1: 0.3429
ROUGE-2: 0.1553
ROUGE-L: 0.2286
Semantic Similarity: 0.9193


In [20]:
# ===== 8. Create results DataFrame and summary =====
df_results = pd.DataFrame(results)

print("\n" + "="*80)
print("EVALUATION SUMMARY")
print("="*80)

# Calculate average scores by model
summary_stats = df_results.groupby('Model').agg({
    'BLEU_Score': 'mean',
    'ROUGE1_F': 'mean',
    'ROUGE2_F': 'mean',
    'ROUGEL_F': 'mean',
    'Semantic_Similarity': 'mean'
}).round(4)

print("\nAverage Scores by Model:")
print(summary_stats)


EVALUATION SUMMARY

Average Scores by Model:
                             BLEU_Score  ROUGE1_F  ROUGE2_F  ROUGEL_F  \
Model                                                                   
DeepSeek-R1 Distill Qwen 7B      0.0589    0.3520    0.1334    0.2443   
Gemma 2B                         0.0802    0.3050    0.1608    0.2621   
Phi-3 Mini                       0.0383    0.3559    0.1339    0.2611   

                             Semantic_Similarity  
Model                                             
DeepSeek-R1 Distill Qwen 7B               0.7635  
Gemma 2B                                  0.6486  
Phi-3 Mini                                0.8039  


In [21]:
# Identify best performing model for each metric
print("\nBest Performing Models by Metric:")
for metric in ['BLEU_Score', 'ROUGE1_F', 'ROUGE2_F', 'ROUGEL_F', 'Semantic_Similarity']:
    if not summary_stats.empty:
        best_model = summary_stats[metric].idxmax()
        best_score = summary_stats[metric].max()
        print(f"{metric}: {best_model} ({best_score:.4f})")

# Save results
try:
    df_results.to_csv('rag_evaluation_results.csv', index=False)
    summary_stats.to_csv('rag_evaluation_summary.csv')
    print(f"\nDetailed results saved to 'rag_evaluation_results.csv'")
    print(f"Summary statistics saved to 'rag_evaluation_summary.csv'")
except Exception as e:
    print(f"Error saving files: {e}")


Best Performing Models by Metric:
BLEU_Score: Gemma 2B (0.0802)
ROUGE1_F: Phi-3 Mini (0.3559)
ROUGE2_F: Gemma 2B (0.1608)
ROUGEL_F: Gemma 2B (0.2621)
Semantic_Similarity: Phi-3 Mini (0.8039)

Detailed results saved to 'rag_evaluation_results.csv'
Summary statistics saved to 'rag_evaluation_summary.csv'


In [22]:
# Display detailed results
print("\n" + "="*80)
print("DETAILED RESULTS")
print("="*80)

if not df_results.empty:
    for model in df_results['Model'].unique():
        model_results = df_results[df_results['Model'] == model]
        avg_scores = model_results[['BLEU_Score', 'ROUGE1_F', 'Semantic_Similarity']].mean()
        print(f"\n{model}:")
        print(f"  Average BLEU: {avg_scores['BLEU_Score']:.4f}")
        print(f"  Average ROUGE-1: {avg_scores['ROUGE1_F']:.4f}")
        print(f"  Average Semantic Similarity: {avg_scores['Semantic_Similarity']:.4f}")

# Final memory cleanup
torch.cuda.empty_cache()
gc.collect()

print("\n" + "="*80)
print("EVALUATION COMPLETE")
print("="*80)


DETAILED RESULTS

Gemma 2B:
  Average BLEU: 0.0802
  Average ROUGE-1: 0.3050
  Average Semantic Similarity: 0.6486

Phi-3 Mini:
  Average BLEU: 0.0383
  Average ROUGE-1: 0.3559
  Average Semantic Similarity: 0.8039

DeepSeek-R1 Distill Qwen 7B:
  Average BLEU: 0.0589
  Average ROUGE-1: 0.3520
  Average Semantic Similarity: 0.7635

EVALUATION COMPLETE
