<a href="https://colab.research.google.com/github/rhaveri/master-thesis/blob/main/3_sft_%2B_ragas%2Bnlp%2Bcompae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Prevent widget metadata errors
# Silence all warnings so the notebook output stays readable.
import warnings
warnings.filterwarnings('ignore')

# Best-effort clearing of the current cell output.  Only a missing IPython is
# tolerated here: a bare `except:` would also swallow KeyboardInterrupt and
# SystemExit, which should never be hidden.
try:
    from IPython.display import clear_output
except ImportError:
    pass
else:
    clear_output(wait=True)

print("Environment ready")

In [None]:
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps xformers trl peft accelerate bitsandbytes
!pip install -q langchain-community langchain-core chromadb langchain-huggingface
!pip install -q ragas langchain-openai rouge-score bert-score keybert textstat openpyxl evaluate
!pip install -q newspaper3k

In [None]:
# RAGAS EVALUATION SYSTEM

import os
import json
import torch
import pandas as pd
import numpy as np
from typing import List, Dict, Optional
from tqdm import tqdm

from unsloth import FastLanguageModel

from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision
from datasets import Dataset
from langchain_openai import ChatOpenAI
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from keybert import KeyBERT
import textstat
import evaluate as hf_evaluate



In [None]:
# LOAD FINE-TUNED MODEL

def setup_google_drive():
    """Mount Google Drive and return the path of its "MyDrive" folder.

    Returns:
        The fixed string "/content/drive/MyDrive".
    """
    # Imported here so that defining this function does not need google.colab.
    from google.colab import drive as colab_drive

    my_drive_root = "/content/drive/MyDrive"
    colab_drive.mount('/content/drive')
    return my_drive_root


def load_finetuned_model(max_seq_length: int = 2048):
    """Locate the fine-tuned LoRA model and load it in 4-bit for inference.

    The working directory is searched first.  If nothing is found there,
    Google Drive is mounted (unless "/content/drive" already exists) and the
    Drive candidates are searched.

    Args:
        max_seq_length: Maximum sequence length forwarded to
            ``FastLanguageModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been passed through
        ``FastLanguageModel.for_inference``.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    local_candidates = ["lora_model"]
    # Previously a single-entry list was sliced with [:2] / [2:], so the Drive
    # search iterated over nothing and a model stored on Drive could never be
    # found.  The Drive candidates are now listed explicitly; the second one
    # is the folder name the main pipeline refers to.
    drive_candidates = [
        "/content/drive/MyDrive/lora_model",
        "/content/drive/MyDrive/My_Thesis_Model",
    ]

    def first_existing(paths):
        # First path that exists on disk, or None when there is none.
        return next((p for p in paths if os.path.exists(p)), None)

    print(" Searching for model...")

    model_location = first_existing(local_candidates)
    if model_location:
        print(f"Found LOCAL model at: {model_location}")
    else:
        print(" Not found locally. ")
        from google.colab import drive
        if not os.path.exists("/content/drive"):
            drive.mount('/content/drive')

        model_location = first_existing(drive_candidates)
        if model_location:
            print(f" Found DRIVE model at: {model_location}")

    if not model_location:
        raise FileNotFoundError("Critical Error: Could not find 'lora_model' locally OR in Google Drive. ")

    print(f" Loading from: {model_location}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_location,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True
    )

    FastLanguageModel.for_inference(model)
    print(" Model loaded and ready.")
    return model, tokenizer



In [None]:

#  SETUP RAG VECTOR DATABASE

def load_or_copy_file(filename: str, drive_path: str) -> str:
    """Return a local path for *filename*, copying it from *drive_path* if needed.

    Args:
        filename: Path of the file relative to the working directory.
        drive_path: Directory that is searched when the file is not local.

    Returns:
        ``filename`` unchanged, once the file exists locally.

    Raises:
        FileNotFoundError: If the file exists neither locally nor under
            ``drive_path``.
        OSError: If copying the file from ``drive_path`` fails.
    """
    # Local import: shutil is not among the notebook's top-level imports.
    import shutil

    if os.path.exists(filename):
        print(f"Found {filename} locally")
        return filename

    drive_file = os.path.join(drive_path, filename)
    if os.path.exists(drive_file):
        # shutil.copy raises on failure.  The previous shell `cp` ignored its
        # exit status (so a failed copy still returned a non-existent path)
        # and broke on paths containing double quotes.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        shutil.copy(drive_file, filename)
        return filename

    raise FileNotFoundError(f" Please upload '{filename}' to Colab or Drive")


def build_vector_database(documents_file: str = "nutrition_documents_v2.json"):
    """Build a Chroma vector store from a JSON file of source documents.

    Every entry of the JSON file must provide a "text" and a "source" key.
    The texts are split into chunks, embedded with "BAAI/bge-large-en-v1.5"
    and stored in the "finetuned_evaluation" collection.

    Args:
        documents_file: Path of the JSON file holding the raw documents.

    Returns:
        A ``(vector_db, embeddings)`` tuple.
    """
    with open(documents_file, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # Wrap each raw record in a Document, keeping its source as metadata.
    source_docs = []
    for record in records:
        source_docs.append(
            Document(
                page_content=record["text"],
                metadata={"source": record["source"]},
            )
        )

    # Chunk size and overlap are handed straight to the splitter.
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pieces = chunker.split_documents(source_docs)
    print(f"   Split {len(source_docs)} documents → {len(pieces)} chunks")

    # Run the embedding model on the GPU when one is available.
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    embedder = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={'device': target_device},
        encode_kwargs={'batch_size': 32},
    )

    store = Chroma.from_documents(
        documents=pieces,
        embedding=embedder,
        collection_name="finetuned_evaluation",
    )
    print(" Vector database ready")

    return store, embedder


In [None]:

#  GENERATE ANSWERS

def generate_answers(model, tokenizer, vector_db, questions: List[str],
                     k_docs: int = 3, max_tokens: int = 512) -> List[Dict]:
    """Answer every question with retrieval-augmented generation.

    For each question the ``k_docs`` most similar chunks are fetched from
    ``vector_db``, inserted into the prompt template, and the model generates
    an answer from the resulting chat messages.

    Args:
        model: Generation model exposing ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        vector_db: Vector store exposing ``similarity_search``.
        questions: Questions to answer.
        k_docs: Number of chunks retrieved per question.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        One dict per question with the keys "question", "answer",
        "contexts" and "ground_truth".
    """
    print(f"\n Generating answers for {len(questions)} questions...")
    results = []

    # Follow the model's own device instead of assuming CUDA; fall back to the
    # previous hard-coded "cuda" when the model exposes no `device` attribute.
    device = getattr(model, "device", "cuda")

    for question in tqdm(questions, desc="Answering"):
        # Retrieve relevant context (the page contents are reused below).
        retrieved_docs = vector_db.similarity_search(question, k=k_docs)
        contexts = [doc.page_content for doc in retrieved_docs]
        context_text = "\n\n".join(contexts)

        # Format prompt (same as training format)
        prompt = f"""Context information is below.
---------------------
{context_text}
---------------------
Given the context information and not prior knowledge, answer the query.

Query: {question}"""

        messages = [
            {
                "role": "system",
                "content": "You are a professional AI health coach. Answer strictly based on the provided context."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(device)

        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=max_tokens,
            use_cache=True,
            temperature=0.1
        )

        # The generated sequence starts with the prompt tokens, so decode only
        # what follows them.  This replaces splitting the decoded text on
        # Llama-3-specific header strings, which silently returned the whole
        # prompt for any other chat template.
        prompt_length = inputs.shape[-1]
        answer = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        ).strip()

        results.append({
            "question": question,
            "answer": answer,
            "contexts": contexts,
            # Proxy for Context Precision: no reference answers are supplied,
            # so the generated answer doubles as ground truth.
            # NOTE(review): this makes reference-based metrics
            # self-referential - confirm that is intended.
            "ground_truth": answer
        })

    print(f" Generated {len(results)} answers")
    return results


def save_results(results: List[Dict], filename: str = "finetuned_eval_data.json"):
    """Write *results* to *filename* as indented, UTF-8 encoded JSON.

    Args:
        results: JSON-serialisable records to store.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(results, indent=2))

    print(f" Results saved to: {filename}")


In [None]:
#  RAGAS EVALUATION

def evaluate_with_ragas(results_file: str, embeddings,
                        use_gpt4: bool = True) -> pd.DataFrame:
    """Score generated answers with RAGAS and export the per-sample metrics.

    The records in ``results_file`` are evaluated with Faithfulness,
    AnswerRelevancy and ContextPrecision.  The scores are written to
    "ragas_results_finetuned.csv" and "ragas_results_finetuned.xlsx" in the
    working directory and the column averages are printed.

    Args:
        results_file: JSON file holding the generated answers.
        embeddings: Embedding model handed to AnswerRelevancy and ``evaluate``.
        use_gpt4: When True, judge with ChatOpenAI("gpt-4o-mini") and prompt
            for OPENAI_API_KEY if it is not set; otherwise judge with
            ChatOllama("llama3").

    Returns:
        The DataFrame produced by ``results.to_pandas()``.
    """
    # save_results writes this file as UTF-8, so read it with the same
    # encoding instead of relying on the platform default.
    with open(results_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    dataset = Dataset.from_list(data)

    # Choose evaluator
    if use_gpt4:
        print("   Using GPT-4o-mini as evaluator")
        if "OPENAI_API_KEY" not in os.environ:
            import getpass
            os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")
        evaluator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    else:
        print("   Using LLaMA 3 as evaluator")
        # NOTE: langchain_ollama does not appear in the notebook's pip install
        # cell; it has to be installed before this branch can run.
        from langchain_ollama import ChatOllama
        evaluator_llm = ChatOllama(model="llama3", temperature=0)

    # NOTE(review): depending on the installed ragas version the metric
    # constructors may expect ragas-wrapped LLM/embedding objects - confirm.
    metrics = [
        Faithfulness(llm=evaluator_llm),
        AnswerRelevancy(llm=evaluator_llm, embeddings=embeddings),
        ContextPrecision(llm=evaluator_llm)
    ]

    # raise_exceptions=False: a failing sample does not abort the whole run.
    results = evaluate(
        dataset=dataset,
        metrics=metrics,
        llm=evaluator_llm,
        embeddings=embeddings,
        raise_exceptions=False
    )

    # Save results
    df = results.to_pandas()
    df.to_csv("ragas_results_finetuned.csv", index=False)
    df.to_excel("ragas_results_finetuned.xlsx", index=False)

    print("\n RAGAS Evaluation Complete!")
    # Report each average only when its column exists, so a missing metric
    # column cannot raise KeyError after the expensive evaluation has finished.
    summary_columns = (
        ("Faithfulness", "faithfulness"),
        ("Relevancy", "answer_relevancy"),
        ("Context Precision", "context_precision"),
    )
    for label, column in summary_columns:
        if column in df.columns:
            print(f"   Avg {label}: {df[column].mean():.4f}")
        else:
            print(f"   Avg {label}: n/a (column '{column}' missing)")

    return df


In [None]:
#  BERTScore

def calculate_nlp_metrics(results_file: str) -> pd.DataFrame:
    """Compute BERTScore F1 between each answer and its retrieved contexts.

    The answers are scored against the space-joined contexts of the same
    record.  The F1 values are written to "nlp_metrics_finetuned.csv" in the
    working directory.

    Args:
        results_file: JSON file whose records have "answer" and "contexts".

    Returns:
        A DataFrame with a single "bert_score" column (one row per record).
    """
    # save_results writes this file as UTF-8, so read it with the same
    # encoding instead of relying on the platform default.
    with open(results_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        # Nothing to score: still produce the (empty) CSV so downstream steps
        # find the file, but skip the BERTScore call entirely.
        df = pd.DataFrame({'bert_score': []})
        df.to_csv("nlp_metrics_finetuned.csv", index=False)
        print("\n NLP Metrics Complete!")
        print("   Avg BERTScore: n/a (no answers to score)")
        return df

    # Extract answers and contexts
    answers = [item['answer'] for item in data]
    contexts = [" ".join(item['contexts']) for item in data]

    print("   Computing BERTScore (takes ~30 seconds)...")
    # Only F1 is reported; precision and recall are discarded.
    _, _, F1 = bert_score(answers, contexts, lang="en", verbose=False)

    df = pd.DataFrame({
        'bert_score': F1.tolist()
    })

    df.to_csv("nlp_metrics_finetuned.csv", index=False)

    print("\n NLP Metrics Complete!")
    print(f"   Avg BERTScore: {F1.mean().item():.4f}")

    return df


In [None]:
#  TRAINING DATA QUALITY ANALYSIS

def analyze_training_data(excel_file: str) -> pd.DataFrame:
    """Compute quality metrics for the training examples in an Excel sheet.

    A Flesch-Kincaid grade level is computed for every answer.  When the
    sheet has a "context" column, a BERTScore F1 against the context and the
    share of the context's top keywords found in the answer are added too.
    The result is written to "training_data_quality_metrics.csv" and
    "training_data_quality_metrics.xlsx" in the working directory.

    Args:
        excel_file: Excel file with an "answer" (or "rewritten_answer")
            column and, optionally, a "context" column.

    Returns:
        The input DataFrame extended with "grade_level" and, when context is
        available, "bert_score" and "keyword_recall".

    Raises:
        KeyError: If neither "answer" nor "rewritten_answer" is present.
    """
    df = pd.read_excel(excel_file)
    print(f"   Loaded {len(df)} training examples")

    # Auto-detect the answer column.  Fail with an explicit message instead
    # of the opaque KeyError('rewritten_answer') raised previously.
    answer_col = next(
        (col for col in ('answer', 'rewritten_answer') if col in df.columns),
        None,
    )
    if answer_col is None:
        raise KeyError(
            f"Expected an 'answer' or 'rewritten_answer' column in "
            f"{excel_file}; found {list(df.columns)}"
        )

    has_context = 'context' in df.columns
    if not has_context:
        print("  Warning: 'context' column missing. Skipping context-based metrics.")

    # Metric 1: Readability
    df['grade_level'] = df[answer_col].apply(
        lambda x: textstat.flesch_kincaid_grade(str(x))
    )

    if has_context:
        answers = df[answer_col].astype(str).tolist()
        contexts = df['context'].astype(str).tolist()

        # Metric 2: BERTScore (only F1 is kept)
        _, _, F1 = bert_score(answers, contexts, lang="en", verbose=False)
        df['bert_score'] = F1.tolist()

        # Metric 3: Keyword Recall
        kw_model = KeyBERT()
        keyword_recalls = []

        for answer, context in tqdm(zip(answers, contexts), total=len(df), desc="Keywords"):
            # Extract top keywords from context
            keywords = kw_model.extract_keywords(
                context,
                keyphrase_ngram_range=(1, 2),
                stop_words='english',
                top_n=10
            )

            if not keywords:
                # Float, so the column never mixes ints with ratios.
                keyword_recalls.append(0.0)
                continue

            # Lower-case the answer once rather than once per keyword.
            answer_lower = answer.lower()
            found = sum(1 for kw, _ in keywords if kw in answer_lower)
            keyword_recalls.append(found / len(keywords))

        df['keyword_recall'] = keyword_recalls

    # Save results
    output_file = "training_data_quality_metrics.csv"
    df.to_csv(output_file, index=False)
    df.to_excel("training_data_quality_metrics.xlsx", index=False)

    print("\n Training Data Analysis Complete!")
    print(f"   Avg Grade Level: {df['grade_level'].mean():.2f}")
    if has_context:
        print(f"   Avg BERTScore: {F1.mean().item():.4f}")
        print(f"   Avg Keyword Recall: {np.mean(keyword_recalls)*100:.1f}%")

    return df


In [None]:
#  LOAD METRICS FROM JSON

def load_metrics_from_json(json_file: str) -> pd.DataFrame:
    """Load pre-computed metrics from a JSON file into a DataFrame.

    Args:
        json_file: Path of a JSON file containing either a list of records
            (one row each) or a single object (one row in total).

    Returns:
        A DataFrame with one column per metric.

    Raises:
        ValueError: If the top-level JSON value is neither a list nor an
            object.
    """
    # Read as UTF-8 explicitly rather than with the platform default, so
    # non-ASCII content loads the same way on every system.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        df = pd.DataFrame(data)
    elif isinstance(data, dict):
        # A single object becomes a one-row frame.
        df = pd.DataFrame([data])
    else:
        raise ValueError(" JSON format not recognized")

    print(f" Loaded {len(df)} entries with {len(df.columns)} metrics")
    print(f"   Available metrics: {list(df.columns)}")

    return df


In [None]:

# MAIN EXECUTION PIPELINE

# Runs the whole pipeline: load the model, build the vector store, generate
# answers, then score them with RAGAS and BERTScore.
if __name__ == "__main__":
    print("="*60)
    print("COMPREHENSIVE EVALUATION SYSTEM")
    print("="*60)

    # Setup
    drive_path = setup_google_drive()

    # Step 1: Load fine-tuned model
    # NOTE: model_path is not passed to load_finetuned_model(), which accepts
    # only max_seq_length and searches its own candidate locations.
    model_path = f"{drive_path}/My_Thesis_Model"
    model, tokenizer = load_finetuned_model()

    # Step 2: Setup RAG database
    docs_file = load_or_copy_file("nutrition_documents_v2.json", drive_path)
    vector_db, embeddings = build_vector_database(docs_file)

    # Step 3: Load questions (read as UTF-8, like the other JSON inputs)
    questions_file = load_or_copy_file("questions.json", drive_path)
    with open(questions_file, "r", encoding="utf-8") as f:
        questions = json.load(f)

    # Step 4: Generate answers
    results = generate_answers(model, tokenizer, vector_db, questions)
    save_results(results, "finetuned_eval_data.json")

    # Step 5: Evaluate with RAGAS
    ragas_df = evaluate_with_ragas("finetuned_eval_data.json", embeddings, use_gpt4=True)

    # Step 6: Calculate NLP metrics
    nlp_df = calculate_nlp_metrics("finetuned_eval_data.json")

    # Step 7: Analyze training data (if available)
    # training_df = analyze_training_data("dataset_with_context_backup.xlsx")

    #  Load pre-calculated metrics
    # metrics_df = load_metrics_from_json("all_metrics.json")

    print("\n" + "="*60)
    print("EVALUATION COMPLETE!")
    print("="*60)
    print("\nGenerated files:")
    print("  • finetuned_eval_data.json - Generated answers")
    print("  • ragas_results_finetuned.csv/xlsx - RAGAS metrics")
    # Step 6 computes BERTScore only; no ROUGE score is produced.
    print("  • nlp_metrics_finetuned.csv - BERTScore")
    # Step 7 is commented out above, so these files are not written by default.
    print("  • training_data_quality_metrics.csv/xlsx - Training analysis (only when Step 7 is enabled)")