<a href="https://colab.research.google.com/github/rhaveri/master-thesis/blob/main/1_full_code_no_sft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's Python dependencies with pip (IPython shell escape).
!pip install langchain langchain-community chromadb sentence-transformers ollama langchain-huggingface newspaper3k trafilatura readability-lxml ragas datasets langchain_ollama openai

In [None]:
# Install zstd through apt, then run the Ollama install script downloaded from ollama.com.
!sudo apt-get install -y zstd
!curl -fsSL https://ollama.com/install.sh | sh

import os
import asyncio

async def run_ollama_serve():
    """Launch ``ollama serve`` as a background subprocess.

    The command is started through the shell with ``preexec_fn=os.setsid``,
    so the child calls ``setsid()`` before executing. This coroutine returns
    as soon as the process has been spawned; it does not wait for the server
    to be ready to accept requests.

    Returns:
        The ``asyncio.subprocess.Process`` handle of the spawned shell, so the
        caller can inspect it or terminate the server later. Callers that
        ignore the return value behave exactly as before.
    """
    process = await asyncio.create_subprocess_shell(
        'ollama serve',
        preexec_fn=os.setsid,
    )
    print("Ollama server started in the background.")
    return process

# Spawn the server now; a top-level ``await`` needs an environment that
# supports it (e.g. a notebook cell).
await run_ollama_serve()

# Block for five seconds before the next cell runs — presumably to give the
# server time to start listening; TODO confirm this delay is sufficient.
import time
time.sleep(5)

In [None]:
# Fetch the "llama3" model with the Ollama CLI; the chains below request model="llama3".
!ollama pull llama3

In [None]:
import os
import getpass
# Read the key from an interactive prompt (getpass does not echo input) and
# publish it through the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")


In [None]:
import json
import logging
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional
from urllib.parse import urlparse

from newspaper import Article, Config
from pydantic import BaseModel, HttpUrl, Field
from tqdm import tqdm

import torch
import pandas as pd
from newspaper import Article
from pydantic import BaseModel, HttpUrl, Field
from tqdm import tqdm


from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextPrecision, ContextRecall
from datasets import Dataset

# Root logger: INFO and above, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class NutritionDocument(BaseModel):
    """Schema for one scraped nutrition article.

    The fields line up with the record that ``scrape_article`` builds
    (``source``, ``title``, ``text``, ``site``, ``publish_date``). ``text``
    and ``authors`` carry defaults so that such a record validates even
    though the scraper does not collect authors, and so that code written
    against the previous schema (which had no ``text`` field) keeps working.
    """

    # Address the article was downloaded from.
    source: HttpUrl
    title: str
    # Extracted article body; empty when not supplied.
    text: str = ""
    # Host part of the source URL.
    site: str
    # Optional because the scraper's records carry no author information.
    authors: List[str] = Field(default_factory=list)
    # Publication date rendered as a string, or None when unknown.
    publish_date: Optional[str] = None

#  URL GATHERING
def get_scraper_config():
    """Return a newspaper ``Config`` that presents a desktop Chrome user agent.

    The request timeout is set to 15 (presumably seconds — confirm against
    the newspaper documentation).
    """
    scraper_cfg = Config()
    scraper_cfg.request_timeout = 15
    scraper_cfg.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    return scraper_cfg

def get_nutrition_urls() -> List[str]:
    """Return the seed article URLs, de-duplicated and sorted.

    The literal list below is grouped by publisher/topic for readability and
    intentionally tolerates repeats (a page may fit several topics); the
    duplicates are collapsed with ``set`` before returning.

    Returns:
        Every distinct URL exactly once, in ascending string order.
    """
    urls = [
        # Healthline
        "https://www.healthline.com/nutrition/healthy-eating-for-beginners", "https://www.healthline.com/nutrition/50-super-healthy-foods",
        "https://www.healthline.com/nutrition/how-to-count-macros", "https://www.healthline.com/nutrition/11-brain-foods",
        "https://www.healthline.com/nutrition/10-health-benefits-of-intermittent-fasting", "https://www.healthline.com/nutrition/how-to-eat-healthy-guide",
        "https://www.healthline.com/nutrition/healthy-eating-tips", "https://www.healthline.com/nutrition/12-best-foods-to-eat-in-morning",
        "https://www.healthline.com/nutrition/20-most-weight-loss-friendly-foods", "https://www.healthline.com/nutrition/protein-for-vegans-vegetarians",
        "https://www.healthline.com/nutrition/10-super-healthy-high-fat-foods", "https://www.healthline.com/nutrition/14-foods-to-avoid-on-low-carb",
        "https://www.healthline.com/nutrition/17-ways-to-eat-more-veggies", "https://www.healthline.com/nutrition/mediterranean-diet-meal-plan",
        "https://www.healthline.com/nutrition/vegetarian-diet-plan", "https://www.healthline.com/nutrition/keto-diet-meal-plan-and-menu",
        "https://www.healthline.com/nutrition/plant-based-diet-guide", "https://www.healthline.com/nutrition/paleo-diet-meal-plan-and-menu",
        "https://www.healthline.com/nutrition/26-evidence-based-weight-loss-tips", "https://www.healthline.com/nutrition/how-to-lose-weight-as-fast-as-possible",
        "https://www.healthline.com/nutrition/30-ways-to-lose-weight-naturally", "https://www.healthline.com/nutrition/weight-loss-plateau",
        "https://www.healthline.com/nutrition/how-to-meal-prep", "https://www.healthline.com/nutrition/easy-healthy-meals",
        "https://www.healthline.com/nutrition/healthy-dinner-ideas-for-two", "https://www.healthline.com/nutrition/soluble-vs-insoluble-fiber",
        "https://www.healthline.com/nutrition/good-carbs-bad-carbs", "https://www.healthline.com/nutrition/saturated-fat",
        "https://www.healthline.com/nutrition/how-much-water-should-you-drink-per-day", "https://www.healthline.com/nutrition/how-much-protein-per-day",
        "https://www.healthline.com/nutrition/different-types-of-fiber",
        # Harvard (Nutrition Source and Harvard Health)
        "https://www.hsph.harvard.edu/nutritionsource/healthy-eating-plate/", "https://www.hsph.harvard.edu/nutritionsource/carbohydrates/",
        "https://www.hsph.harvard.edu/nutritionsource/protein/", "https://www.hsph.harvard.edu/nutritionsource/fats/",
        "https://www.hsph.harvard.edu/nutritionsource/vitamins/", "https://www.hsph.harvard.edu/nutritionsource/water/",
        "https://www.hsph.harvard.edu/nutritionsource/healthy-weight/", "https://www.health.harvard.edu/blog/what-is-a-plant-based-diet-and-why-should-you-try-it-2018092614760",
        "https://www.hsph.harvard.edu/nutritionsource/salt-and-sodium/", "https://www.hsph.harvard.edu/nutritionsource/sugar/",
        # NIH (News in Health, PMC) and Medical News Today
        "https://newsinhealth.nih.gov/2023/08/breaking-down-food", "https://newsinhealth.nih.gov/2021/02/eating-plan-healthy-heart",
        "https://pmc.ncbi.nlm.nih.gov/articles/PMC9455721/", "https://newsinhealth.nih.gov/2015/09/better-nutrition-every-day",
        "https://newsinhealth.nih.gov/2018/03/creating-healthy-habits", "https://www.medicalnewstoday.com/articles/249190",
        "https://newsinhealth.nih.gov/2022/08/biology-breast-milk",
        # Mayo Clinic
        "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/basics/healthy-diets/hlv-20049477",
        "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/water/art-20044256",
        "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/fiber/art-20043983",
        "https://www.mayoclinichealthsystem.org/hometown-health/speaking-of-health/are-you-getting-too-much-protein",
        "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/carbohydrates/art-20045705",
        "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/fat/art-20045550",
        # CDC
        "https://www.cdc.gov/healthy-weight-growth/healthy-eating/index.html", "https://www.cdc.gov/healthy-weight-growth/healthy-eating/meals-snacks.html",
        "https://www.cdc.gov/healthyweight/healthy_eating/portion_size.html", "https://www.cdc.gov/healthyweight/healthy_eating/energy_density.html",
        "https://www.cdc.gov/healthyweight/healthy_eating/shopping_cooking_eating_out.html",

        # World Health Organization (WHO)
        "https://www.who.int/news-room/fact-sheets/detail/healthy-diet",
        "https://www.who.int/initiatives/behealthy/healthy-diet",
        "https://www.who.int/publications/i/item/9789241549950",
        "https://www.who.int/publications/i/item/9789241514873",

        # American Heart Association
        "https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/nutrition-basics/aha-diet-and-lifestyle-recommendations",
        "https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/nutrition-basics/how-to-eat-healthy-without-dieting",
        "https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/fats/saturated-fats",
        "https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/sugar/added-sugars",
        "https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/sodium/how-much-sodium-should-i-eat-per-day",

        # Academy of Nutrition and Dietetics
        "https://www.eatright.org/health/wellness/healthful-habits/healthy-weights-for-healthy-older-adults",
        "https://www.eatright.org/health/wellness/healthful-habits/eat-right-for-life",
        "https://www.eatright.org/health/wellness/healthful-habits/how-to-keep-your-immune-system-healthy",
        "https://www.eatright.org/health/wellness/nutrition-panels-and-food-labels/use-the-dietary-guidelines-myplate-and-food-labels-to-make-healthy-choices",
        "https://www.eatright.org/health/health-conditions/allergies-and-intolerances/5-ways-parents-can-keep-food-allergic-children-safe-at-school",
        "https://www.eatright.org/health/wellness/healthful-habits/how-sleep-habits-affect-healthy-weight",
        "https://www.eatright.org/fitness/sports-and-athletic-performance/beginner-and-intermediate/eat-right-to-play-hard",
        "https://www.eatright.org/food/planning/food-security-and-sustainability/reduce-plate-waste-school-home-and-eating-out",

        # British Nutrition Foundation
        "https://www.nutrition.org.uk/healthy-sustainable-diets/healthy-eating/",
        "https://www.nutrition.org.uk/healthy-sustainable-diets/healthy-and-sustainable-diets/",
        "https://www.nutrition.org.uk/healthy-sustainable-diets/hydration/",

        # Cleveland Clinic
        "https://health.clevelandclinic.org/why-youll-feel-alcohols-effects-more-after-age-65",
        "https://health.clevelandclinic.org/gluten-sensitivity-celiac-disease-wheat-allergy-differences",
        "https://health.clevelandclinic.org/benefits-of-citrus-fruits",
        "https://health.clevelandclinic.org/how-to-stop-period-cramps",
        "https://health.clevelandclinic.org/dry-cough-vs-wet-cough/",
        "https://health.clevelandclinic.org/beets-turn-poop-and-pee-red",

        # Johns Hopkins Medicine
        "https://www.hopkinsmedicine.org/health/wellness-and-prevention/intermittent-fasting-what-is-it-and-how-does-it-work",
        "https://www.hopkinsmedicine.org/health/wellness-and-prevention",
        "https://www.hopkinsmedicine.org/health/wellness-and-prevention/how-to-maintain-a-balanced-diet-as-a-vegetarian-or-vegan",
        "https://www.hopkinsmedicine.org/health/wellness-and-prevention/6-heart-health-mistakes-made-by-women-and-how-to-avoid-them",

        # Stanford Medicine
        "https://stanfordhealthcare.org/medical-clinics/bariatric-surgery/medical-weight-loss-program.html",
        "https://med.stanford.edu/news/insights/2025/07/ultra-processed-food--five-things-to-know.html",
        "https://med.stanford.edu/metabolichealthcenter/faq.html",

        # UCSF Health
        "https://www.ucsfhealth.org/education/top-ten-foods-for-health",
        "https://www.ucsfhealth.org/education/tips-for-staying-healthy",
        "https://www.ucsfhealth.org/education/nutrition-tips-for-inflammatory-bowel-disease",
        "https://www.ucsfhealth.org/education/healthy-lifestyles-healthy-outlook",

        # National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK)
        "https://www.niddk.nih.gov/health-information/weight-management/healthy-eating-physical-activity-for-life/health-tips-for-adults",
        "https://www.niddk.nih.gov/health-information/weight-management/take-charge-health-guide-teenagers",
        "https://www.niddk.nih.gov/health-information/weight-management/just-enough-food-portions",
        "https://www.niddk.nih.gov/health-information/weight-management",

        # American Diabetes Association
        "https://diabetes.org/food-nutrition",
        "https://diabetes.org/food-nutrition/eating-healthy",
        "https://diabetes.org/food-nutrition/reading-food-labels",

        # International Food Information Council
        "https://foodinsight.org/healthy-eating/",
        "https://foodinsight.org/nutrition-101/",
        "https://foodinsight.org/dietary-guidelines/",

        # Precision Nutrition
        "https://www.precisionnutrition.com/all-about-nutrition",
        "https://www.precisionnutrition.com/healthy-eating",
        "https://www.precisionnutrition.com/calorie-control-guide-infographic",

        # NHS and Examine.com (evidence-based nutrition)
        "https://www.nhs.uk/live-well/eat-well/how-to-eat-a-balanced-diet/eight-tips-for-healthy-eating/",
        "https://examine.com/?srsltid=AfmBOorw9zcTbMhQi2Oy7r5jPo5M-_IUN-mixFE-os23YRgpakZ9wRyc",
        "https://examine.com/guides/fat-loss/?srsltid=AfmBOoooFUTnIV0J2e2OD9VfC4FjqDmSyFbjAKa3_YUMf0xyu1Bq_q4Z",

        # Mindful Eating & Psychology
        "https://www.psychologytoday.com/us/blog/mind-body-food/202009/the-psychology-nutrition",
        "https://www.health.harvard.edu/staying-healthy/8-steps-to-mindful-eating",
        "https://www.urmc.rochester.edu/encyclopedia/content.aspx?contenttypeid=1&contentid=4466",
        "https://www.psychologytoday.com/us/basics/diet",

        # Sports & Exercise Nutrition
        "https://www.scribd.com/document/655881153/protein-intake-for-optimal-muscle-maintenance",
        "https://www.mayoclinic.org/healthy-lifestyle/fitness/in-depth/exercise/art-20045506",
        "https://www.acefitness.org/resources/everyone/blog/7599/the-importance-of-nutrition-for-exercise/",

        # Gut Health & Microbiome
        "https://www.hsph.harvard.edu/nutritionsource/microbiome/",
        "https://www.mayoclinic.org/healthy-lifestyle/consumer-health/expert-answers/probiotics/faq-20058065",
        "https://www.health.harvard.edu/staying-healthy/can-gut-bacteria-improve-your-health",

        # Food Safety & Sustainability
        "https://www.fda.gov/food/buy-store-serve-safe-food/selecting-and-serving-produce-safely",
        "https://www.epa.gov/sustainable-management-food/sustainable-management-food-basics",
        "https://www.foodsafety.gov/food-safety-charts/safe-minimum-internal-temperatures",

        # Cultural & Special Diets
        "https://oldwayspt.org/traditional-diets/mediterranean-diet",
        "https://oldwayspt.org/explore-heritage-diets/asian-heritage-diet/",

        # Aging & Nutrition
        "https://www.nia.nih.gov/health/healthy-eating-nutrition-and-diet/healthy-meal-planning-tips-older-adults",
        "https://www.nia.nih.gov/health/healthy-eating-nutrition-and-diet/maintaining-healthy-weight",
        "https://www.nia.nih.gov/health/healthy-eating-nutrition-and-diet/how-much-should-i-eat-quantity-and-quality",

        # Pediatric & Family Nutrition
        "https://www.healthychildren.org/English/healthy-living/nutrition/Pages/default.aspx",
        "https://www.cdc.gov/nutrition/infantandtoddlernutrition/index.html",

        # Mental Health & Nutrition
        "https://www.health.harvard.edu/blog/nutritional-psychiatry-your-brain-on-food-201511168626",
        "https://www.psychologytoday.com/us/blog/evidence-based-living/202001/the-foods-we-eat-do-affect-our-mental-health-heres-the-proof",
        "https://www.mentalhealth.org.uk/explore-mental-health/a-z-topics/diet-and-mental-health",

        # Sleep & Nutrition
        "https://www.sleepfoundation.org/nutrition",
        "https://www.health.harvard.edu/staying-healthy/8-secrets-to-a-good-nights-sleep",
        "https://www.hopkinsmedicine.org/news/articles/2020/11/how-healthy-sleep-and-weight-may-improve-survival",

        # Hydration & Electrolytes (the Mayo Clinic water page repeats an earlier entry)
        "https://www.hsph.harvard.edu/news/hsph-in-the-news/the-importance-of-hydration/",
        "https://www.mayoclinic.org/healthy-lifestyle/nutrition-and-healthy-eating/in-depth/water/art-20044256",
        "https://www.cedars-sinai.org/blog/electrolytes.html",

        # Food Allergies & Intolerances
        "https://www.foodallergy.org/resources/food-allergy-myths-and-factss",
        "https://www.mayoclinic.org/diseases-conditions/food-allergy/symptoms-causes/syc-20355095",
        "https://www.aaaai.org/tools-for-the-public/conditions-library/allergies/food-intolerance",

        # Budget & Economic Nutrition
        "https://www.choosemyplate.gov/budget",
        "https://www.eatright.org/food/planning-and-prep/eat-right-on-a-budget",
        "https://www.ers.usda.gov/topics/food-nutrition-assistance/food-security-in-the-u-s/",

        # Cooking & Food Preparation (the CDC meals-snacks page repeats an earlier entry)
        "https://www.heart.org/en/healthy-living/healthy-eating/cooking-skills",
        "https://www.cdc.gov/healthy-weight-growth/healthy-eating/meals-snacks.html",
        "https://www.fda.gov/food/buy-store-serve-safe-food/safe-food-handling",
    ]
    # sorted() accepts any iterable, so the set needs no intermediate list.
    return sorted(set(urls))

def scrape_article(url: str) -> Optional[dict]:
    """Download and parse a single article into a plain, JSON-serialisable dict.

    Args:
        url: Address of the article to fetch.

    Returns:
        A dict with the keys ``source``, ``title``, ``text``, ``site`` (the
        URL's network location) and ``publish_date`` (``None`` when the parser
        found no date), or ``None`` when the download/parse raised or no body
        text could be extracted.
    """
    try:
        # One shared place defines the user agent and timeout.
        article = Article(url, config=get_scraper_config())
        article.download()
        article.parse()

        # nutrition_keywords = ['nutrition', 'diet', 'food', 'eating', 'healthy', 'weight']
        # if sum(keyword in article.text.lower() for keyword in nutrition_keywords) < 3:
        #     logging.warning(f"Skipped (not nutrition-focused): {url}")
        #     return None

        text = article.text.strip()
        if not text:
            # A record without body text cannot be chunked or retrieved later.
            logging.warning(f"Skipped (no text extracted): {url}")
            return None

        return {
            "source": str(url),
            "title": str(article.title),
            "text": text,
            "site": str(urlparse(url).netloc),
            "publish_date": str(article.publish_date) if article.publish_date else None
        }
    except Exception as e:
        # Deliberately broad: scraping is best-effort, and one failing page
        # must not stop the rest of the batch.
        logging.error(f"Failed: {url} - {e}")
        return None


def collect_articles(urls: List[str], max_workers: int = 10) -> List[dict]:
    """Scrape many URLs concurrently with a thread pool.

    Args:
        urls: Article addresses to download.
        max_workers: Upper bound on simultaneous scraping threads.

    Returns:
        The records returned by ``scrape_article`` for every URL that
        produced one, in completion order (not input order).
    """
    documents = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its URL so a failure can be attributed.
        futures = {executor.submit(scrape_article, url): url for url in urls}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Scraping"):
            try:
                result = future.result()
            except Exception as e:
                # Keep going if a task raises; losing one page is acceptable.
                logging.error(f"Failed: {futures[future]} - {e}")
                continue
            if result:
                documents.append(result)

    logging.info(f"Successfully scraped {len(documents)}/{len(urls)} articles")
    return documents




In [None]:
# VECTOR DATABASE SETUP
def prepare_and_split_documents(json_path: str) -> List[Document]:
    """Read the scraped-article JSON and cut every article into chunks.

    Each record's body is taken from its ``"text"`` key, falling back to
    ``"content"``; records with no body are dropped. The remaining articles
    are split into chunks of 1000 characters with 200 characters of overlap,
    each carrying ``source`` and ``title`` metadata.

    Raises:
        FileNotFoundError: ``json_path`` does not exist.
        ValueError: the file parsed to an empty value.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
    except FileNotFoundError:
        raise FileNotFoundError(f" Could not find {json_path}. Did the scraper run?")

    if not records:
        raise ValueError(" JSON file is empty!")

    print(f"Keys in first document: {list(records[0].keys())}")

    # Pair each record with its resolved body so the lookup happens once.
    with_bodies = (
        (record, record.get("text", "") or record.get("content", ""))
        for record in records
    )
    langchain_docs = [
        Document(
            page_content=body,
            metadata={
                "source": record.get("source", ""),
                "title": record.get("title", "")
            }
        )
        for record, body in with_bodies
        if body
    ]

    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_documents(langchain_docs)

    print(f" Loaded {len(langchain_docs)} docs, split into {len(chunks)} chunks.")
    return chunks


def build_vector_store(chunks: List[Document]) -> Chroma:
    """Embed the chunks and store them in a Chroma collection.

    Embeddings come from the ``BAAI/bge-large-en-v1.5`` model, run on CUDA
    when ``torch`` reports it as available and on the CPU otherwise.

    Args:
        chunks: Documents to embed and index.

    Returns:
        The Chroma vector store built from ``chunks``, using the
        ``chroma_db`` persist directory.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logging.info(f"Using device: {device}")

    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={'device': device},
    )

    # NOTE(review): with a fixed persist_directory, re-running may add the
    # same chunks to an existing collection again — confirm against the
    # Chroma version in use.
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory="chroma_db",
    )
    return vector_store


In [None]:
# RAG CHAIN (Question Answering)

def create_rag_system(vector_store: Chroma):
    """Assemble the retrieve -> format -> generate question-answering chain.

    Retrieval uses MMR search over ``vector_store`` (k=5 out of 20 fetched
    candidates, lambda_mult=0.3); generation uses the ``llama3`` model via
    ``ChatOllama`` at temperature 0.2.

    Returns:
        A runnable that takes a question string and produces a mapping with
        the retrieved documents under ``"context"``, the original input under
        ``"question"`` and the model's text under ``"answer"``.
    """
    template = ChatPromptTemplate.from_template("""
You are an AI nutrition coach. Answer based ONLY on the context below.

Context:
{context}

Question: {question}

Answer:""")

    mmr_settings = {'k': 5, 'fetch_k': 20, 'lambda_mult': 0.3}
    retriever = vector_store.as_retriever(
        search_type="mmr",
        search_kwargs=mmr_settings,
    )

    llm = ChatOllama(model="llama3", temperature=0.2)

    def _prompt_inputs(state):
        # Collapse the retrieved documents into one text block for the prompt.
        merged = "\n\n".join(doc.page_content for doc in state["context"])
        return {"context": merged, "question": state["question"]}

    # Generation branch: runs on the retrieval output and yields a string.
    answer_step = RunnableLambda(_prompt_inputs) | template | llm | StrOutputParser()

    # Retrieval stage: fetch documents while passing the question through.
    retrieval_step = {"context": retriever, "question": RunnablePassthrough()}

    # assign() keeps "context" and "question" and adds "answer" beside them.
    return retrieval_step | RunnablePassthrough.assign(answer=answer_step)



In [None]:
# SAFETY GUARDRAILS

def check_safety(question: str, llm) -> Optional[str]:
    """Ask the LLM whether a query is an unsafe medical request.

    The model is told to answer with the single word SAFE or UNSAFE. Any
    reply containing ``UNSAFE`` (case-insensitive) is treated as unsafe;
    every other reply is treated as safe.

    Returns:
        A fixed referral message when the query is judged unsafe, otherwise
        ``None``.
    """
    classifier_prompt = ChatPromptTemplate.from_template("""
Classify this query as SAFE or UNSAFE:

UNSAFE = Medical emergency, diagnosis request, or prescription request
SAFE = General nutrition questions, definitions, or hypotheticals

Query: {question}

Reply with only one word: SAFE or UNSAFE
""")

    classifier = classifier_prompt | llm | StrOutputParser()
    raw_reply = classifier.invoke({"question": question})
    label = raw_reply.strip().upper()

    if "UNSAFE" not in label:
        return None
    return " I'm a nutrition coach, not a doctor. Please consult a healthcare professional."


# EVALUATION (RAGAS Metrics)

def evaluate_system(rag_chain, questions: List[str], embeddings):
    """Score the RAG chain with four RAGAS metrics.

    Metrics:
    1. Faithfulness: Does the answer stick to the retrieved context?
    2. Answer Relevancy: Is the answer useful for the question?
    3. Context Precision: Did the retriever find the right documents?
    4. Context Recall: Does the retrieved context cover the ground truth?

    The evaluation set (question + ground truth per entry) is read from
    ``ragas_all_metics.json`` in the working directory; every question is run
    through ``rag_chain`` and the outputs are judged by ``gpt-4o-mini``.

    Args:
        rag_chain: Runnable whose ``invoke(question)`` result exposes
            ``"answer"`` and ``"context"`` (documents with ``page_content``).
        questions: Currently unused — the questions come from the JSON file.
            Kept so existing callers do not break.
        embeddings: Embedding model handed to AnswerRelevancy and ``evaluate``.

    Returns:
        A DataFrame of per-sample scores (also written to
        ``ragas_results.csv``), or ``None`` when the input file is missing.
    """
    INPUT_FILE = "ragas_all_metics.json"

    if not os.path.exists(INPUT_FILE):
        print(f" {INPUT_FILE} not found.")
        return None

    # Imported here so the function does not depend on a separate import
    # cell having been executed first.
    from langchain_openai import ChatOpenAI

    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    eval_data = {
        "question": [],
        "answer": [],
        "contexts": [],
        "ground_truth": []
    }

    for entry in tqdm(raw_data):
        q = entry.get("question")
        gt = entry.get("ground_truth")

        if not q:
            # Nothing to ask the chain; skip instead of invoking it with None.
            logging.warning(f"Skipping evaluation entry without a question: {entry!r}")
            continue

        result = rag_chain.invoke(q)

        eval_data["question"].append(q)
        eval_data["answer"].append(result["answer"])
        eval_data["contexts"].append([doc.page_content for doc in result["context"]])
        eval_data["ground_truth"].append(gt)

    dataset = Dataset.from_dict(eval_data)

    evaluator = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    metrics = [
        Faithfulness(llm=evaluator),
        AnswerRelevancy(llm=evaluator, embeddings=embeddings),
        ContextPrecision(llm=evaluator),
        ContextRecall(llm=evaluator),
    ]

    results = evaluate(
        dataset=dataset,
        metrics=metrics,
        llm=evaluator,
        embeddings=embeddings,
        raise_exceptions=False
    )
    df = results.to_pandas()
    print(df[["faithfulness", "answer_relevancy", "context_precision", "context_recall"]].mean())
    df.to_csv("ragas_results.csv", index=False)
    logging.info("Results saved to ragas_results.csv")

    return df


In [None]:
#  TRAINING DATA PREPARATION

def prepare_training_data(excel_file: str, vector_store: Chroma, output_file: str = "training.jsonl"):
    """Turn Q&A pairs plus retrieved context into chat-format training records.

    For every row, the three chunks most similar to the question are fetched
    from ``vector_store`` and placed in the user message next to the
    question; the row's answer becomes the assistant message. One JSON
    object per line is written to ``output_file``.

    Args:
        excel_file: Spreadsheet with ``question`` and ``answer`` columns.
        vector_store: Store queried for the supporting context.
        output_file: Destination JSONL path.

    Returns:
        ``output_file`` on success, or ``None`` when ``excel_file`` does not
        exist.
    """
    if not os.path.exists(excel_file):
        print(f" {excel_file} not found. ")
        return None

    df = pd.read_excel(excel_file)
    # Drop rows missing either side of the pair *before* casting to str;
    # otherwise a blank answer becomes the literal string "nan" and ends up
    # as a training target.
    df = df.dropna(subset=['question', 'answer']).astype(str)
    logging.info(f"Loaded {len(df)} training examples")

    training_examples = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing training data"):
        docs = vector_store.similarity_search(row['question'], k=3)
        context = "\n\n".join(doc.page_content for doc in docs)

        example = {
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional nutrition coach. Answer based on provided context."
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion: {row['question']}"
                },
                {
                    "role": "assistant",
                    "content": row['answer']
                }
            ]
        }
        training_examples.append(example)

    with open(output_file, 'w', encoding='utf-8') as f:
        for example in training_examples:
            json.dump(example, f)
            f.write('\n')

    logging.info(f" Training data saved to {output_file}")
    return output_file



In [None]:
 from langchain_openai import ChatOpenAI


In [None]:
# MAIN EXECUTION

if __name__ == "__main__":
    # End-to-end run: scrape -> save corpus -> index -> smoke-test questions
    # -> optional RAGAS evaluation -> optional training-data export.
    output_file = "nutrition_documents_v2.json"

    urls = get_nutrition_urls()
    docs = collect_articles(urls)

    if not docs:
        print(" CRITICAL: No documents scraped. Check internet or URL list.")
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(docs, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(docs)} documents (checked 'text' field present).")

    if not os.path.exists(output_file):
        # Every later step needs the vector store; stop here with a clear
        # message instead of failing on an undefined name further down.
        print(f" CRITICAL: {output_file} not found. Cannot build the RAG system.")
    else:
        # A corpus file left by an earlier run is still used when the scrape
        # above produced nothing.
        chunks = prepare_and_split_documents(output_file)
        vector_store = build_vector_store(chunks)
        # build_vector_store creates its own embedding model internally; this
        # separate instance is the one handed to the RAGAS evaluation.
        embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
        rag_chain = create_rag_system(vector_store)

        safety_llm = ChatOllama(model="llama3", temperature=0)

        test_questions = [
            "How can I create balanced meals when I have limited food options at home?",
            "How can I include more plant-based foods without fully changing my diet?",
            "How can I improve digestion by adjusting how I eat, not just what I eat?",
            "How can I maintain healthy eating habits when my routine changes often?",
            "How can I safely increase movement if I feel stiff or inactive?",
            "How can I improve sleep quality if I wake up during the night?",
            "How can I manage daily stress without feeling overwhelmed?",
            "Why does skipping meals often lead to overeating later?",
            "What should I do if I feel tired despite eating regularly?",
            "What should I do if I feel mild stiffness or discomfort after sitting too long?",
        ]

        for question in test_questions:
            print(f"\n {question}")

            warning = check_safety(question, safety_llm)
            if warning:
                print(f" {warning}")
                continue

            result = rag_chain.invoke(question)
            print(f" {result['answer']}")

        if os.path.exists("questions.json"):
            with open("questions.json", "r") as f:
                raw_data = json.load(f)

            # The file may hold bare strings or objects with a 'question'
            # key; an empty file yields an empty list.
            if raw_data and isinstance(raw_data[0], str):
                questions = raw_data
            else:
                questions = [x['question'] for x in raw_data]

            print(f"Evaluating {len(questions)} questions...")
            eval_results = evaluate_system(rag_chain, questions, embeddings)

            # evaluate_system returns None when its input file is missing.
            if eval_results is not None:
                print("\n=== RAGAS RESULTS ===")
                cols = ['user_input', 'faithfulness', 'answer_relevancy', 'context_precision']
                print(eval_results[[c for c in cols if c in eval_results.columns]].head())

        if os.path.exists("dataset - sft.xlsx"):
            print("Preparing Training Data...")
            df = pd.read_excel("dataset - sft.xlsx")

            df['question'] = df['question'].astype(str)
            df['answer'] = df['answer'].astype(str)

            df.to_excel("temp_fixed_dataset.xlsx", index=False)
            prepare_training_data("temp_fixed_dataset.xlsx", vector_store)
        else:
            print(" 'dataset - sft.xlsx' not found.")