### 1- Scrape and Extract Website Data
Use BeautifulSoup or Scrapy to scrape text.

## BeautifulSoup Approach

In [14]:
import requests #Used to send HTTP requests to fetch web pages
from bs4 import BeautifulSoup #Parses HTML content to extract useful information.
import re #Handles regular expressions for text cleaning.
import nltk #Natural Language Toolkit for text processing (e.g., sentence tokenization)
from collections import OrderedDict 
import difflib #Compares text similarity.
from cleantext import clean
import ftfy #Fixes Unicode issues in text.
from unidecode import unidecode #Converts Unicode text to ASCII.
from concurrent.futures import ThreadPoolExecutor, as_completed #Enables parallel processing 

# Function to check similarity
def is_similar(a, b, threshold=0.9): #still needs to be tuned
    return difflib.SequenceMatcher(None, a, b).ratio() > threshold

# Download NLTK sentence tokenizer which is used to split text into sentences.
nltk.download('punkt')

# List of URLs to fetch
urls = [
    "https://www.desy.de/index_eng.html",
    "https://www.desy.de/news/index_eng.html",
    "https://www.desy.de/about_desy/desy/index_eng.html",
    "https://www.desy.de/contact/index_eng.html",
    "https://www.desy.de/about_desy/directorate/helmut_dosch/index_eng.html",
    "https://www.desy.de/research/index_eng.html",
    "https://www.desy.de/research/accelerators/index_eng.html",
    "https://www.desy.de/research/photon_science/index_eng.html",
    "https://www.desy.de/research/particle_physics/index_eng.html",
    "https://www.desy.de/research/astroparticle_physics/index_eng.html",
    "https://www.desy.de/research/facilities__projects/index_eng.html",
    "https://www.desy.de/research/cooperations__institutes/index_eng.html",
    "https://www.desy.de/research/facilities__projects/european_xfel/index_eng.html",
    "https://www.desy.de/research/facilities__projects/petra_iv/index_eng.html"
    # "https://it.desy.de/index_eng.html",
    # "https://it.desy.de/availability/index_eng.html",
    # "https://it.desy.de/help_uco/index_eng.html",
    # "https://it.desy.de/e16/e2036/e55767/index_eng.html?preview=preview"
]

# Function to fetch and clean webpage content
def fetch_and_clean_url(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        print(f"Fetching {url} - Status Code: {response.status_code}")
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

# Function to extract main content- fallback mechanism!
# def extract_main_content(soup):
#     for tag in soup(['nav', 'footer', 'script', 'style', 'aside', 'header', 'form', 'iframe', 'img']):# Remove unwanted tags (e.g., nav, footer, script, style)
#         tag.decompose()
#     return soup.find('div', id='content') or soup.find('body') #body tag contains all the visible content of the webpage

def extract_main_content(soup):
    for tag in soup(['nav', 'footer', 'script', 'style', 'aside', 'header', 'form', 'iframe', 'img']):
        tag.decompose()
    
    # Prioritize content divs
    main_content = soup.find('div', id='content') or soup.find('main') or soup.find('body')
    
    if main_content:
        return main_content
    else:
        return soup  # Fallback to entire page if no main content is found

# Function to filter tags based on exclusion keywords
def filter_tags(tags, excluded_keywords):
    return [tag for tag in tags if not any(kw in tag.get_text().lower() for kw in excluded_keywords)]

# Function to clean the extracted text
def clean_text(raw_text):
    raw_text = ftfy.fix_text(raw_text)  # Fix Unicode issues
    raw_text = unidecode(raw_text)  # Convert to ASCII
    raw_text = re.sub(r'http\S+', '', raw_text)  # Remove URLs
    raw_text = re.sub(r'\d{2}\.\d{2}\.\d{4}', '', raw_text)  # Remove dates
  # raw_text = re.sub(r'\b\w{1,2}\b', '', raw_text)  # Remove single or double-letter words

    raw_text = re.sub(r'\s+', ' ', raw_text)  # Normalize spaces
    raw_text = re.sub(r'\[.*?\]', '', raw_text)  # Remove content inside square brackets
    raw_text = re.sub(r'\n+', '\n', raw_text)  # Remove excess newlines
    raw_text = re.sub(r'\s([?.!,;])', r'\1', raw_text)  # Fix spacing before punctuation
    
    raw_text = re.sub(r'([a-z])([A-Z])', r'\1. \2', raw_text)  # Fix missing spaces between sentences
    raw_text = re.sub(r'(?<=[a-zA-Z])\s*\n\s*(?=[a-zA-Z])', '. ', raw_text)  # Convert newlines to periods if missing punctuation
    return raw_text.strip()






def process_chunks(text):
    sentences = nltk.sent_tokenize(text)
    chunks = [sentence.strip() for sentence in sentences if len(sentence.split()) > 5]  # Keep only meaningful sentences
    
    unique_chunks = OrderedDict()  # Maintain order while removing duplicates
    for sentence in chunks:
        normalized_sentence = sentence.lower().strip()
        if not any(is_similar(normalized_sentence, existing) for existing in unique_chunks):
            unique_chunks[normalized_sentence] = sentence  # Store original sentence
    
    return list(unique_chunks.values())  # Return unique, meaningful chunks

#===============================================================================
# Function to handle the entire extraction process for a URL
def extract_text_from_url(url):
    content = fetch_and_clean_url(url)
    if content:
        soup = BeautifulSoup(content, "html.parser")
        main_content = extract_main_content(soup)
        if main_content:
            tags = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li', 'article', 'section', 'td', 'th'], lang="en")
            if not tags:
                tags = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li', 'article', 'section', 'td', 'th'])
            raw_text = " ".join(tag.get_text(strip=True) for tag in filter_tags(tags, excluded_keywords))
            cleaned_text = clean_text(raw_text)
            return process_chunks(cleaned_text)
    return []

# Initialize a list to store all the extracted chunks
all_text_chunks = []
excluded_keywords = [
    "contact", "privacy", "terms", "login", "menu", "search", 
    "subscribe", "cookie", "policy", "newsletter", "copyright", 
    "footer", "disclaimer", "faq", "sitemap"
]

# Function to process URLs in parallel
def extract_text_from_urls(urls):
    all_text_chunks = []
    with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers based on your system
        # Submit tasks to the executor
        future_to_url = {executor.submit(extract_text_from_url, url): url for url in urls}
        
        # Process completed tasks
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                unique_chunks = future.result()
                all_text_chunks.extend(unique_chunks)
                print(f"Extracted {len(unique_chunks)} meaningful text chunks from {url}")
            except Exception as e:
                print(f"Error processing {url}: {e}")
    return all_text_chunks

# Use parallel processing to extract text from all URLs
all_text_chunks = extract_text_from_urls(urls)

# Final output

for i, chunk in enumerate(all_text_chunks[:5]):  # Print first 5 chunks
    print(f"Chunk {i+1}: {chunk}\n")

# Save chunks to a text file
with open("desy_content.txt", "w", encoding="utf-8") as f:
    for chunk in all_text_chunks:
        f.write(chunk + "\n")

if not all_text_chunks:
    raise ValueError("No text chunks were extracted from the URLs.")

print(f"Total extracted chunks: {len(all_text_chunks)}")

[nltk_data] Downloading package punkt to /home/taheri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching https://www.desy.de/news/index_eng.html - Status Code: 200
Extracted 3 meaningful text chunks from https://www.desy.de/news/index_eng.html
Fetching https://www.desy.de/about_desy/desy/index_eng.html - Status Code: 200
Fetching https://www.desy.de/contact/index_eng.html - Status Code: 200
Fetching https://www.desy.de/index_eng.html - Status Code: 200
Extracted 8 meaningful text chunks from https://www.desy.de/about_desy/desy/index_eng.html
Extracted 7 meaningful text chunks from https://www.desy.de/contact/index_eng.html
Fetching https://www.desy.de/research/index_eng.html - Status Code: 200
Fetching https://www.desy.de/about_desy/directorate/helmut_dosch/index_eng.html - Status Code: 200
Extracted 8 meaningful text chunks from https://www.desy.de/research/index_eng.html
Extracted 8 meaningful text chunks from https://www.desy.de/about_desy/directorate/helmut_dosch/index_eng.html
Fetching https://www.desy.de/research/accelerators/index_eng.html - Status Code: 200
Fetching https

### 2-Convert Data into Embeddings (Vector Database)
Once we collect DESY’s text, we store it in a vector database for fast retrieval.
- Use OpenAI’s text-embedding-ada-002 or Hugging Face models (all-MiniLM-L6-v2).
- Store the vectors in FAISS, Pinecone, KDB, or ChromaDB.

| **Tool**       | **Performance** | **Scalability** | **Ease of Use** | **Cost** | **Support for Distance Metrics** | **Cloud Integration** |
|----------------|-----------------|-----------------|-----------------|----------|----------------------------------|-----------------------|
| **FAISS**      | Very Fast (especially for large datasets) | High (needs setup for very large data) | Medium (requires setup, more control) | Free (open-source) | Cosine, Euclidean, Inner product, etc. | Works with local or cloud setups |
| **Pinecone**   | Fast (optimized for similarity search) | Very High (highly scalable) | Very Easy (cloud-based, API-driven) | Pay-as-you-go (expensive for large datasets) | Cosine, Euclidean, Dot product, etc. | Cloud-based (fully managed service) |
| **KDB**        | Fast (best for time-series data, but can be used for vector search) | Very High (designed for large datasets) | Medium (requires more setup for vector storage) | Expensive (enterprise-focused) | Custom metrics (can handle various types) | Cloud-based (supports large-scale data) |
| **ChromaDB**   | Fast (optimized for embeddings, especially with vector search) | Medium (good scalability but less than Pinecone) | Easy (cloud and local integration) | Free with limited usage, Paid for large scale | Cosine, Euclidean, Inner product | Both cloud and local setups |



**GOAL :** evaluates the performance of multiple embedding models (both Hugging Face and OpenAI models) 
Cosine similarity method: It measures how close the embeddings of the query and documents are in the vector space. 

**What I learned:** 
- This doesn't always correlate with the quality of the retrieved documents for a specific task.
- Some models are better suited for specific tasks. For example, all-mpnet-base-v2 is a general-purpose model, while multi-qa-mpnet-base-dot-v1 is optimized for question-answering tasks.

**How to improve:**
- Instead of relying solely on cosine similarity, evaluate the models based on task-specific metrics (e.g., Precision@K, Recall@K, Mean Reciprocal Rank (MRR), NDCG, or F1 score for document retrieval)
- Embedding models fine tuning on our specific dataset to improve their performance.




In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
# import torch
import random
# import os
# from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
# from langchain.vectorstores import FAISS


import os
import torch
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.vectorstores import FAISS
from urllib.request import urlretrieve
from langchain_huggingface import HuggingFaceEmbeddings

os.environ["OPENAI_API_BASE"] = "https://api.helmholtz-blablador.fz-juelich.de/v1"
os.environ["OPENAI_API_KEY"] = "glpat-QJocA6joz1gm7XD8rrRo"


# Free up GPU memory if using CUDA
torch.cuda.empty_cache()


# Define a list of queries
queries = [
    "How many employees does DESY have?"
   # "What are the main research fields at DESY?",
   # "Who is the Chairman of the DESY Board of Directors?",
   # "Who is head of IT department?",
   # "What is European XFEL, and how is DESY involved?",
   # "How does DESY contribute to particle physics?"
]

# Function to preprocess text for consistency -> ensures fair comparisons by removing inconsistencies like capitalization and extra whitespace.
def preprocess_text(text):
    return text.strip().lower()  # Lowercasing and trimming spaces

# Apply preprocessing to each query in the list
queries = [preprocess_text(query) for query in queries]



# Define a test query
#query = "What are the main research fields at DESY?"


# Apply preprocessing to the query
#query = preprocess_text(query)

# Use a subset of all_text_chunks to reduce memory usage (e.g., 100 random chunks)
subset_text_chunks = random.sample(all_text_chunks, 100)
subset_text_chunks = [preprocess_text(text) for text in subset_text_chunks]

# List of Hugging Face and OpenAI models to evaluate

models = [
    "text-davinci-003", # OpenAI
    "text-embedding-ada-002", # OpenAI 
    "multi-qa-mpnet-base-dot-v1", #very sensitive to normalization & For multilingual support
    "all-MiniLM-L6-v2", #nor
    "msmarco-distilbert-base-v4", #nor
    "all-mpnet-base-v2",
    "paraphrase-MiniLM-L6-v2",
    "distiluse-base-multilingual-cased-v1" #nor
    #"GritLM-7B",  #"alias-embeddings",
    #"gpt-3.5-turbo" # (from OpenAI-bot not designed for embedding-gets the same score as text-embedding-ada-002)
]

# Iterate over each model
for model_name in models:
    print(f"Processing model: {model_name}")

    # Create an embedding instance for the current model
    if model_name in ["text-davinci-003", "text-embedding-ada-002"]:
        embeddings = OpenAIEmbeddings(model=model_name)
    else:
        embeddings = HuggingFaceEmbeddings(model_name=model_name)



        # Iterate over each query
    for query in queries:
        #print(f"Evaluating query: {query}")

        # Compute the normalized query embedding
        query_embedding = embeddings.embed_query(query)
        query_embedding = normalize([query_embedding])[0]

        # Build a FAISS index from the text chunks
        vectorstore = FAISS.from_texts(subset_text_chunks, embeddings)

        # Perform a similarity search for the query (retrieve the top 5 documents)
        docs = vectorstore.similarity_search(query, k=5)

        # Compute the average cosine similarity for the top 5 retrieved documents
        similarity_scores = []
        for doc in docs:
            doc_content = preprocess_text(doc.page_content)
            
            # Get the non-normalized embedding for the document
            doc_embedding = embeddings.embed_query(doc_content)

            # Normalize the document embedding
            doc_embedding_normalized = normalize([doc_embedding])[0]

            # Compute the cosine similarity between the query and the document
            similarity = cosine_similarity([query_embedding], [doc_embedding_normalized])[0][0]
            similarity_scores.append(similarity)

        # Calculate the average similarity score for the query
        avg_similarity_score = sum(similarity_scores) / len(similarity_scores)

        
        print(f"Model: {model_name} -- Query: {query}\n-- Normalized Average Top-5 Document Similarity Score: {round(avg_similarity_score, 2)}\n")
    
        # Clean up FAISS index to free memory
        del vectorstore
        torch.cuda.empty_cache()  # Clear GPU memory if applicable

    print("===============================")




Processing model: text-davinci-003
Model: text-davinci-003 -- Query: how many employees does desy have?
-- Normalized Average Top-5 Document Similarity Score: 0.56

Processing model: text-embedding-ada-002
Model: text-embedding-ada-002 -- Query: how many employees does desy have?
-- Normalized Average Top-5 Document Similarity Score: 0.56

Processing model: multi-qa-mpnet-base-dot-v1
Model: multi-qa-mpnet-base-dot-v1 -- Query: how many employees does desy have?
-- Normalized Average Top-5 Document Similarity Score: 0.61

Processing model: all-MiniLM-L6-v2
Model: all-MiniLM-L6-v2 -- Query: how many employees does desy have?
-- Normalized Average Top-5 Document Similarity Score: 0.52

Processing model: msmarco-distilbert-base-v4
Model: msmarco-distilbert-base-v4 -- Query: how many employees does desy have?
-- Normalized Average Top-5 Document Similarity Score: 0.43

Processing model: all-mpnet-base-v2
Model: all-mpnet-base-v2 -- Query: how many employees does desy have?
-- Normalized Ave

## Creating and storing vector space

In [15]:


import os
import torch
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings




import os
import torch
#from urllib.request import urlretrieve
#from langchain_openai import ChatOpenAI
#from langchain_openai import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader

# You need to set them as environment variables, because the OpenAI API client uses them
# multiple times
os.environ["OPENAI_API_BASE"] = "https://api.helmholtz-blablador.fz-juelich.de/v1"
os.environ["OPENAI_API_KEY"] = "glpat-QJocA6joz1gm7XD8rrRo"


# Set environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear GPU memory
torch.cuda.empty_cache()


# Step 1: Load the FAISS index (if it exists)
# FAISS Index Directory
faiss_index_path = "desy_faiss_index"
os.makedirs(faiss_index_path, exist_ok=True)

# Initialize HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
##embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")

# Initialize OpenAI embeddings
#embeddings = OpenAIEmbeddings(model="text-davinci-003") 
#embeddings = OpenAIEmbeddings(model="text-embedding-ada-002") 

# Create and save the vector store
vectorstore = FAISS.from_texts(all_text_chunks, embeddings)
vectorstore.save_local(faiss_index_path)

### 3-Build the RAG Pipeline (LLM + Retrieval)
When a user asks a question, we:
- **Retrieve relevant DESY documents** from the vector database.
- **Feed them into an LLM** (LLaMA 2, GPT, or Mistral) to generate responses.


Example: Retrieval + LLM Response



| **Scenario**               | **Use "stuff"** | **Use "refine"** | **Use "map_reduce"** |
|----------------------------|---------------|---------------|----------------|
| Short documents            | ✅ | ❌ | ❌ |
| Long documents             | ❌ (token limit issue) | ✅ | ✅ |
| Needs detailed reasoning   | ❌ | ✅ (step-by-step) | ❌ |
| Needs summarization        | ❌ | ❌ | ✅ |
| Avoiding bias from a single doc | ❌ | ❌ | ✅ |



## chain_type= "stuff"
how the retrieved documents are combined and processed

In [16]:
API_KEY = "glpat-QJocA6joz1gm7XD8rrRo"
API_URL = "https://api.helmholtz-blablador.fz-juelich.de/v1/models"  # Correct endpoint to list models


from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_openai import ChatOpenAI
import numpy as np
from transformers import BitsAndBytesConfig
from IPython.display import display, Markdown

cache_dir = "/afs/desy.de/user/t/taheri/scratch/cache"

#Optimize Model Loading and Memory Usage (3 steps)
# 1: Clear GPU memory before loading the model
torch.cuda.empty_cache()

# Step 2: Load the Llama Model
model_name = "openlm-research/open_llama_3b"
#model_name = "openlm-research/open_llama_3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) #, use_fast=False

###device = "cpu" #torch.device("cuda" if torch.cuda.is_available() else "cpu")
###model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)  # Use FP16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

#2: 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16, # Use FP16 for reduced memory usage
    bnb_4bit_use_double_quant=True  # Further reduces memory usage
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #device_map="auto",
    device_map=device,
    quantization_config=bnb_config,
    cache_dir=cache_dir
)

#3 Enable gradient checkpointing
model.gradient_checkpointing_enable()


# Print the cache directory
print(tokenizer.name_or_path)  # Model name


def extract_and_clean_answer(response, split_key="Answer:"):
    """
    Extract and clean the answer from the response.
    Removes duplicate lines, ensures the answer is not empty, and handles extra "Context:" or unwanted information.
    """
    # Split the answer based on the provided split_key (e.g., "Answer:")
    answer = response.split(split_key)[-1].strip()
    
    # Remove any lines that start with "Context:" or similar unwanted information
    cleaned_answer = []
    for line in answer.split("\n"):
        #if not line.lower().startswith("context:"):  # Remove lines starting with "context:"
        if not line.lower().startswith(("context:", "question:")):  # Remove lines starting with "context:" or "question:"
            cleaned_answer.append(line.strip())
        
          
    
    # Remove duplicate lines and reconstruct the answer
    unique_lines = list(dict.fromkeys(cleaned_answer))  # Remove duplicate lines
    return "\n".join(unique_lines).strip()  # Reconstruct answer without repetition



llama_model = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    #max_length=512,  # 128,256,512 for faster/slower inference & to control cost
    max_new_tokens=500  #256 Ensures short answers
    #temperature=0.5,  # [0-2] set randomness- increase for more creative answers like image/poem generator-decrease for more precise answer for fact based models
    #top_p=1.0,  # [0-1] Ensures only high-probability tokens are used- It controls how deterministic the model is. 
    #top_k=10,
    #do_sample=False,
    #num_return_sequences=1 #,
    #stop_sequence="\n"  # Stop generating after a newline
)



#llm = HuggingFacePipeline(pipeline=llama_model)
llm = ChatOpenAI(model="gpt-3.5-turbo")# , temperature=1




prompt_template = """Answer the question based on the context provided below. If the context does not contain the answer, say "I don't know." 
- Make sure to check **all parts of the context** carefully before answering, even if the answer is spread across multiple sections.
- Do **not** repeat words or phrases.
- Provide a **complete and well-structured sentence** as your answer.
- If the question asks for multiple points, provide a **list** or **detailed explanation**.
- If the answer requires interpretation or synthesis of multiple pieces of information, ensure that the answer reflects the entire context accurately.
- Do **not** repeat.

Context: {context}

Question: {question}

Answer:"""


PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])



# Step 3: Build the RAG pipeline
qa_with_data = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type= "stuff", # "stuff",  # Use "map_reduce": Step-by-Step Refinement or "refine"-> Step-by-Step Refinement for more complex tasks
    #retriever=vectorstore.as_retriever(search_kwargs={"k": 10, "score_threshold": 0.25, "search_type": "similarity"}),  # Adjust k based on document length #score_threshold to filter out low-confidence retrievals
    retriever = vectorstore.as_retriever(search_kwargs={"k": 30, "search_type": "mmr"}), #, "score_threshold": 0.3- "mmr" , "similarity"  
    return_source_documents=True , # Optionally return source documents for debugging
    chain_type_kwargs={"prompt": PROMPT}
)



def score_retrieved_documents(retrieved_docs, query_embedding):
    scores = []
    query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalize query embedding
    for doc in retrieved_docs:
        doc_embedding = embeddings.embed_query(doc.page_content)
        doc_embedding = doc_embedding / np.linalg.norm(doc_embedding)  # Normalize document embedding
        similarity = np.dot(query_embedding, doc_embedding)  # Cosine similarity
        scores.append((doc, similarity))
    return sorted(scores, key=lambda x: x[1], reverse=True)



# Define the list of queries
queries = [
    "How many employees does DESY have?",
    "What are the main research fields at DESY?",
    "Who is the Chairman of the DESY Board of Directors?",
    "Who funds DESY, and how is it managed?",
    "What is European XFEL, and how is DESY involved?",
    "How does DESY contribute to particle physics?"
]



# Loop through each query and get the response
for query in queries:
    #print(f"Question: {query}")
    display(Markdown(f"**Question: {query}**"))
    print("===================================")

        # Get the answer using RAG
    result = qa_with_data({"query": query})
       
    answer_with_rag = extract_and_clean_answer(result.get('result', ''))
    
    print(f"Answer with RAG: {answer_with_rag}")

    print("===================================")

    # Generate response without using RAG
    formatted_prompt = PROMPT.format(context="No context available.", question=query)
    response_without_data = llama_model(formatted_prompt, num_return_sequences=1, do_sample=False) #, max_length=200, truncation=True, top_p=0.9

    # Extract only the answer from the response
    answer_without_rag = extract_and_clean_answer(response_without_data[0]["generated_text"])
    
    print(f"Answer without RAG: {answer_without_rag}")

    


Device set to use cuda


openlm-research/open_llama_3b


**Question: How many employees does DESY have?**

Answer with RAG: DESY has approximately 3000 employees.
Answer without RAG: 1000


**Question: What are the main research fields at DESY?**

Answer with RAG: The main research fields at DESY are particle physics, photon science, neutrino physics, and astroparticle physics.
Answer without RAG: - Particle physics
- Astroparticle physics
- Nuclear physics
- Astrophysics
- Computational physics
- Medical physics
- Industrial physics
-


**Question: Who is the Chairman of the DESY Board of Directors?**

Answer with RAG: The Chairman of the DESY Board of Directors is Prof. Dr. Dr. h.c. Helmut Dosch.
Answer without RAG: The DESY Board of Directors is responsible for the overall strategic direction of DESY.


**Question: Who funds DESY, and how is it managed?**

Answer with RAG: DESY is a member of the Helmholtz Association, which is a federation of 19 German research centers. The Helmholtz Association is funded primarily by the federal government of Germany. DESY is managed by a Board of Directors, which is responsible for setting the organization's overall direction, implementing strategies, and ensuring efficient operations. The Chairman of the Board of Directors, Prof. Dr. Dr. h.c. Helmut Dosch, plays a key role in the management of DESY.
Answer without RAG: The DESY laser is a powerful laser that is used to study the properties of matter.


**Question: What is European XFEL, and how is DESY involved?**

Answer with RAG: European XFEL is a X-ray Free-Electron Laser facility, which is Europe's big X-ray laser. DESY is the main shareholder and operates the accelerator of the facility, with the involvement of eleven other countries in the project.
Answer without RAG: DESY is a research institute in Germany that is involved in the project.


**Question: How does DESY contribute to particle physics?**

Answer with RAG: DESY contributes to particle physics by being involved in the Belle II experiment at the Super, developing, operating and utilizing state-of-the-art accelerator facilities, designing and construction of extremely sensitive pixel vertex detector, exploring various options for developing theories, playing a leading role in the Radio Neutrino Observatory in Greenland (RNO-G) and the Ice. Cube collaboration, participating in the ULTRASAT satellite mission, and driving the design, development and upgradeas well as the data analysis of Ice. Cube-Gen2, the neutrino observatory at the South Pole is to be extended with radio antennaslike those tested at RNO-G.
Answer without RAG: DESY is a research center in Germany that is dedicated to particle physics. It is located in the city of Hamburg and is home to the world's largest particle accelerator, the LHC. DESY also houses a number of other facilities that are used to study the fundamental particles and forces that govern the un

## chain_type="map_reduce"

#### map_reduced need to have two prompts- They need to be improved

In [17]:
API_KEY = "glpat-QJocA6joz1gm7XD8rrRo"
API_URL = "https://api.helmholtz-blablador.fz-juelich.de/v1/models"  # Correct endpoint to list models


from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_openai import ChatOpenAI
import numpy as np
from transformers import BitsAndBytesConfig

cache_dir = "/afs/desy.de/user/t/taheri/scratch/cache"

#Optimize Model Loading and Memory Usage (3 steps)
# 1: Clear GPU memory before loading the model
torch.cuda.empty_cache()

# Step 2: Load the Llama Model
model_name = "openlm-research/open_llama_3b"
#model_name = "openlm-research/open_llama_3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) #, use_fast=False

###device = "cpu" #torch.device("cuda" if torch.cuda.is_available() else "cpu")
###model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)  # Use FP16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

#2: 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16, # Use FP16 for reduced memory usage
    bnb_4bit_use_double_quant=True  # Further reduces memory usage
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    quantization_config=bnb_config,
    cache_dir=cache_dir
)

#3 Enable gradient checkpointing
model.gradient_checkpointing_enable()


# Print the cache directory
print(tokenizer.name_or_path)  # Model name


llama_model = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    #max_length=512,  # 128,256,512 for faster/slower inference & to control cost
    max_new_tokens=500  #256 Ensures short answers
)



#llm = HuggingFacePipeline(pipeline=llama_model)
llm = ChatOpenAI(model="gpt-3.5-turbo")# , temperature=1


#========================================================


question_prompt_template = """Answer the question based on the context provided below. If the context does not contain the answer, say "I don't know."
- **If the context contains numerical or factual details, use them exactly.**
- If the question asks for a number, return only the number if possible.
- **Do not ignore any information in the provided context.**
- If multiple pieces of information are available, synthesize them.

Context: {context}

Question: {question}

Answer:"""

question_prompt = PromptTemplate(
    template=question_prompt_template,
    input_variables=["context", "question"]
)


combine_prompt_template = """
Synthesize the answers from the following pieces of context to provide a final answer. If the context does not contain the answer, say "I don't know."
- Make sure to **synthesize** the information from all parts of the context.
- Provide a **detailed answer**, including all numbers, quantities, or lists if mentioned.
- If the question asks for multiple points, make sure to list them all or provide a comprehensive explanation.

Context: {context}
Question: {question}
Answer:
"""


combine_prompt = PromptTemplate(
    template=combine_prompt_template,
    input_variables=["context", "question"]
)



# Step 3: Build the RAG pipeline
qa_with_data = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",  # Use "map_reduce" for more complex tasks
    retriever=vectorstore.as_retriever(
        search_kwargs={
            "k": 30,  # Number of documents to retrieve
            "search_type": "mmr"  # Use Maximal Marginal Relevance
        }
    ),
    return_source_documents=True,  # Optionally return source documents for debugging
    chain_type_kwargs={
    "question_prompt": question_prompt,
    "combine_prompt": combine_prompt,
    "combine_document_variable_name": "context"  # Ensure correct variable name
}

)




# Query the pipeline
query = "How many employees does DESY have?"






print(f"Question: {query}")
print("===================================")

result = qa_with_data({"query": query})
# Post-process the output to extract only the answer
answer = result['result'].split("Answer:")[-1].strip()
if not result.get('result', None):
    print("Sorry, I couldn't find an answer.")
else:
    print(f"Answer with RAG: {answer}")

#print(f"Answer with RAG: {answer}")



Device set to use cuda


openlm-research/open_llama_3b
Question: How many employees does DESY have?
Answer with RAG: Approximately 3000


In [None]:
# Print retrieved documents
retrieved_docs = retriever.get_relevant_documents(query)
for i, doc in enumerate(retrieved_docs):
    print(f"Document {i+1}: {doc.page_content[:200]}...")  # Print the first 200 characters of each document

In [25]:
! git branch
! git branch main  # Create a new branch called 'main'
! git checkout main  # Switch to the 'main' branch

* [32mmain[m
  master[m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


fatal: a branch named 'main' already exists
M	DESY-IT-LLM-2025-02-21.ipynb
A	DESY-IT-LLM-2025-03-07.ipynb
Already on 'main'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
