### 1- Scrape and Extract Website Data
Use BeautifulSoup or Scrapy to scrape text.

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from collections import OrderedDict
import difflib


def is_similar(a, b, threshold=0.9):
    return difflib.SequenceMatcher(None, a, b).ratio() > threshold
    
# Download NLTK sentence tokenizer
nltk.download('punkt')
#nltk.download('punkt_tab')

# List of URLs to fetch
urls = [
    "https://www.desy.de/index_eng.html",
    "https://www.desy.de/news/index_eng.html",
    "https://www.desy.de/about_desy/desy/index_eng.html",
    "https://www.desy.de/contact/index_eng.html",
    "https://www.desy.de/about_desy/directorate/helmut_dosch/index_eng.html",
    "https://www.desy.de/research/index_eng.html",
    "https://www.desy.de/research/accelerators/index_eng.html",
    "https://www.desy.de/research/photon_science/index_eng.html",
    "https://www.desy.de/research/particle_physics/index_eng.html",
    "https://www.desy.de/research/astroparticle_physics/index_eng.html",
    "https://www.desy.de/research/facilities__projects/index_eng.html",
    "https://www.desy.de/research/cooperations__institutes/index_eng.html",
    "https://www.desy.de/research/facilities__projects/european_xfel/index_eng.html",
    "https://www.desy.de/research/facilities__projects/petra_iv/index_eng.html"
]

all_text_chunks = []
# Fetch the webpage
#url = "https://www.desy.de/index_eng.html"




for url in urls:
    try:
        #Many websites block requests from scripts that don’t look like normal browsers. Try adding a User-Agent to your request:
        headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        #response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        continue
    print(f"Fetching {url} - Status Code: {response.status_code}")
    #response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

 
    #main_content = soup.find('main')
    #main_content = soup.find('div', id='content')  # Example if content is in a div with class="content" "research-field"
    main_content = soup.find('div', id='content') or soup.find('body')
    

    #if main_content:
    #    print(f"Extracted main content from {url}")
    #else:
    #    print(f"No <main> tag found in {url}, using full page")


    
    if main_content:
      #  text = main_content.get_text(strip=True, separator=" ")  # Extract and clean text
        text = main_content.get_text(strip=True, separator="\n")  # Extract and clean text
    else:
        text = soup.get_text(strip=True, separator=" ")  # Fallback to full-page text extraction


    if main_content:
    # Find all relevant text elements, but only if they are marked as English (lang="en")
        tags = main_content.find_all(
            ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li', 'article', 'section', 'td', 'th'],
            lang="en")

    # If no explicit lang="en" is found, fallback to extracting all tags
        if not tags:
            tags = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li', 'article', 'section', 'td', 'th'])

    else:
        tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li'])


    
    # if main_content:
    #     #tags = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li'])
    #     tags = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li', 'article', 'section', 'td', 'th'],lang="en")

    # else:
    #     tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div', 'li'])

    raw_text = " ".join(tag.get_text(strip=True) for tag in tags)
    #print(f"Raw extracted text from {url}:\n{raw_text[:500]}...\n")


# Extract and clean text
# Remove unwanted content (e.g., navigation menu, footer text)
#    excluded_keywords = [ "contact", "privacy", "terms", "login"] #, "menu", "search"
    excluded_keywords = [
    "contact", "privacy", "terms", "login", "menu", "search", 
    "subscribe", "cookie", "policy", "newsletter", "copyright", 
    "footer", "disclaimer", "faq", "sitemap"
    ]

    filtered_tags = [
        tag for tag in tags if not any(kw in tag.get_text().lower() for kw in excluded_keywords)
    ]
    # Clean text    
    text = " ".join(tag.get_text(strip=True) for tag in filtered_tags)
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = re.sub(r'\[.*?\]', '', text)  # Remove content inside square brackets
    text = re.sub(r'\n+', '\n', text)  # Remove excess newlines
    text = re.sub(r'\s([?.!,;])', r'\1', text)  # Fix spacing before punctuation
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)  # Fix missing spaces between sentences
    text = re.sub(r'(?<=[a-zA-Z])\s*\n\s*(?=[a-zA-Z])', '. ', text)  # Convert newlines to periods if missing punctuation

    # Split text into sentences
    sentences = nltk.sent_tokenize(text)
    

    #print("===============")
    #print("Sample tokenized sentences:", sentences[:5])
    #print("===============")
    # Filter out very short sentences
    chunks = [sentence.strip() for sentence in sentences if len(sentence.split()) > 3]  # Adjust the length as needed



    # Normalize sentences (lowercase, strip extra spaces)
    normalized_chunks = list(set(sentence.lower().strip() for sentence in chunks))


    # Remove near-duplicates (keep only one version)
    filtered_chunks = []
    for sentence in normalized_chunks:
        if not any(is_similar(sentence, existing) for existing in filtered_chunks):
            filtered_chunks.append(sentence)

    # # Convert to a set to remove duplicates
    # unique_chunks = list(set(chunks))
    # # Sort by original order (optional, to maintain readability)
    # unique_chunks.sort(key=lambda x: chunks.index(x))
    # # Append extracted unique chunks
    # all_text_chunks.extend(unique_chunks)
    # print(f"Extracted {len(unique_chunks)} unique text chunks")

    # Preserve original order while ensuring uniqueness
    unique_chunks = list(OrderedDict.fromkeys(filtered_chunks))  # Removes duplicates while keeping order

    #Append extracted unique chunks
    all_text_chunks.extend(unique_chunks)
    
    # Append extracted chunks to the main list
    #all_text_chunks.extend(chunks)
    
    print(f"Extracted {len(chunks)} meaningful text chunks")


print(f"Total extracted text chunks: {len(all_text_chunks)}")
for i, chunk in enumerate(all_text_chunks[:5]):  # Print first 5 chunks
    print(f"Chunk {i+1}: {chunk}\n")



# Save chunks to a text file
with open("desy_content.txt", "w", encoding="utf-8") as f:
    for chunk in all_text_chunks:
        f.write(chunk + "\n")

if not all_text_chunks:
    raise ValueError("No text chunks were extracted from the URLs.")





print(f"Total extracted chunks: {len(all_text_chunks)}")

[nltk_data] Downloading package punkt to /home/taheri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching https://www.desy.de/index_eng.html - Status Code: 200
Extracted 65 meaningful text chunks
Fetching https://www.desy.de/news/index_eng.html - Status Code: 200
Extracted 4 meaningful text chunks
Fetching https://www.desy.de/about_desy/desy/index_eng.html - Status Code: 200
Extracted 9 meaningful text chunks
Fetching https://www.desy.de/contact/index_eng.html - Status Code: 200
Extracted 8 meaningful text chunks
Fetching https://www.desy.de/about_desy/directorate/helmut_dosch/index_eng.html - Status Code: 200
Extracted 19 meaningful text chunks
Fetching https://www.desy.de/research/index_eng.html - Status Code: 200
Extracted 13 meaningful text chunks
Fetching https://www.desy.de/research/accelerators/index_eng.html - Status Code: 200
Extracted 12 meaningful text chunks
Fetching https://www.desy.de/research/photon_science/index_eng.html - Status Code: 200
Extracted 20 meaningful text chunks
Fetching https://www.desy.de/research/particle_physics/index_eng.html - Status Code: 200
Ex

### 2-Convert Data into Embeddings (Vector Database)
Once we collect DESY’s text, we store it in a vector database for fast retrieval.
- Use OpenAI’s text-embedding-ada-002 or Hugging Face models (all-MiniLM-L6-v2).
- Store the vectors in FAISS, Pinecone, or ChromaDB.


In [2]:

import os
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


# Set environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear GPU memory
torch.cuda.empty_cache()


# Step 1: Load the FAISS index (if it exists)
# FAISS Index Directory
faiss_index_path = "desy_faiss_index"
os.makedirs(faiss_index_path, exist_ok=True)


embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vectorstore = FAISS.from_texts(all_text_chunks, embeddings)
vectorstore.save_local("desy_faiss_index")





### 3-Build the RAG Pipeline (LLM + Retrieval)
When a user asks a question, we:
- **Retrieve relevant DESY documents** from the vector database.
- **Feed them into an LLM** (LLaMA 2, GPT, or Mistral) to generate responses.


Example: Retrieval + LLM Response



In [3]:

from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Step 2: Load the Llama Model
model_name = "openlm-research/open_llama_3b"
#model_name = "openlm-research/open_llama_3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  # Use fast tokenizer if available
device = "cpu" #torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
#model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)  # Use FP16


# Enable gradient checkpointing
model.gradient_checkpointing_enable()



llama_model = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    #max_length=512,  # 128,256,512 for faster/slower inference & to control cost
    #max_new_tokens=500,  # Ensures short answers
    temperature=1.0,  # [0-2] set randomness- increase for more creative answers like image/poem generator-decrease for more precise answer for fact based models
    top_p=1.0,  # [0-1] Ensures only high-probability tokens are used- It controls how deterministic the model is. 
    #top_k=10,
    #do_sample=False,
    #num_return_sequences=1 #,
    #stop_sequence="\n"  # Stop generating after a newline
)



llm = HuggingFacePipeline(pipeline=llama_model)

prompt_template = """Answer the question based on the context provided below. If the context does not contain the answer, say "I don't know." 
- Make sure to check **all parts of the context** carefully before answering, even if the answer is spread across multiple sections.
- Do **not** repeat words or phrases.
- Provide a **complete and well-structured sentence** as your answer.
- If the question asks for multiple points, provide a **list** or **detailed explanation**.
- If the answer requires interpretation or synthesis of multiple pieces of information, ensure that the answer reflects the entire context accurately.

Context: {context}

Question: {question}

Answer:"""





PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])


#print("Final Context to LLM:")
#for doc in retrieved_docs:
#    print(doc.page_content)


# Step 3: Build the RAG pipeline
qa_with_data = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Use "map_reduce" or "refine" for more complex tasks
    #retriever=vectorstore.as_retriever(search_kwargs={"k": 10, "score_threshold": 0.25, "search_type": "similarity"}),  # Adjust k based on document length #score_threshold to filter out low-confidence retrievals
    retriever = vectorstore.as_retriever(search_kwargs={"k": 30, "search_type": "mmr"}), #, "score_threshold": 0.3   
    return_source_documents=True , # Optionally return source documents for debugging
    chain_type_kwargs={"prompt": PROMPT}
)


# Query the pipeline
query = "How many employees does DESY have?"
#query = "What are the main research fields at DESY?"
#query = "Who is the Chairman of the DESY Board of Directors?"
#query = "Who funds DESY, and how is it managed?"
#quary = "What is European XFEL, and how is DESY involved?"
#query="How does DESY contribute to particle physics?"


print(f"Question: {query}")
print("===================================")

result = qa_with_data({"query": query})
# Post-process the output to extract only the answer
answer = result['result'].split("Answer:")[-1].strip()
print(f"Answer with RAG: {answer}")

print("===================================")

formatted_prompt = PROMPT.format(context="No context available.", question=query)
response_without_data = llama_model(formatted_prompt, num_return_sequences=1, do_sample=False)


#response_without_data = llama_model(query, num_return_sequences=1, do_sample=False) #max_length=256, truncation=True
print("Answer without RAG:", response_without_data[0]["generated_text"].split("Answer:")[-1].strip())

#print("===================================")
#print(f"Source Documents: {result['source_documents']}")

#scoring function? 


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

Question: How many employees does DESY have?
Answer with RAG: DESY has approximately 3000 employees.

Question: What is DESY
Answer without RAG: 1000


In [14]:
!git --version
!git config --global user.email "taheri@mail.desy.de"
!git config --global user.name "taheri"


git version 2.43.5


In [None]:
!ssh-keygen -t rsa -b 4096 -C "taheri@mail.desy.de"


In [None]:
!cat ~/.ssh/id_rsa.pub


In [11]:

!git init
!git add DESY-IT-LLM-2025-02-21.ipynb
!git commit -m "Initial commit"
!git remote add origin https://gitlab.desy.de/taheri/repo.git
!git push -u origin main

Reinitialized existing Git repository in /home/taheri/.git/
[master (root-commit) 10bf5a6] Initial commit
 1 file changed, 582 insertions(+)
 create mode 100644 DESY-IT-LLM-2025-02-21.ipynb
error: remote origin already exists.
error: src refspec main does not match any
[31merror: failed to push some refs to 'https://github.com/username/repo.git'
[m