In [1]:
!pip install bs4 requests sentence-transformers langchain chromadb transformers langchain-community

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain
  Downloading langchain-0.3.10-py3-none-any.whl.metadata (7.1 kB)
Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<0.4.0,>=0.3.22 (from langchain)
  Downloading langchain_core-0.3.22-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.

# Importing Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

#  Initialize Embeddings and ChromaDB # 

In [3]:
# # ---------- STEP 1: Initialize Embeddings and ChromaDB ----------
# Initialize sentence transformer for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Create ChromaDB instance
chroma_db = Chroma(collection_name="qa_collection_1", embedding_function=embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
  chroma_db = Chroma(collection_name="qa_collection_1", embedding_function=embeddings)


# Filtering text

In [4]:
from bs4 import BeautifulSoup
import requests

def extract_relevant_text(soup):
    """
    Extracts relevant information from elements with the class name 'ind'.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        str: Cleaned and relevant text content.
    """
    # Find the element with the class 'ind'
    relevant_section = soup.find(class_="ind")  # Assuming 'ind' is the correct class name

    if not relevant_section:
        return "No content found with the specified class."

    # Initialize a list to store the extracted text
    relevant_text = []

    # Extract text from relevant tags within the section
    for tag in ['p', 'h1', 'h2', 'h3', 'ul', 'ol', 'li']:
        elements = relevant_section.find_all(tag)
        for element in elements:
            text = element.get_text(separator=' ', strip=True)
            if text:  # Skip empty strings
                relevant_text.append(text)

    # Join all the text content into a single string
    cleaned_text = ' '.join(relevant_text)

    return cleaned_text

In [5]:

# ---------- STEP 2: Helper Functions for Scraping ----------
def is_valid_url(url, base_url):
    """Check if a URL is valid and belongs to the same domain as the base URL."""
    parsed_base = urlparse(base_url)
    parsed_url = urlparse(url)
    if parsed_url.scheme not in ['http', 'https']:
        return False
    if parsed_base.netloc != parsed_url.netloc:
        return False
    return True

def chunk_text(text, chunk_size=500):
    """Split text into smaller chunks."""
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

def get_version_from_link(link):
    """Extract Opera Cloud version from the URL."""
    parts = link.split('/')
    for part in parts:
        if part.count('.') == 1 and part[0].isdigit():  # Simplistic version check
            return part
    return "Unknown version"

# ---------- STEP 3: Scraping and Storing in ChromaDB ----------
def scrape_and_store(start_url):
    visited = set()
    stack = [(start_url, None, 0)]  # (url, parent_titles, depth)
    count=0

    while stack:
        url, parent_titles, depth = stack.pop()
        if url in visited or depth > 4:
            continue
        visited.add(url)
        

        try:
            if "prerequisites" in url.lower():
                print(f"Skipping URL (contains 'prerequisites'): {url}")
                continue
            response = requests.get(url)
            if response.status_code != 200:
                print(f"Failed to retrieve data from {url}")
                count = count +1
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('title').text if soup.find('title') else 'No title available'
            # page_text = soup.get_text(separator=' ', strip=True)
            page_text = extract_relevant_text(soup)
            text_chunks = chunk_text(page_text)
            #print(f"Processing page: {title} with {len(text_chunks)} chunks")

            # Extract metadata
            version = get_version_from_link(url)
            current_titles = parent_titles + title if parent_titles else title

            #print('meta_data--> title', title,' current title-->' ,current_titles,'version-->' ,version,'link-->', url)
            # Store chunks in ChromaDB

            for chunk in text_chunks:
                doc = Document(
                    page_content=chunk,
                    metadata={
                        "title": title,
                        "tags": current_titles,
                        "version": version,
                        "link": url
                    }
                )

                chroma_db.add_documents([doc])

            print(f"Stored page: {title} with {len(text_chunks)} chunks")

            # Add links to the stack for further crawling
            if depth < 3:  # Crawl up to a depth of 3
                links = soup.find_all('a', href=True)
                for link in links:
                    full_link = urljoin(url, link['href'])
                    if is_valid_url(full_link, start_url):
                        stack.append((full_link, current_titles, depth + 1))

        except Exception as e:
            print(f"Error processing {url}: {e}")

In [6]:
# ---------- STEP 6: Run the Scraper and RAG System ----------
# Start URL for scraping
start_url = "https://docs.oracle.com/en/industries/hospitality/opera-cloud/24.4/ocsuh/part_opera_cloud_menu.htm"

# Scrape and store data in ChromaDB
scrape_and_store(start_url)

Stored page: OPERA Cloud with 1 chunks
Stored page: About Reports with 1 chunks
Stored page: Configuring Printers with 4 chunks
Stored page: Interfaces Administration with 1 chunks
Stored page: About Reservation Alerts with 1 chunks
Stored page: Managing Scheduled Reports with 20 chunks
Stored page: Generating Reports with 6 chunks
Stored page: Global Alert (Rule) Definitions with 2 chunks
Stored page: Administration with 1 chunks
Stored page: Oracle Hospitality OPERA Cloud Services User Guide, Release 24.4 with 1 chunks
Stored page: Configuring Stationery Custom Message Texts with 3 chunks
Stored page: Configuring a Mobile ID Document Scanner with 4 chunks
Stored page: Accessing Reporting And Analytics with 1 chunks
Stored page: OPERA Reporting and Analytics 24.4 - Get Started with 1 chunks
Stored page: OPERA Cloud Mobile Application with 1 chunks
Stored page: Managing QR Code Definitions with 5 chunks
Stored page: Copying Reports to Multiple Properties with 1 chunks
Stored page: Conf

# Rag with Llama-3.2-1B-Instruct model

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM

# Set your Hugging Face API token
hf_token = " "

# ---------- STEP 4: Hugging Face Model for Generation ----------
# Load a Hugging Face model for text generation (e.g., FLAN-T5)
model_name_1 = "meta-llama/Llama-3.2-1B-Instruct"  # You can also use "facebook/bart-large"
tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1, use_auth_token=hf_token)
model_1 = AutoModelForCausalLM.from_pretrained(model_name_1, use_auth_token=hf_token).to("cuda")



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

# Testing the pipeline

In [10]:
# Create a pipeline for text generation
qa_pipeline_1 = pipeline("text-generation", model=model_1, tokenizer= tokenizer_1,device=0)

In [7]:
# ---------- STEP 5: RAG Logic ----------
def generate_answer_with_context(question, retriever, qa_pipeline):
    # Retrieve relevant documents
    results = retriever.get_relevant_documents(question)

    # Combine retrieved chunks into a single context
    context = "\n".join([doc.page_content for doc in results])

    # Format the prompt for the model
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    #print('heres the prompt-->',prompt)

    # Generate the answer using the Hugging Face model
    response = qa_pipeline(prompt, max_new_tokens=200, truncation=False)
    answer = response[0]['generated_text']

    # Return the answer and source documents
    return {
        "answer": answer,
        "source_documents": results
    }

# setting retriver

In [15]:

# Initialize retriever
retriever = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [16]:
# Ask a question
query = "How to make reservations?"
response = generate_answer_with_context(query, retriever, qa_pipeline_1)

import re
pattern = r"(?<=Answer:).*"
# Searching the text for the pattern
match = re.search(pattern, response['answer'],re.DOTALL)

# Extracting the answer if a match is found
if match:
    answer = match.group(0)
    print("Answer:", answer)
else:
    print("No answer found.")
# Display source documents
print("\nSource Documents:")
for doc in response["source_documents"]:
    print(f"- Title: {doc.metadata['title']}, Link: {doc.metadata['link']}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Answer:  To make reservations, you can use the following methods:

1. Reservation Management System: The Reservation Management System is a web-based application that allows you to manage reservations, including creating, editing, and canceling reservations. You can access the system through a web browser or a mobile app.
2. Reservation Portal: The Reservation Portal is a web-based application that provides a user-friendly interface for making reservations. You can access the portal through a web browser or a mobile app.
3. Reservation Call Center: The Reservation Call Center is a phone-based system that allows you to make reservations over the phone.
4. Reservation Email: You can also make reservations by sending an email to the hotel or resort.

Note: The Reservation Management System, Reservation Portal, and Reservation Call Center are available at the hotel or resort, while the Reservation Email is an alternative method for making reservations.

Source Documents:
- Title: Managing 

# Importing QnA dataset

In [8]:
import json
with open('/kaggle/input/qna-data-2/QnA.json', 'r') as file:
    qa_dataset = json.load(file)

In [9]:
!pip install rouge-score

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=09576a99a5a35332898645533cbd69d46702d85f7e443c8ed9477d7a7cd9d9ef
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [10]:
from rouge_score import rouge_scorer

def calculate_rogue_score(qa_pipeline, retriver):
    # Create a ROUGE scorer instance with stemmer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Initialize dictionaries to accumulate scores
    total_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    count = 0
    
    # Loop over each question-answer pair in the dataset
    for qa in qa_dataset:
        # Generate a response based on the question using some pipeline
        response = generate_answer_with_context(qa['question'], retriver, qa_pipeline)
        
        # Score the generated response against the reference answer
        scores = scorer.score(qa['answer'], response['answer'])
        
        # Accumulate the scores
        for key in scores:
            total_scores[key] += scores[key].fmeasure  # Assuming we're interested in F-measure
        
        # Increase the count
        count += 1
    
    # Calculate the average scores
    if count > 0:
        avg_scores = {key: total_score / count for key, total_score in total_scores.items()}
    else:
        avg_scores = total_scores  # Prevent division by zero
    
    # Print the average scores
    print("Average ROUGE scores:", avg_scores)
    return avg_scores


# Vanilla pipeline

In [21]:

# Initialize retriever
retriever_vanilla = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
# Create a pipeline for text generation
qa_pipeline_vanilla = pipeline("text-generation", model=model_1, tokenizer= tokenizer_1,device=0)
vanilla_rogue_score= calculate_rogue_score(qa_pipeline_vanilla,retriever_vanilla)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id

Average ROUGE scores: {'rouge1': 0.025794500729876838, 'rouge2': 0.011856981809662496, 'rougeL': 0.020456494160930632}


# top-k parameter = 5

In [22]:

# Initialize retriever
retriever_tk_5 = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
# Create a pipeline for text generation
qa_pipeline_tk_5 = pipeline("text-generation", model=model_1, tokenizer= tokenizer_1,device=0, top_k=5 )
tk_5_rogue_score= calculate_rogue_score(qa_pipeline_tk_5,retriever_tk_5)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Average ROUGE scores: {'rouge1': 0.02548906432512992, 'rouge2': 0.011323712588426451, 'rougeL': 0.02017432505993047}


# top-k parameter = 10

In [23]:

# Initialize retriever
retriever_tk_10 = chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
# Create a pipeline for text generation
qa_pipeline_tk_10 = pipeline("text-generation", model=model_1, tokenizer= tokenizer_1,device=0, top_k=10 )
tk_5_rogue_score= calculate_rogue_score(qa_pipeline_tk_10,retriever_tk_10)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Average ROUGE scores: {'rouge1': 0.02556005915139631, 'rouge2': 0.011707615852853084, 'rougeL': 0.020678997199137888}
