# 1. Reading Websites, PDFs and Saving Data to .txt Files

In this section, we extract the content from the provided URLs (webpages and PDF documents) and save the text data into `.txt` files. Webpages are scraped and stripped of unnecessary formatting, PDFs are downloaded and their text is extracted page by page, and the clean text is stored for further processing.

**Key steps:**
- Identify the URLs for webpages and PDF documents.
- Scrape the webpage content.
- Clean and process the content.
- Save the processed text into `.txt` files for further use.

In [None]:
import os
import os
import re
import json
import random
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import random
import json
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Ensure dataset directory exists (every file below is written under "dataset/")
os.makedirs("dataset", exist_ok=True)

# Combined dictionary mapping URLs to filenames (both webpages and PDFs).
# - Webpage entries map to the .txt file the scraped text is written to.
# - PDF entries map to the .pdf file the download is saved as; the extracted
#   text goes to a file with the same name but a .txt extension.
# process_text_files() treats an entry as a PDF when its URL ends with ".pdf".
url_filename_mapping = {
    "https://www.larsentoubro.com/corporate/about-lt-group/overview/": "Larsen_Toubro_Overview.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/technology-for-growth/": "Larsen_Toubro_Technology_for_Growth.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/awards-recognition/": "Larsen_Toubro_Awards_Recognition.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/leadership/": "Larsen_Toubro_Leadership.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/facilities/": "Larsen_Toubro_Facilities.txt",
    "https://www.larsentoubro.com/corporate/about-lt-group/experience-centre-mumbai/": "Larsen_Toubro_Experience_Centre_Mumbai.txt",
    "https://www.lntsustainability.com/overview/": "LNT_Sustainability_Overview.txt",
    "https://www.lntsustainability.com/climate-strategy/": "LNT_Climate_Strategy.txt",
    "https://www.lntsustainability.com/environment/": "LNT_Environment.txt",
    "https://www.lntsustainability.com/green-business/": "LNT_Green_Business.txt",
    "https://www.larsentoubro.com/corporate/careers/learning-development/": "LNT_Careers_Learning_Development.txt",
    "https://www.larsentoubro.com/corporate/careers/diversity-equity-inclusion/": "LNT_Careers_Diversity_Equity_Inclusion.txt",
    "https://www.larsentoubro.com/corporate/careers/recruitment-caution/": "LNT_Careers_Recruitment_Caution.txt",
    "https://www.larsentoubro.com/corporate/careers/campus-recruitment/": "LNT_Careers_Campus_Recruitment.txt",
    "https://www.larsentoubro.com/corporate/careers/renew-career-re-entry-for-women/": "LNT_Careers_ReEntry_for_Women.txt",
    
    # PDF URLs (kept in the same dictionary as the webpages)
    "https://annualreview.larsentoubro.com/download/L&T-Annual-Review-2024.pdf": "LT_Annual_Review_2024.pdf",
    "https://annualreview.larsentoubro.com/download/L&T%20Annual%20Review%202023.pdf": "LT_Annual_Review_2023.pdf",
    "https://annualreview.larsentoubro.com/download/Annual%20Review%202022.pdf": "Annual_Review_2022.pdf",
    "https://annualreview.larsentoubro.com/download/Annual_Review_2021.pdf": "Annual_Review_2021.pdf",
    "https://annualreview.larsentoubro.com/download/L&T%20Annual%20Review%202020.pdf": "LT_Annual_Review_2020.pdf"
}

# NOTE(review): repeats the makedirs call at the top of this cell; harmless
# because exist_ok=True, but redundant.
os.makedirs("dataset", exist_ok=True)

def save_webpages_as_text(urls, timeout=30):
    """
    Download each webpage and write its text content to the dataset folder.

    Args:
        urls: Mapping of webpage URL -> output filename. Each file is written
            to "dataset/<filename>" as UTF-8.
        timeout: Seconds to wait on the server before a request is abandoned.

    Returns:
        None. A failure for one URL is printed and does not stop the
        remaining URLs from being processed.
    """
    for url, filename in urls.items():
        try:
            # Without a timeout, requests waits forever on an unresponsive
            # server and the whole notebook cell hangs.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            # Parse the HTML and keep only its text, one node per line.
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator="\n")

            with open(f"dataset/{filename}", "w", encoding="utf-8") as file:
                file.write(text)

            print(f"Saved: {filename}")
        except Exception as e:
            # Best effort per URL: report the problem and move on.
            print(f"Error processing {url}: {e}")

def download_and_parse_pdfs(pdf_urls, timeout=60):
    """
    Download each PDF, extract its text and save both under the dataset folder.

    Args:
        pdf_urls: Mapping of PDF URL -> PDF filename. The PDF is saved as
            "dataset/<filename>" and its text as the same name with ".pdf"
            replaced by ".txt".
        timeout: Seconds to wait on the server before a request is abandoned.

    Returns:
        None. A failure for one URL is printed and does not stop the
        remaining URLs from being processed.
    """
    for url, filename in pdf_urls.items():
        pdf_filepath = f"dataset/{filename}"  # Save inside dataset folder
        txt_filename = filename.replace(".pdf", ".txt")  # Convert to text file name
        txt_filepath = f"dataset/{txt_filename}"

        try:
            # Step 1: Download the PDF.
            # The context manager closes the streamed connection even when the
            # download fails midway; the timeout prevents an indefinite hang.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()

                with open(pdf_filepath, "wb") as file:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        file.write(chunk)

            print(f"Downloaded: {pdf_filepath}")

            # Step 2: Extract text from the PDF.
            # extract_text() may yield nothing for a page, hence the `or ""`.
            with open(pdf_filepath, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                extracted_text = "\n".join(page.extract_text() or "" for page in reader.pages)

            # Step 3: Save extracted text to a .txt file
            with open(txt_filepath, "w", encoding="utf-8") as txt_file:
                txt_file.write(extracted_text)

            print(f"Extracted text saved: {txt_filepath}")

        except Exception as e:
            # Best effort per URL: report the problem and move on.
            print(f"Error processing {url}: {e}")

def process_text_files(urls):
    """
    Route every URL to the matching downloader.

    URLs ending in '.pdf' go to download_and_parse_pdfs; all other URLs go to
    save_webpages_as_text. PDFs are handled first, then webpages, and a
    downloader is only called when it has at least one URL to process.
    """
    pdf_targets = {}
    page_targets = {}

    # Single pass: drop each entry into the bucket that matches its URL.
    for link, target_name in urls.items():
        bucket = pdf_targets if link.endswith('.pdf') else page_targets
        bucket[link] = target_name

    if pdf_targets:
        download_and_parse_pdfs(pdf_targets)

    if page_targets:
        save_webpages_as_text(page_targets)

# Run the function to process all URLs (PDFs and Webpages).
# This performs network downloads and writes the results under "dataset/".
process_text_files(url_filename_mapping)



# 2. Chunk Data and Create JSON Files

Once we have the raw text data from the `.txt` files, the next step is to split it into smaller, manageable chunks. This is important to keep the context intact while making the data easier to handle for later steps. After chunking the data, we store the chunks along with metadata (like the filename and URL) in a JSON file.

**Key steps:**
- Read the text data from `.txt` files.
- Split the data into smaller chunks using a recursive text splitter.
- Save each chunk with metadata in a single JSON file for easy access and processing.

In [None]:
def preprocess_text(text, filename, url, chunk_size=1000, chunk_overlap=100):
    """
    Normalize raw document text and split it into overlapping chunks.

    Processing steps:
    1. Convert the text to lowercase.
    2. Collapse runs of three or more newlines down to two.
    3. Split the result with RecursiveCharacterTextSplitter.
    4. Wrap every chunk in a dict together with its source metadata.

    Args:
        text: Raw text of one document.
        filename: Name of the file the text came from (stored in metadata).
        url: Source URL of the document (stored in metadata).
        chunk_size: Passed to the splitter as `chunk_size`.
        chunk_overlap: Passed to the splitter as `chunk_overlap`; overlapping
            chunks keep some shared context between neighbours.

    Returns:
        A new list with one dict per chunk, shaped as
        {"chunk_data": <chunk text>, "metadata": {"filename": ..., "url": ...}}.
        Every chunk gets its own metadata dict, and nothing outside the
        function is modified.
    """
    text = text.lower()
    text = re.sub(r'\n{3,}', '\n\n', text)  # Limit newlines to max 2 consecutive

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunks_data = [
        {
            "chunk_data": chunk,
            "metadata": {
                "filename": filename,
                "url": url
            }
        }
        for chunk in text_splitter.split_text(text)
    ]

    print(f"Processed and stored chunks for {filename}.")
    return chunks_data

def chunk_data(urls):
    """
    Chunk every downloaded document and save all chunks to a single JSON file.

    Args:
        urls: Mapping of source URL -> filename, as used for the downloads.
            ".txt" entries are read from "dataset/<filename>". ".pdf" entries
            are read from the matching ".txt" file holding the extracted PDF
            text. Entries with any other extension are skipped.

    Returns:
        None. The combined chunk list is written to
        "dataset/all_chunks_data.json". A file that cannot be read or
        processed is reported and skipped.
    """
    # Collects the chunks of every document before they are written out
    all_chunks_data = []

    for url, filename in urls.items():
        if filename.endswith('.txt'):
            text_filename = filename
        elif filename.endswith('.pdf'):
            # PDFs are listed under their .pdf name, but their text lives in
            # the sibling .txt file (same name derivation as the PDF
            # downloader). Without this mapping every PDF would be skipped.
            text_filename = filename.replace(".pdf", ".txt")
        else:
            continue

        try:
            with open(f"dataset/{text_filename}", "r", encoding="utf-8") as file:
                text = file.read()
            all_chunks_data.extend(preprocess_text(text, text_filename, url))
        except Exception as e:
            # Best effort per file: report the problem and move on.
            print(f"Error processing {text_filename}: {e}")

    # Once all files are processed, save all chunks to a single JSON file
    with open("dataset/all_chunks_data.json", "w", encoding="utf-8") as json_file:
        json.dump(all_chunks_data, json_file, ensure_ascii=False, indent=4)

    print("All chunks processed and saved to 'all_chunks_data.json'.")
    
# Run the function to chunk all data.
# Reads the text files under "dataset/" and writes dataset/all_chunks_data.json.
chunk_data(url_filename_mapping)

# 3. Create Embeddings

Embeddings are a way to represent textual data in a numerical format that machine learning models can process. Here, we use a pre-trained model from `sentence-transformers` to generate embeddings for each of the chunks. These embeddings capture the semantic meaning of the text and can be used for similarity comparisons.

**Key steps:**
- Use the `sentence-transformers` library to load a pre-trained model.
- Convert each chunk of text into an embedding.
- Store the embeddings for further processing and querying.

In [None]:
from sentence_transformers import SentenceTransformer
# Two toy sentences, used only to show what the encoder produces.
sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the pre-trained all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode the sentences and print the resulting embeddings.
embeddings = model.encode(sentences)
print(embeddings)


# 4. Take Chunks and Convert Them to Embeddings and Store in Collection

This step involves taking the chunked data, converting each chunk into an embedding using the `sentence-transformers` model, and then storing these embeddings in a collection for efficient similarity search. We use a tool like `chromadb` to store the embeddings in a database-like structure that supports fast similarity lookups.

**Key steps:**
- Convert each chunk of text into embeddings using the pre-trained model.
- Store the embeddings in a collection (e.g., `chromadb`) for efficient retrieval.
- Ensure the collection is indexed and ready for querying.

In [None]:
import json
from sentence_transformers import SentenceTransformer
import chromadb

# Load the SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialize Chroma client and create a collection
client = chromadb.PersistentClient(path="../knowledge_base")

collection = client.get_or_create_collection(name="embeddings_collection")

def create_embeddings_from_json(json_filename, batch_size=500):
    """
    Embed every chunk stored in a JSON file and store it in the Chroma collection.

    Args:
        json_filename: Path to a JSON file holding a list of
            {"chunk_data": ..., "metadata": ...} dicts.
        batch_size: Number of chunks written to the collection per call.

    Returns:
        None. Chunks are stored under the ids "0", "1", ... in file order.
        `upsert` is used so that re-running this against the persistent
        collection overwrites those ids instead of colliding with them.
    """
    # Load the chunk data from the JSON file
    with open(json_filename, 'r', encoding='utf-8') as file:
        chunk_records = json.load(file)

    if not chunk_records:
        # Nothing to embed; skip the write, since Chroma is given no records.
        print("No chunks found; nothing was added to the Chroma collection.")
        return

    # Prepare sentences (chunks) and metadata for embedding
    sentences = [record['chunk_data'] for record in chunk_records]
    metadatas = [record['metadata'] for record in chunk_records]

    # Generate embeddings for all chunks with the SentenceTransformer model
    embeddings = model.encode(sentences)

    # Write in batches: far fewer round trips than one call per chunk, while
    # keeping each request to a bounded size.
    total = len(sentences)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        collection.upsert(
            ids=[str(idx) for idx in range(start, end)],  # Unique ID for each chunk
            embeddings=embeddings[start:end].tolist(),
            metadatas=metadatas[start:end],  # Store metadata (filename, URL)
            documents=sentences[start:end],  # Store the sentence (chunk)
        )

    print(f"Created embeddings for {len(sentences)} chunks and added to Chroma collection.")

# Run the function to create embeddings and add to Chroma collection
create_embeddings_from_json('dataset/all_chunks_data.json')


In [None]:
# count() asks Chroma for the number of stored records directly, instead of
# loading every document into memory just to measure the length of the list.
num_documents = collection.count()

print(f"Number of documents in the collection: {num_documents}")

# 5. Query and See the Top 10 Results

Now that the data is stored as embeddings in a collection, we can query the collection using a query string. The system will return the top 10 most relevant results based on semantic similarity to the query. This is useful for finding the most relevant documents or text chunks based on a user’s input.

**Key steps:**
- Take a user query and convert it into an embedding.
- Search the collection for the most similar embeddings.
- Retrieve and display the top 10 relevant results, showing their scores (the distances returned by the collection, where a smaller distance means a closer match) and metadata.

In [None]:
import numpy as np

def fetch_top_relevant_queries(query, collection, top_k=10):
    """
    Embed `query` and fetch its `top_k` closest chunks from a Chroma collection.

    Returns a list with one dict per query embedding; exactly one query is
    encoded here, so callers read element 0. In that dict, "document",
    "metadata" and "score" are parallel lists (one position per hit), and
    "score" holds the distances reported by the collection.
    """
    # The module-level SentenceTransformer turns the query text into an embedding.
    encoded_query = model.encode([query])

    # Nearest-neighbour lookup in the Chroma collection.
    hits = collection.query(
        query_embeddings=encoded_query,
        n_results=top_k
    )

    # Bundle the per-query result lists that belong together.
    return [
        {
            "document": docs,
            "metadata": metas,
            "score": distances
        }
        for docs, metas, distances in zip(
            hits['documents'], hits['metadatas'], hits['distances']
        )
    ]


query = "Who are the founders"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Only one query was issued, so all of its hits sit in the first entry.
first_query_hits = top_queries[0]

# Walk the parallel document / metadata / score lists hit by hit.
for hit_text, hit_metadata, hit_score in zip(
    first_query_hits['document'], first_query_hits['metadata'], first_query_hits['score']
):
    flattened_text = hit_text.replace("\n", " ")
    print("Document: " + str(flattened_text))
    print("Metadata: " + str(hit_metadata))
    print("Similarity Score: " + str(hit_score))
    print("\n\n")


# 6. How to Use OpenAI with System and User Prompts  

### Description  
In this section, we explore how to interact with OpenAI models using **system and user prompts**. The **system prompt** sets the AI's behavior and context, while the **user prompt** provides the input for generating responses. We demonstrate how to structure these messages effectively for improved outputs.  

In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Credentials, endpoint and model name are all read from the environment;
# each is None when the corresponding variable is not set.
TOKEN = os.environ.get('TOKEN')
HOST = os.environ.get('HOST')
MODEL = os.environ.get('MODEL')

# NOTE(review): the name `client` is also bound to the Chroma PersistentClient
# in the embeddings cell; after this cell it refers to the OpenAI client.
# NOTE(review): the augmented-chunks cell builds its client with
# f"{HOST}/serving-endpoints" while this one passes HOST unchanged; confirm
# which form the endpoint expects.
client = OpenAI(
  api_key = TOKEN,
  base_url = HOST
)

# One system message (sets the assistant's behaviour) followed by one user
# message (the actual request); the reply is capped at 256 tokens.
chat_completion = client.chat.completions.create(
  messages=[
  {
    "role": "system",
    "content": "You are an AI assistant"
  },
  {
    "role": "user",
    "content": "Tell me about Large Language Models"
  }
  ],
  model=MODEL,
  max_tokens=256
)

# Print the text of the first returned choice.
print(chat_completion.choices[0].message.content)

# 7. How to Create Augmented Chunks  

### Description  
Augmented chunks refer to **enhanced text segments** that contain additional context, metadata, or summaries to improve retrieval and response generation. This section covers methods to **generate meaningful chunks** from text, **enrich them with metadata**, and **store them efficiently** for downstream tasks like document retrieval and question-answering.  

In [None]:
import random
import json
from openai import OpenAI
import os
from dotenv import load_dotenv
import random

# Fixed seed so the chunk picked by random.choice below is reproducible.
random.seed(10)

# Load environment variables
load_dotenv()

# Each value is None when the corresponding variable is not set.
TOKEN = os.environ.get('TOKEN')
HOST = os.environ.get('HOST')
MODEL = os.environ.get('MODEL')

# Initialize OpenAI client
# NOTE(review): "/serving-endpoints" is appended to HOST here, unlike the
# client built in the previous section; confirm which base URL is correct.
client = OpenAI(
  api_key=TOKEN,
  base_url=f"{HOST}/serving-endpoints"
)

def get_random_chunk_and_generate_questions(json_file_path):
    """
    Pick a random chunk from a chunks JSON file and ask the LLM for questions about it.

    Args:
        json_file_path: Path to a JSON file holding a list of chunk dicts,
            each with a "chunk_data" entry.

    Returns:
        A 2-tuple (chunk, questions): the selected chunk dict and the raw
        text returned by the model. If anything fails (missing file, empty
        chunk list, API error), the error is printed and (None, None) is
        returned, so tuple-unpacking callers keep working.
    """
    try:
        # Read the JSON file and load the chunks data
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            chunks_data = json.load(json_file)

        # random.choice raises on an empty list; the handler below covers that.
        random_chunk = random.choice(chunks_data)

        random_chunk_data = random_chunk['chunk_data']

        # Prepare the prompt for OpenAI
        prompt = f"Please generate a set of questions that could be answered using this information: \n\n{random_chunk_data}\n\n"

        # Send the prompt to OpenAI
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    # The three literals are concatenated, so each one but the
                    # last needs a trailing space to keep sentences apart.
                    "content": "You are an AI assistant with the ability to generate relevant questions based on provided text. "
                            "Your task is to analyze the text and create insightful questions that can be answered using that text. "
                            "return only the questions in plain text in multiple lines. no headings, no titles, nothing, no bulletpoints"
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model=MODEL,
        )

        # Text of the first returned choice: the generated questions
        questions = chat_completion.choices[0].message.content

        return random_chunk, questions

    except Exception as e:
        print(f"Error: {e}")
        # Keep the return shape stable instead of falling through to a bare None.
        return None, None

# Example usage: `chunk` (the selected chunk dict) and `llm_output` (the
# model's raw reply) are reused by the cells that follow.
chunk, llm_output = get_random_chunk_and_generate_questions('dataset/all_chunks_data.json')


print(chunk)
print("\n\n\n")
print(llm_output)

In [None]:
# Split the LLM reply into one question per line, dropping blank lines so
# every entry in `questions` is an actual question.
questions = [line.strip() for line in llm_output.split("\n") if line.strip()]

qes = questions[0]
# Separate the question from the chunk text with a blank line so the two do
# not run together.
augumented_chunk = f"{qes}\n\n{chunk['chunk_data']}"
print(augumented_chunk)

In [None]:
# Build one augmented chunk per generated question: the question is prepended
# to the ORIGINAL chunk text. The source chunk is never rebound or mutated, so
# questions do not pile up from one iteration to the next and the original
# chunk's metadata stays untouched.
source_chunk = chunk
augumented_chunks = []
for question in questions:
    question = question.strip()
    if not question:
        continue  # blank lines in the LLM output are not questions
    augumented_chunks.append({
        "chunk_data": f"{question}\n\n{source_chunk['chunk_data']}",
        # Fresh metadata dict per augmented chunk, flagged as augmented
        "metadata": {**source_chunk['metadata'], "is_augumented": True},
    })

augumented_chunks

In [None]:
# The bare len() result is not assigned to anything; the print below is what
# reports the count.
len(augumented_chunks)
print("Number of augumented chunks: ", len(augumented_chunks))

In [None]:
import json
from sentence_transformers import SentenceTransformer
import chromadb

# Load the SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def add_augumented_chunks_to_collection(augumented_chunks):
    """
    Embed the augmented chunks and append them to the Chroma collection.

    Args:
        augumented_chunks: List of {"chunk_data": ..., "metadata": ...} dicts.

    Returns:
        None. New records get consecutive ids that continue from the
        collection's current record count. An empty list is a no-op.
    """
    if not augumented_chunks:
        print("No augmented chunks to add.")
        return

    # Prepare sentences (chunks) and metadata for embedding
    sentences = [chunk['chunk_data'] for chunk in augumented_chunks]
    metadatas = [chunk['metadata'] for chunk in augumented_chunks]

    # Generate embeddings using the SentenceTransformer model
    embeddings = model.encode(sentences)

    # count() gives the number of stored records without fetching every
    # document, which is all that is needed to continue the id sequence.
    last_idx = collection.count()

    # Insert all embeddings, metadata and documents in a single call
    collection.add(
        ids=[str(last_idx + offset) for offset in range(len(sentences))],  # Unique ID for each chunk
        embeddings=embeddings.tolist(),
        metadatas=metadatas,  # Store metadata (filename, URL)
        documents=sentences,  # Store the sentence (chunk)
    )

    print(f"Created embeddings for {len(sentences)} chunks and added to Chroma collection.")

# Run the function to create embeddings and add to Chroma collection
add_augumented_chunks_to_collection(augumented_chunks)


# 8. How to Perform Re-Ranking  

### Description  
Re-ranking helps improve **search relevance** by refining initial retrieval results using **semantic similarity scoring** or **cross-encoder models**. This section explains how to use techniques like **CrossEncoders** to **re-score and sort** retrieved documents, ensuring that the most relevant results appear at the top. We explore practical implementations using **sentence transformers** for better ranking and accuracy.

In [14]:
from sentence_transformers import CrossEncoder
# Load the cross-encoder that the re-ranking cells below use to score
# (query, document) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
query = "Who is the chairman of L&T?"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Only one query was issued, so all of its hits sit in the first entry.
first_query_hits = top_queries[0]

# Walk the parallel document / metadata / score lists hit by hit.
for hit_text, hit_metadata, hit_score in zip(
    first_query_hits['document'], first_query_hits['metadata'], first_query_hits['score']
):
    flattened_text = hit_text.replace("\n", " ")
    print("Document: " + str(flattened_text))
    print("Metadata: " + str(hit_metadata))
    print("Similarity Score: " + str(hit_score))
    print("\n\n")

In [None]:
from sentence_transformers import CrossEncoder

# Load the cross-encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query = "Who is the chairman of L&T?"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Only one query was issued, so all of its hits sit in the first entry.
first_query_hits = top_queries[0]

# Score each retrieved document against the query with the cross-encoder and
# print it next to the retrieval score.
for hit_text, hit_score in zip(first_query_hits['document'], first_query_hits['score']):
    document_text = str(hit_text.replace("\n", " "))

    # predict() takes a list of (query, document) pairs; a single pair is sent
    # here, so the result has exactly one score.
    score = cross_encoder.predict([(query, document_text)])

    print("Document: " + document_text)
    print("Similarity Score: " + str(hit_score))
    print("Cross-Encoder Score: " + str(score[0]))
    print("\n\n")


In [None]:
from sentence_transformers import CrossEncoder

# Load the cross-encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query = "Who is the chairman of L&T?"
top_queries = fetch_top_relevant_queries(query, collection, top_k=10)

# Only one query was issued, so all of its hits sit in the first entry.
first_query_hits = top_queries[0]
documents = [text.replace("\n", " ") for text in first_query_hits['document']]

# Score every (query, document) pair in a single batched predict() call
# instead of one model call per document. Skipped when nothing was retrieved.
if documents:
    cross_scores = cross_encoder.predict([(query, document_text) for document_text in documents])
else:
    cross_scores = []

# Keep each document together with its metadata and both scores
scored_queries = [
    {
        "document": document_text,
        "metadata": metadata,
        "similarity_score": similarity_score,
        "cross_encoder_score": cross_score
    }
    for document_text, metadata, similarity_score, cross_score in zip(
        documents, first_query_hits['metadata'], first_query_hits['score'], cross_scores
    )
]

# Sort the queries based on cross-encoder score in descending order
# (highest score first)
scored_queries.sort(key=lambda x: x['cross_encoder_score'], reverse=True)

# Display sorted results
for item in scored_queries:
    print("Document: " + item["document"])
    print("Metadata: " + str(item["metadata"]))
    print("Similarity Score: " + str(item["similarity_score"]))
    print("Cross-Encoder Score: " + str(item["cross_encoder_score"]))
    print("\n\n")
