In [1]:
!pip list

Package                            Version
---------------------------------- -----------
alembic                            1.14.0
annotated-types                    0.7.0
anyio                              4.7.0
assemblyai                         0.36.0
asttokens                          3.0.0
blinker                            1.9.0
cachetools                         5.5.0
certifi                            2024.12.14
charset-normalizer                 3.4.0
click                              8.1.8
cloudpickle                        3.1.0
colorama                           0.4.6
comm                               0.2.2
contourpy                          1.3.1
cycler                             0.12.1
databricks-sdk                     0.40.0
debugpy                            1.8.11
decorator                          5.1.1
Deprecated                         1.2.15
distlib                            0.3.9
distro                             1.9.0
docker                             7.1



In [1]:
!pip list

Package                            Version
---------------------------------- -----------
alembic                            1.14.0
annotated-types                    0.7.0
anyio                              4.7.0
assemblyai                         0.36.0
asttokens                          3.0.0
blinker                            1.9.0
cachetools                         5.5.0
certifi                            2024.12.14
charset-normalizer                 3.4.0
click                              8.1.8
cloudpickle                        3.1.0
colorama                           0.4.6
comm                               0.2.2
contourpy                          1.3.1
cycler                             0.12.1
databricks-sdk                     0.40.0
debugpy                            1.8.11
decorator                          5.1.1
Deprecated                         1.2.15
distlib                            0.3.9
distro                             1.9.0
docker                             7.1



In [None]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer (e.g., BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def semantic_chunk(text, max_length=512):
    """
    Splits text into chunks based on semantic boundaries using a language model.
    """
    chunks = []
    current_chunk = ""
    current_length = 0

    for sentence in text.split("."):
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length < max_length:
            current_chunk += sentence + "."
            current_length += sentence_length
        else:
            chunks.append(current_chunk)
            current_chunk = sentence + "."
            current_length = sentence_length

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [None]:
from nltk.corpus import wordnet

def expand_query(query):
    """
    Expands the query with synonyms using WordNet.
    """
    expanded_query = []
    for word in query.split():
        synonyms = [syn.name() for syn in wordnet.synsets(word)]
        expanded_query.extend(synonyms)
    expanded_query.append(query)  # Include the original query
    return " ".join(expanded_query)

In [13]:
import os
import faiss
import numpy as np
import requests
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Configure paths
DATA_DIR = "./data"  # Directory with PDF files
LLAMA_API_URL  = "http://localhost:11434/api/v1/completion"  # Llama API endpoint
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Load SentenceTransformer for embeddings
print("Loading SentenceTransformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS Index
dimension = 384  # Embedding size for all-MiniLM-L6-v2
faiss_index = faiss.IndexFlatL2(dimension)

# Metadata store
metadata_store = []

### Step 1: Pre-Retrieval Optimization - Chunking ###
def load_and_chunk_pdfs(data_dir, chunk_size, chunk_overlap):
    """
    Load PDF files, extract text, and chunk them using a text splitter.
    """
    print("Loading and chunking PDFs...")
    chunks = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in os.listdir(data_dir):
        if filename.endswith(".pdf"):
            filepath = os.path.join(data_dir, filename)
            print(f"Processing file: {filename}")
            try:
                reader = PdfReader(filepath)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
                doc_chunks = splitter.split_text(text)
                chunks.extend([(chunk, {"source": filename}) for chunk in doc_chunks])
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
    
    print(f"Total chunks created: {len(chunks)}")
    return chunks

# Load and chunk PDFs
#chunked_data = load_and_chunk_pdfs(DATA_DIR, CHUNK_SIZE, CHUNK_OVERLAP)  ############

### Step 2: Pre-Retrieval Optimization - Embedding and Indexing ###
def index_chunks(chunks, model, index, metadata_store):
    """
    Generate embeddings for chunks and add them to the FAISS index along with metadata.
    """
    print("Generating embeddings and indexing chunks...")
    embeddings = []
    for idx, (chunk, metadata) in enumerate(chunks):
        try:
            embedding = model.encode(chunk)
            embeddings.append(embedding)
            metadata_store.append(metadata)
            if idx % 10 == 0:
                print(f"Processed {idx} chunks...")
        except Exception as e:
            print(f"Error generating embedding for chunk {idx}: {e}")
    index.add(np.array(embeddings).astype('float32'))
    print("Indexing complete.")

# Index the chunks
#index_chunks(chunked_data, embedding_model, faiss_index, metadata_store)   ####

### Step 3: Retrieval Optimization ###
def retrieve_relevant_chunks(query, model, index, metadata_store, top_k=5):
    """
    Retrieve top-k relevant chunks for a given query using FAISS index.
    """
    print("Retrieving relevant chunks...")
    try:
        query_embedding = model.encode(query)
        distances, indices = index.search(np.array([query_embedding]).astype('float32'), top_k)
        retrieved_chunks = [(chunked_data[i][0], metadata_store[i]) for i in indices[0]]
        print(f"Retrieved {len(retrieved_chunks)} chunks.")
        return retrieved_chunks
    except Exception as e:
        print(f"Error during retrieval: {e}")
        return []

### Step 4: Post-Retrieval Optimization - Context and Prompt Engineering ###
def generate_context(retrieved_chunks):
    """
    Generate a context string by concatenating retrieved chunks.
    """
    print("Generating context from retrieved chunks...")
    return " ".join([chunk for chunk, _ in retrieved_chunks])

def format_prompt(context, query):
    """
    Format the prompt with the retrieved context and user query.
    """
    print("Formatting the prompt...")
    return f"""
    Context:
    {context}

    Query:
    {query}

    Answer:
    """

### Step 5: Generation with Llama ###
def call_llama_api(prompt):
    """
    Call the Llama model's API to generate a response based on the prompt.
    """
    print("Calling Llama API...")
    headers = {"Content-Type": "application/json"}
    payload = {
        "prompt": prompt,
        "max_tokens": 256,
        "temperature": 0.7,
    }
    try:
        response = requests.post(LLAMA_API_URL, json=payload, headers=headers)
        response.raise_for_status()
        print("Llama API call successful.")
        return response.json()["choices"][0]["text"]
    except Exception as e:
        print(f"Error calling Llama API: {e}")
        return "Error generating response."

### Step 6: RAG Pipeline ###
def rag_pipeline(query, top_k=5):
    """
    Complete RAG pipeline:
    1. Retrieve relevant chunks.
    2. Generate context.
    3. Format prompt.
    4. Generate response using Llama.
    """
    print("Executing RAG pipeline...")
    # Retrieve relevant chunks
    retrieved_chunks = retrieve_relevant_chunks(query, embedding_model, faiss_index, metadata_store, top_k)

    if not retrieved_chunks:
        return "No relevant chunks retrieved."

    # Generate context
    context = generate_context(retrieved_chunks)

    # Format prompt
    prompt = format_prompt(context, query)

    # Generate response using Llama
    response = call_llama_api(prompt)

    return response

# Example Query
if __name__ == "__main__":
    user_query = "Explain the concept of Retrieval-Augmented Generation."
    print(f"User Query: {user_query}")
    answer = rag_pipeline(user_query)
    print(f"Q: {user_query}\nA: {answer}")


Loading SentenceTransformer model...
User Query: Explain the concept of Retrieval-Augmented Generation.
Executing RAG pipeline...
Retrieving relevant chunks...
Error during retrieval: list index out of range
Q: Explain the concept of Retrieval-Augmented Generation.
A: No relevant chunks retrieved.


Loading SentenceTransformer model...
Loading and chunking PDFs...
Processing file: 3.1 complet_ctst_gpt.pdf
Processing file: Assignment 01.pdf
Processing file: BSDSf21M007_Assignment_1.pdf
Processing file: Handout 1.1 (Lab Environment Setup).pdf
Processing file: Handout 1.2 (Recap of OS with Linux).pdf
Processing file: Handout 1.3 (Recap of InterNetworking Concepts with Linux).pdf
Processing file: Handout 2.1 (Ethical Hacking Pentesting and Anonymity).pdf
Processing file: Handout 2.10 (Web App Pen Testing - II).pdf
Processing file: Handout 2.2 (Reconnaissance Info Gathering and OSINT) (1).pdf
Processing file: Handout 2.3 (Scanning and Vulnerability Analysis - I).pdf
Processing file: Handout 2.4 (Scanning and Vulnerability Analysis - II).pdf
Processing file: Handout 2.5 (Exploitation and Gaining Access).pdf
Processing file: Handout 2.6 (Generating your own Payloads).pdf
Processing file: Handout 2.7 (Privilege Escalation).pdf
Processing file: Handout 2.8 (Persistence and Removing Tracks)

In [16]:
import os
import faiss
import numpy as np
import requests
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Configure paths
DATA_DIR = "./data"  # Directory with PDF files
LLAMA_API_URL = "http://localhost:11434/api/generate"  # Updated Llama API endpoint with correct port
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Load SentenceTransformer for embeddings
print("Loading SentenceTransformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS Index
dimension = 384  # Embedding size for all-MiniLM-L6-v2
faiss_index = faiss.IndexFlatL2(dimension)

# Metadata store
metadata_store = []

### Step 1: Pre-Retrieval Optimization - Chunking ###
def load_and_chunk_pdfs(data_dir, chunk_size, chunk_overlap):
    """
    Load PDF files, extract text, and chunk them using a text splitter.
    """
    print("Loading and chunking PDFs...")
    chunks = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in os.listdir(data_dir):
        if filename.endswith(".pdf"):
            filepath = os.path.join(data_dir, filename)
            print(f"Processing file: {filename}")
            try:
                reader = PdfReader(filepath)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
                doc_chunks = splitter.split_text(text)
                chunks.extend([(chunk, {"source": filename}) for chunk in doc_chunks])
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
    
    print(f"Total chunks created: {len(chunks)}")
    return chunks

# Load and chunk PDFs
chunked_data = load_and_chunk_pdfs(DATA_DIR, CHUNK_SIZE, CHUNK_OVERLAP)

### Step 2: Pre-Retrieval Optimization - Embedding and Indexing ###
def index_chunks(chunks, model, index, metadata_store):
    """
    Generate embeddings for chunks and add them to the FAISS index along with metadata.
    """
    print("Generating embeddings and indexing chunks...")
    embeddings = []
    for idx, (chunk, metadata) in enumerate(chunks):
        try:
            embedding = model.encode(chunk)
            embeddings.append(embedding)
            metadata_store.append(metadata)
            if idx % 10 == 0:
                print(f"Processed {idx} chunks...")
        except Exception as e:
            print(f"Error generating embedding for chunk {idx}: {e}")
    index.add(np.array(embeddings).astype('float32'))
    print("Indexing complete.")

# Index the chunks
index_chunks(chunked_data, embedding_model, faiss_index, metadata_store)

### Step 3: Retrieval Optimization ###
def retrieve_relevant_chunks(query, model, index, metadata_store, top_k=5):
    """
    Retrieve top-k relevant chunks for a given query using FAISS index.
    """
    print("Retrieving relevant chunks...")
    try:
        query_embedding = model.encode(query)
        distances, indices = index.search(np.array([query_embedding]).astype('float32'), top_k)
        retrieved_chunks = [(chunked_data[i][0], metadata_store[i]) for i in indices[0]]
        print(f"Retrieved {len(retrieved_chunks)} chunks.")
        return retrieved_chunks
    except Exception as e:
        print(f"Error during retrieval: {e}")
        return []

### Step 4: Post-Retrieval Optimization - Context and Prompt Engineering ###
def generate_context(retrieved_chunks):
    """
    Generate a context string by concatenating retrieved chunks.
    """
    print("Generating context from retrieved chunks...")
    return " ".join([chunk for chunk, _ in retrieved_chunks])

def format_prompt(context, query):
    """
    Format the prompt with the retrieved context and user query.
    """
    print("Formatting the prompt...")
    return f"""
    Context:
    {context}

    Query:
    {query}

    Answer:
    """

### Step 5: Generation with Llama ###
def call_llama_api(prompt):
    """
    Call the Llama model's API to generate a response based on the prompt.
    """
    print("Calling Llama API...")
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": "llama2:7b",  # Model name
        "prompt": prompt,
        "max_tokens": 256,
        "temperature": 0.7,
    }
    try:
        response = requests.post(LLAMA_API_URL, json=payload, headers=headers)
        response.raise_for_status()
        print("Llama API call successful.")
        return response.json()["choices"][0]["text"]
    except Exception as e:
        print(f"Error calling Llama API: {e}")
        return "Error generating response."

### Step 6: RAG Pipeline ###
def rag_pipeline(query, top_k=5):
    """
    Complete RAG pipeline:
    1. Retrieve relevant chunks.
    2. Generate context.
    3. Format prompt.
    4. Generate response using Llama.
    """
    print("Executing RAG pipeline...")
    # Retrieve relevant chunks
    retrieved_chunks = retrieve_relevant_chunks(query, embedding_model, faiss_index, metadata_store, top_k)

    if not retrieved_chunks:
        return "No relevant chunks retrieved."

    # Generate context
    context = generate_context(retrieved_chunks)

    # Format prompt
    prompt = format_prompt(context, query)

    # Generate response using Llama
    response = call_llama_api(prompt)

    return response

# Example Query
if __name__ == "__main__":
    user_query = "Explain the concept of Retrieval-Augmented Generation."
    print(f"User Query: {user_query}")
    answer = rag_pipeline(user_query)
    print(f"Q: {user_query}\nA: {answer}")


Loading SentenceTransformer model...
Loading and chunking PDFs...
Processing file: 3.1 complet_ctst_gpt.pdf
Processing file: Assignment 01.pdf
Processing file: BSDSf21M007_Assignment_1.pdf
Processing file: Handout 1.1 (Lab Environment Setup).pdf
Processing file: Handout 1.2 (Recap of OS with Linux).pdf
Processing file: Handout 1.3 (Recap of InterNetworking Concepts with Linux).pdf
Processing file: Handout 2.1 (Ethical Hacking Pentesting and Anonymity).pdf
Processing file: Handout 2.10 (Web App Pen Testing - II).pdf
Processing file: Handout 2.2 (Reconnaissance Info Gathering and OSINT) (1).pdf
Processing file: Handout 2.3 (Scanning and Vulnerability Analysis - I).pdf
Processing file: Handout 2.4 (Scanning and Vulnerability Analysis - II).pdf
Processing file: Handout 2.5 (Exploitation and Gaining Access).pdf
Processing file: Handout 2.6 (Generating your own Payloads).pdf
Processing file: Handout 2.7 (Privilege Escalation).pdf
Processing file: Handout 2.8 (Persistence and Removing Tracks)