### Import libraries


In [65]:
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore import InMemoryDocstore
import faiss
import os

### Split the document into chunks & store them in the vector store

In [47]:
def ingest(pdf_path="Dancing-Write-up-challenges.pdf", index_path="./faiss_index"):
    """Load a PDF, split it into chunks, embed them, and save a FAISS index.

    Args:
        pdf_path: Path of the PDF to ingest. Defaults to the write-up used
            throughout this notebook.
        index_path: Directory the FAISS index is saved to with ``save_local``.

    Raises:
        ValueError: If the PDF produced no chunks, so there is nothing to index.

    Progress messages and every chunk are printed; nothing is returned.
    """
    # Load PDF and split pages
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")

    if not chunks:
        # Fail early with a readable message instead of an opaque error from
        # inside the vector store when there is nothing to embed.
        raise ValueError(f"No text chunks were extracted from {pdf_path!r}")

    # Create embeddings; the probe query is only used to report the dimension.
    embedding = FastEmbedEmbeddings()
    embedding_dimension = len(embedding.embed_query("test"))
    print(f"Embedding Dimension: {embedding_dimension}")

    # Store documents in FAISS. from_documents builds the index, docstore and
    # id mapping itself, so no manually constructed FAISS store is needed.
    faiss_index = FAISS.from_documents(documents=chunks, embedding=embedding)
    faiss_index.save_local(index_path)
    print("✅ FAISS index created and saved successfully!")
    for chunk in chunks:
        print(chunk)


In [48]:
# Only run this once to generate the vector store; ingest() saves it to disk
# so later cells can load it instead of re-embedding the PDF.
ingest()

Split 11 documents into 13 chunks.
Embedding Dimension: 384
✅ FAISS index created and saved successfully!
page_content='Dancing Write-up  
Prepared by: 0ne-nine9
Introduction  
There are multiple ways to transfer a file between two hosts (computers) on the same network. One of 
these protocols is studied in this example, and that is SMB (Server Message Block). This communication 
protocol provides shared access to files, printers, and serial ports between endpoints on a network. We 
mostly see SMB services running on Windows machines.
During scanning, we will typically see port 445 TCP open on the target, reserved for the SMB protocol. 
Usually, SMB runs at the Application or Presentation layers of the OSI model, pictured below. Due to this, it 
relies on lower-level protocols for transport. The Transport layer protocol that Microsoft SMB Protocol is 
most often used with is NetBIOS over TCP/IP (NBT). This is why, during scans, we will most likely see both 
protocols with open ports ru

In [26]:
from huggingface_hub import login
# Placeholder Hugging Face tokens: replace with real values before running.
# NOTE(review): avoid committing real tokens in a shared notebook.
access_token_read = "your_token_from_huggingface"
# NOTE(review): not referenced anywhere else in the visible code.
access_token_write = "your_token_from_huggingface"
# Authenticate with the read token.
login(token = access_token_read)

### Create a RAG chain that retrieves relevant chunks and prepares a response

In [61]:
def rag_chain():
    """Build a RetrievalQA chain over the saved FAISS index.

    The chain uses a ``ChatOllama`` model (``llama3.2:3b``), the custom prompt
    defined below, and a similarity retriever returning the top 3 chunks.
    The index is loaded from ``./faiss_index`` when that path exists;
    otherwise an empty in-memory FAISS store is created.

    Returns:
        The configured ``RetrievalQA`` chain, set to return source documents.
    """
    # Initialize the model
    model = ChatOllama(model="llama3.2:3b")

    # Define the prompt template. The "stuff" QA chain fills the variables
    # `context` (retrieved chunks) and `question` (the user query), so the
    # template must use exactly those names.
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are "DarkCircuit Model," a highly skilled black-hat hacker specializing in exploitation and penetration testing.
        Your objective is to identify and exploit vulnerabilities efficiently, maintain stealth, and escalate privileges when needed.
        Answer the question based only on the following context.
        If you don't know the answer, then reply, "No Context available for this question." {question} [/Instructions] </s>

        [Instructions] Question: {question}
        Context: {context}
        Answer: [/Instructions]
        """
    )

    # Create the embedding model
    embedding = FastEmbedEmbeddings()
    sample_vector = embedding.embed_query("test")
    print(f"DEBUG: Embedding dimension during chain creation: {len(sample_vector)}")  # Print dimension
    embedding_dimension = len(sample_vector)

    # Load existing FAISS index or create a new one if not found
    index_path = "./faiss_index"
    if os.path.exists(index_path):
        print(f"DEBUG: Loading existing FAISS index from {index_path}")
        # SECURITY: load_local unpickles the stored docstore. Only enable
        # allow_dangerous_deserialization for index files you created yourself.
        vector_store = FAISS.load_local(
            index_path,
            embeddings=embedding,
            allow_dangerous_deserialization=True,
        )
    else:
        # Best-effort fallback: an empty store means retrieval returns no
        # documents until something is added to it.
        print("DEBUG: FAISS index not found, creating a new one")
        vector_store = FAISS(
            index=faiss.IndexFlatL2(embedding_dimension),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
            embedding_function=embedding,
        )

    # Debug: Check the number of documents in the vector store
    print(f"DEBUG: Number of documents in vector store: {len(vector_store.index_to_docstore_id)}")

    # Configure the retriever with similarity search
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 3  # Retrieve top 3 results
        },
    )

    # Create the RAG chain using RetrievalQA. The custom prompt has to be
    # passed through chain_type_kwargs; without it the chain silently falls
    # back to LangChain's default QA prompt.
    chain = RetrievalQA.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
        verbose=True,
    )

    return chain


In [62]:
def ask(query: str) -> None:
    """Run ``query`` through a freshly built RAG chain and print the outcome.

    Prints the query, the model's answer, and for each returned source
    document its ``source`` metadata plus the first 500 characters of content.

    Args:
        query: The question to ask.
    """
    # Create the chain
    chain = rag_chain()
    assert chain is not None, "The chain object is None"

    # Print the query to ensure it's correct
    print(f"Query: {query}")

    # Invoke the chain. `invoke` replaces calling the chain object directly,
    # which LangChain has deprecated.
    result = chain.invoke({"query": query})
    assert result is not None, "The result is None"

    # Print results
    print(f"Answer: {result['result']}")
    print("\nSource Documents:")
    for doc in result.get("source_documents", []):
        source = doc.metadata.get("source", "No source metadata")
        content = doc.page_content[:500]  # Show more content for debugging
        print(f"Source: {source} - Content: {content}")


### Ask a question to get relevant information from the document

In [64]:
# Example question; ask() prints the answer and the retrieved source documents.
query = "What is the objective of the Dancing Write-up Challenge?"
ask(query)

DEBUG: Embedding dimension during chain creation: 384
DEBUG: Loading existing FAISS index from ./faiss_index
DEBUG: Number of documents in vector store: 13
Query: What is the objective of the Dancing Write-up Challenge?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Answer: I don't know, but based on the context provided, it appears that the goal of the Dancing Write-up Challenge is to obtain a flag from the "flag.txt" file by exploiting an SMB (Server Message Block) vulnerability and then successfully transferring files between hosts on the same network.

Source Documents:
Source: Dancing-Write-up-challenges.pdf - Content: The flag.txt file, however, is what we are after. We read it and input the flag into the platform, owning 
the Dancing machine. 
Congratulations!
Source: Dancing-Write-up-challenges.pdf - Content: Dancing Write-up  
Prepared by: 0ne-nine9
Introduction  
There are multiple ways to transfer a file between two hosts (computers) on the same net