In [9]:
import os
import faiss
import pickle
import requests
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
def extract_and_split_text_from_pdf(pdf_path, chunk_size=500, chunk_overlap=50):
    """
    Extract the text of a PDF and split it into smaller chunks.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.

    Returns:
        list[str]: The chunks produced by the splitter, or an empty list if
        reading or splitting fails (the error is printed, not raised).
    """
    try:
        reader = PdfReader(pdf_path)
        # Extract every page exactly once: extraction is the expensive step,
        # so do not call it a second time just to filter out empty pages.
        extracted_pages = (page.extract_text() for page in reader.pages)
        text = " ".join(page_text for page_text in extracted_pages if page_text)

        # Use LangChain's RecursiveCharacterTextSplitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ".", "!", "?"]
        )
        return text_splitter.split_text(text)
    except Exception as e:
        # Best effort by design: a single unreadable PDF must not abort the
        # whole batch, so report it and return no chunks.
        print(f"Error extracting or splitting text from {pdf_path}: {e}")
        return []

def load_and_split_texts_from_pdfs(pdf_folder, chunk_size=500, chunk_overlap=50):
    """
    Load, extract, and split texts from all PDFs in a folder.

    Args:
        pdf_folder: Directory whose PDF files are processed (not recursive).
        chunk_size: Forwarded to extract_and_split_text_from_pdf.
        chunk_overlap: Forwarded to extract_and_split_text_from_pdf.

    Returns:
        tuple[list[str], list[dict]]: ``texts`` holds every chunk; ``metadata``
        holds one ``{"filename": ..., "text": ...}`` dict per chunk, in the
        same order, so position ``i`` in both lists describes the same chunk.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order stable between runs. The extension check is case-insensitive so
    # files such as "paper.PDF" are not silently skipped.
    pdf_files = [
        os.path.join(pdf_folder, name)
        for name in sorted(os.listdir(pdf_folder))
        if name.lower().endswith(".pdf")
    ]
    texts, metadata = [], []

    for pdf_file in pdf_files:
        chunks = extract_and_split_text_from_pdf(pdf_file, chunk_size, chunk_overlap)
        if not chunks:
            print(f"Warning: No text extracted from {pdf_file}")
            continue
        filename = os.path.basename(pdf_file)
        texts.extend(chunks)
        metadata.extend({"filename": filename, "text": chunk} for chunk in chunks)

    return texts, metadata

def create_faiss_index(embeddings):
    """
    Build a flat L2 FAISS index holding the given embeddings.

    The vector dimension is read from ``embeddings.shape[1]``, so the input
    is expected to expose a 2-D ``shape``. Any failure is printed and then
    re-raised to the caller.
    """
    try:
        vector_dim = embeddings.shape[1]
        flat_index = faiss.IndexFlatL2(vector_dim)
        flat_index.add(embeddings)
    except Exception as err:
        print(f"Error creating FAISS index: {err}")
        raise
    else:
        return flat_index

def save_faiss_index(index, metadata, index_path, metadata_path):
    """
    Save FAISS index and metadata to disk.

    Args:
        index: FAISS index, written with ``faiss.write_index``.
        metadata: Picklable object stored alongside the index.
        index_path: Destination file for the index.
        metadata_path: Destination file for the pickled metadata.

    Raises:
        Exception: Any error while creating directories or writing either
            file is printed and then re-raised.
    """
    try:
        # Make sure the parent directory of BOTH output files exists. A bare
        # filename has an empty dirname, which os.makedirs rejects, so only
        # create a directory when there is one to create.
        for target_path in (index_path, metadata_path):
            parent_dir = os.path.dirname(target_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        faiss.write_index(index, index_path)
        with open(metadata_path, "wb") as f:
            pickle.dump(metadata, f)
        print(f"FAISS index and metadata saved to {index_path} and {metadata_path}")
    except Exception as e:
        print(f"Error saving FAISS index or metadata: {e}")
        raise

def load_faiss_index(index_path, metadata_path):
    """
    Read a FAISS index and its pickled metadata back from disk.

    Returns the pair ``(index, metadata)``. Any failure while reading either
    file is printed and then re-raised.
    """
    try:
        faiss_index = faiss.read_index(index_path)
        # NOTE(review): unpickling can execute arbitrary code; only point this
        # at metadata files that this pipeline wrote itself.
        with open(metadata_path, "rb") as handle:
            stored_metadata = pickle.load(handle)
    except Exception as err:
        print(f"Error loading FAISS index or metadata: {err}")
        raise
    return faiss_index, stored_metadata

def build_and_save_faiss_database(pdf_folder, output_folder, model_name="all-MiniLM-L6-v2",
                                  chunk_size=500, chunk_overlap=50):
    """
    Extract texts from PDFs, build FAISS index, and save both index and metadata.

    Args:
        pdf_folder: Directory containing the source PDFs.
        output_folder: Base directory; the index is written to
            ``index/faiss_index.bin`` and the metadata to
            ``index/metadata.pkl`` beneath it.
        model_name: Name passed to ``SentenceTransformer``.
        chunk_size: Forwarded to load_and_split_texts_from_pdfs.
        chunk_overlap: Forwarded to load_and_split_texts_from_pdfs.

    Raises:
        ValueError: If no text chunk could be extracted from ``pdf_folder``.
    """
    index_path = os.path.join(output_folder, "index/faiss_index.bin")
    metadata_path = os.path.join(output_folder, "index/metadata.pkl")

    # Chunking is configurable here so callers do not have to re-implement
    # the pipeline just to tune chunk size or overlap.
    texts, metadata = load_and_split_texts_from_pdfs(pdf_folder, chunk_size, chunk_overlap)
    if not texts:
        raise ValueError("No valid texts found in PDF folder!")

    # Initialize the embedding model and embed every chunk; row i of the
    # embeddings corresponds to metadata[i].
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, show_progress_bar=True)
    index = create_faiss_index(embeddings)

    # Save the index and metadata
    save_faiss_index(index, metadata, index_path, metadata_path)

def query_faiss_index(query, index, metadata, model, top_k=5):
    """
    Query FAISS index and return top results with metadata.

    Args:
        query: Query text; embedded with ``model.encode([query])``.
        index: Object exposing ``search(embedding, top_k)`` that returns
            ``(distances, indices)``.
        metadata: Sequence whose position ``i`` describes vector ``i``.
        model: Embedding model exposing ``encode``.
        top_k: Number of neighbours requested from the index.

    Returns:
        list: The metadata entries for the valid hits, in the order the index
        returned them; an empty list if the lookup fails (the error is
        printed, not raised).
    """
    try:
        query_embedding = model.encode([query])
        _, indices = index.search(query_embedding, top_k)

        # FAISS pads the result with -1 when fewer than top_k vectors are
        # available. A negative id must be rejected explicitly, otherwise
        # metadata[-1] would silently return the last chunk as a bogus hit.
        return [metadata[idx] for idx in indices[0] if 0 <= idx < len(metadata)]
    except Exception as e:
        print(f"Error querying FAISS index: {e}")
        return []

def ollama_generate(query, context, api_url="http://localhost:11434/api/generate", model="llama3.2", timeout=300):
    """
    Generate an answer using the Ollama API with context, with error handling.

    Args:
        query: The user's question, inserted into the prompt.
        context: Retrieved text, inserted into the prompt ahead of the question.
        api_url: URL the JSON payload is POSTed to.
        model: Model name sent in the payload.
        timeout: Seconds passed to ``requests.post`` as ``timeout``; pass
            ``None`` to wait indefinitely.

    Returns:
        str: The ``"response"`` field of the JSON reply, ``"No response from
        API."`` when that field is absent, or an ``"Error: ..."`` string when
        the request fails. This function does not raise.
    """
    prompt = f"""
    Use the following context to answer the question:

    Context:
    {context}

    Question:
    {query}

    Answer concisely:
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        # Without a timeout, requests waits forever on a server that accepts
        # the connection but never answers, which would hang the notebook.
        response = requests.post(api_url, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json().get("response", "No response from API.")
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to Ollama API: {e}")
        return "Error: Unable to connect to API."
    except Exception as e:
        print(f"Unexpected error: {e}")
        return "Error: Unexpected issue during API call."

def rag_with_faiss_database(query, output_folder, model_name="all-MiniLM-L6-v2", api_url="http://localhost:11434/api/generate"):
    """
    Perform RAG using pre-built FAISS database.

    Args:
        query: The question to answer.
        output_folder: Base directory holding ``index/faiss_index.bin`` and
            ``index/metadata.pkl``.
        model_name: Name passed to ``SentenceTransformer`` to embed the query.
        api_url: Ollama endpoint forwarded to ollama_generate.

    Returns:
        tuple[str, str]: ``(answer, context)``. When the index yields no
        results, ``answer`` is an explanatory message and ``context`` is the
        empty string, so callers can always unpack two values.

    Raises:
        Exception: Whatever load_faiss_index raises when the index or the
            metadata cannot be read.
    """
    index_path = os.path.join(output_folder, "index/faiss_index.bin")
    metadata_path = os.path.join(output_folder, "index/metadata.pkl")

    # Load FAISS database
    index, metadata = load_faiss_index(index_path, metadata_path)
    model = SentenceTransformer(model_name)

    # Query the database
    results = query_faiss_index(query, index, metadata, model)

    if not results:
        # Keep the return shape identical to the success path; returning a
        # bare string here made `answer, context = ...` raise on unpacking.
        return "No relevant results found in the database.", ""

    # Compile context
    context = "\n\n".join(res["text"] for res in results)

    # Generate response using Ollama
    answer = ollama_generate(query, context, api_url)
    return answer, context

In [12]:
# Where the source PDFs live, and where the FAISS index and metadata are written.
pdf_folder = "RAG/papers"
output_folder = "RAG/FAISS"

# Build the index once; the query cell reads it back from output_folder.
build_and_save_faiss_database(
    pdf_folder=pdf_folder,
    output_folder=output_folder,
    model_name="all-MiniLM-L6-v2",
)

Batches: 100%|██████████| 58/58 [00:09<00:00,  5.85it/s]

FAISS index and metadata saved to RAG/FAISS/index/faiss_index.bin and RAG/FAISS/index/metadata.pkl





In [13]:
# Ask one question against the saved index; the embedding model must match the
# one used when the index was built.
query = "What is the main concept of metacognition?"
answer, context = rag_with_faiss_database(
    query,
    output_folder,
    model_name="all-MiniLM-L6-v2",
    api_url="http://localhost:11434/api/generate",
)

# Show the generated answer first, then the retrieved context it was given.
print("Generated Answer:", answer)
print("Context:", context)

Generated Answer: The main concept of metacognition is the ability to think about, understand, and manage one's own cognitive processes, including knowledge about learning, monitoring, and regulating one's own thoughts, actions, and cognitions.
Context: control and evaluation of them.  
For Aguirre (2016) , metacognition is the ability of thinking that allows knowing what is known, planni ng 
strategies to do the action of knowing, being aware of thoughts during the knowledge process, and reflecting 
and evaluating the moments and actio ns of the knowledge process. McCluskey , Treffinger,  Baker and 
Lamoureux  (2013) Says that metacognition is the awareness of the learning processes itself, the strengths and

among other things, to the active monitoring and consequent regulation and orchestration of these processes in relation to 
the cognitive objects or data on which they bear, usually in se rvice of some concrete goal or objective."[10]. In essence, 
metacognition is  the knowledge