In [13]:
# 1) Imports
import json
from pathlib import Path

import fitz            # PyMuPDF
import pdfplumber      # table extraction
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from huggingface_hub import login, hf_hub_download

# 2) Ingest PDF (text, tables, images)
PDF = Path("data/thesis.pdf")
TXT_DIR = Path("data/text"); TXT_DIR.mkdir(exist_ok=True, parents=True)
TBL_DIR = Path("data/tables"); TBL_DIR.mkdir(exist_ok=True, parents=True)
IMG_DIR = Path("data/images"); IMG_DIR.mkdir(exist_ok=True, parents=True)
META   = Path("data/metadata.json")

doc = fitz.open(PDF)
meta = []
with pdfplumber.open(PDF) as pp:
    for i,(page,ppage) in enumerate(zip(doc, pp.pages), start=1):
        # text
        txt = page.get_text()
        tf = TXT_DIR/f"page_{i:03d}.txt"
        tf.write_text(txt, encoding="utf-8")
        meta.append({"type":"text","path":str(tf),"page":i})
        # tables
        for ti,tbl in enumerate(ppage.extract_tables()):
            df = pd.DataFrame(tbl[1:], columns=tbl[0])
            if not df.empty:
                f = TBL_DIR/f"table_{i:03d}_{ti}.csv"
                df.to_csv(f, index=False)
                meta.append({"type":"table","path":str(f),"page":i,"rows":df.shape[0]})
        # images
        for j,img in enumerate(page.get_images(full=True)):
            xref = img[0]; pix = fitz.Pixmap(doc, xref)
            if pix.n>4: pix = fitz.Pixmap(fitz.csRGB, pix)
            ip = IMG_DIR/f"image_{i:03d}_{j}.png"
            pix.save(ip); pix=None
            meta.append({"type":"image","path":str(ip),"page":i})

with open(META, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)
print(f"✅ Ingested {len(meta)} items → {META}")

# 3) Chunk & build FAISS index
with open(META, "r", encoding="utf-8") as f:
    items = json.load(f)
texts  = {m["path"]:m for m in items if m["type"]=="text"}
tables = [m for m in items if m["type"]=="table"]

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = []
for path,m in texts.items():
    txt = Path(path).read_text(encoding="utf-8")
    for idx,chunk in enumerate(splitter.split_text(txt)):
        docs.append(Document(page_content=chunk,
                             metadata={"source":path,"page":m["page"],"chunk":idx}))
for m in tables:
    df = pd.read_csv(m["path"])
    for idx,row in df.iterrows():
        line = " | ".join(f"{c}: {row[c]}" for c in df.columns)
        docs.append(Document(page_content=line,
                             metadata={"source":m["path"],"page":m["page"],"row":idx}))

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs  = FAISS.from_documents(docs, emb)
vs.save_local("faiss_index")
print(f"✅ FAISS index built with {len(docs)} chunks → faiss_index/")

# 4) Download Qwen2.5-Omni-7B GGUF (4-bit small)
login()  # paste your HF token
REPO = "mradermacher/Qwen2.5-Omni-7B-GGUF"
MDIR = Path("models/Qwen2.5-Omni-7B"); MDIR.mkdir(exist_ok=True, parents=True)
GGUF = "Qwen2.5-Omni-7B.Q4_K_S.gguf"
model_path = MDIR/GGUF
if not model_path.exists():
    hf_hub_download(repo_id=REPO, filename=GGUF,
                    local_dir=str(MDIR), local_dir_use_symlinks=False)
print(f"✅ Model ready at {model_path}")

# 5) Load LLM with GPU layers & build RetrievalQA chain
llm = LlamaCpp(
    model_path=str(model_path),
    n_ctx=4096,
    n_threads=8,
    n_gpu_layers=12,     # offload first 12 layers to GPU
    temperature=0.1,     # Lower temperature for more focused answers
    max_tokens=300,      # Limit response length
    top_p=0.9,          # Nucleus sampling
    repeat_penalty=1.1   # Reduce repetition
)

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs  = FAISS.load_local("faiss_index", emb,
                       allow_dangerous_deserialization=True)

retriever = vs.as_retriever(search_type="mmr", top_k=5, fetch_k=10)

# Optimized prompt for concise, direct answers
prompt_template = PromptTemplate(
    input_variables=["context","question"],
    template=(
        "You are a helpful assistant that answers questions based on the provided context.\n"
        "Use ONLY the information from the context below to answer the question.\n"
        "Be concise and direct. Do not add disclaimers or assumptions.\n"
        "If the context doesn't contain enough information, simply state that.\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n\n"
        "Answer:"
    )
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={
        "prompt": prompt_template
    }
)

# 6) Test with answer post-processing
def clean_answer(answer):
    """Clean up verbose LLM responses"""
    lines = answer.split('\n')
    # Take only the first substantial line that doesn't contain disclaimers
    for line in lines:
        line = line.strip()
        if line and not any(phrase in line.lower() for phrase in 
                          ['note:', 'disclaimer:', 'assumption', 'if there', 'additionally']):
            return line
    return answer.strip()

res = qa({"query":"what type of models are the best performing mdoels in this thesis and thier accuracy?"})
cleaned_answer = clean_answer(res["result"])
print("\n📝 Answer:\n", cleaned_answer)
for doc in res["source_documents"][:5]:
    print(f"• Page {doc.metadata['page']}: {Path(doc.metadata['source']).name}")

✅ Ingested 224 items → data\metadata.json
✅ FAISS index built with 403 chunks → faiss_index/


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

llama_model_loader: loaded meta data with 36 key-value pairs and 339 tensors from models\Qwen2.5-Omni-7B\Qwen2.5-Omni-7B.Q4_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2vl
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen2.5 Omni 7B
llama_model_loader: - kv   3:                           general.basename str              = Qwen2.5-Omni
llama_model_loader: - kv   4:                         general.size_label str              = 7B
llama_model_loader: - kv   5:                            general.license str              = other
llama_model_loader: - kv   6:                       general.license.name str              = apache-2.0
llama_model_loader: - kv   7:      

✅ Model ready at models\Qwen2.5-Omni-7B\Qwen2.5-Omni-7B.Q4_K_S.gguf


init_tokenizer: initializing tokenizer for type 2
load: control token: 151661 '<|fim_suffix|>' is not marked as EOG
load: control token: 151647 '<|audio_bos|>' is not marked as EOG
load: control token: 151649 '<|box_end|>' is not marked as EOG
load: control token: 151655 '<|IMAGE|>' is not marked as EOG
load: control token: 151654 '<|vision_pad|>' is not marked as EOG
load: control token: 151659 '<|fim_prefix|>' is not marked as EOG
load: control token: 151656 '<|VIDEO|>' is not marked as EOG
load: control token: 151653 '<|vision_eos|>' is not marked as EOG
load: control token: 151644 '<|im_start|>' is not marked as EOG
load: control token: 151646 '<|AUDIO|>' is not marked as EOG
load: control token: 151648 '<|audio_eos|>' is not marked as EOG
load: control token: 151650 '<|quad_start|>' is not marked as EOG
load: control token: 151651 '<|quad_end|>' is not marked as EOG
load: control token: 151652 '<|vision_bos|>' is not marked as EOG
load: control token: 151660 '<|fim_middle|>' is no


📝 Answer:
 The best performing models in this thesis are SVM, with an accuracy of 92.06% and an AUC-ROC of 97.31%.
• Page 83: page_083.txt
• Page 28: page_028.txt
• Page 20: page_020.txt
• Page 7: page_007.txt
