# NoLimit Data Scientist Technical Test - RAG Chatbot

## 0. Instalasi dependensi

In [None]:
!pip install --upgrade pymupdf
!pip install tools

In [None]:
import os
import re
import fitz

## 1. Ekstraksi teks dari PDF

In [None]:
# Clone repo github untuk akses file PDF
!git clone https://github.com/salmadanu/nolimit-ds-test-salmanadhira.git

In [None]:
def extract_text_from_pdf_folder(folder_path):
    """Extract the text of every PDF in a folder, page by page.

    Args:
        folder_path: Directory to scan (not recursive). Entries whose name
            ends in ".pdf", in any letter case, are opened with PyMuPDF.

    Returns:
        A dict mapping each PDF filename to a dict of
        ``{page_number: page_text}``. Page numbers start at 1 so they can
        be cited as sources later on.
    """
    texts = {}
    # Sorted so the result order does not depend on the order in which the
    # file system happens to list the directory.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        # The context manager closes the document even when extracting a
        # page raises.
        with fitz.open(pdf_path) as doc:
            texts[filename] = {
                page_num: page.get_text()
                for page_num, page in enumerate(doc, start=1)
            }
    return texts

In [None]:
# Extract the per-page text of every PDF in the cloned dataset folder.
folder_path = "/content/nolimit-ds-test-salmanadhira/dataset"
pdf_texts = extract_text_from_pdf_folder(folder_path)

# Check: print the first 500 characters of page 1 of the first extracted file.
sample_file = list(pdf_texts.keys())[0]
print(f"Sample File: {sample_file}")
print(pdf_texts[sample_file][1][:500])

## 2. Praproses data
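Memotong teks setiap halaman mulai dari kata Acknowledgment/References/Bibliography, lalu merapikan whitespace. (Sitasi di dalam teks dan header halaman belum dihapus oleh kode di bawah.)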

In [None]:
# Words that mark the start of back matter. "Acknowledge?ments?" covers
# Acknowledgment, Acknowledgments, Acknowledgement and Acknowledgements.
_BACK_MATTER_RE = re.compile(
    r"\b(?:Acknowledge?ments?|References|Bibliography)\b",
    flags=re.IGNORECASE,
)


def preprocess_text(text):
    """Drop back matter from a page of text and normalize its whitespace.

    Everything from the first standalone occurrence of an acknowledgment,
    "References" or "Bibliography" keyword (case-insensitive) to the end of
    ``text`` is discarded. Every run of whitespace, newlines included, is
    then collapsed to a single space.

    NOTE: the match is not anchored to heading lines, so the same keywords
    used mid-sentence also cut the text at that point.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    body = _BACK_MATTER_RE.split(text, maxsplit=1)[0]
    return re.sub(r"\s+", " ", body).strip()

In [None]:
# Clean every page of every file, keeping the filename -> page -> text layout.
preprocessed_texts = {
    filename: {
        page_num: preprocess_text(text)
        for page_num, text in pages.items()
    }
    for filename, pages in pdf_texts.items()
}

# Check: show the start of the first cleaned page of the first file.
sample_file = list(preprocessed_texts)[0]
sample_page = list(preprocessed_texts[sample_file])[0]
print(f"Sample File: {sample_file}")
print(f"Sample Page: {sample_page}")
print(preprocessed_texts[sample_file][sample_page][:500])

## 3. Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_pdfs(pdf_texts, chunk_size=400, chunk_overlap=50):
    """Split every page into overlapping character chunks with provenance.

    Args:
        pdf_texts: Dict of ``{filename: {page_number: text}}``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter unchanged.

    Returns:
        A flat list of dicts with the keys "filename", "page_number",
        "chunk_id" and "text". "chunk_id" is the chunk's index within its
        page, so it restarts at 0 on every page.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    return [
        {
            "filename": pdf_name,
            "page_number": page_no,
            "chunk_id": idx,
            "text": piece,
        }
        for pdf_name, page_map in pdf_texts.items()
        for page_no, page_text in page_map.items()
        for idx, piece in enumerate(text_splitter.split_text(page_text))
    ]

In [None]:
# Chunk the cleaned pages using the default chunk size and overlap.
chunks = chunk_pdfs(preprocessed_texts)

# Check: report how many chunks were produced and show the first one.
print(f"Total chunks: {len(chunks)}")
print(chunks[0])

## 4. Menambahkan chunks ke database vektor

In [None]:
!pip install faiss-cpu langchain sentence-transformers langchain_huggingface

In [None]:
!pip install -U langchain-community

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model shared by index construction, index loading and
# query embedding in the cells below.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

def build_faiss_vectorstore(chunks):
    """Embed the chunk texts and index them in a FAISS vector store.

    Args:
        chunks: Sequence of dicts with the keys "text", "filename",
            "page_number" and "chunk_id".

    Returns:
        The vector store built by ``FAISS.from_texts`` with the module-level
        ``embedding_model``. Each entry carries its filename, page number
        and chunk id as metadata.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message instead of failing inside the library.
    # NOTE(review): from_texts appears to raise an opaque error on empty
    # input; confirm against the installed langchain version.
    if not chunks:
        raise ValueError(
            "Cannot build a FAISS index from an empty list of chunks."
        )

    metadata_keys = ("filename", "page_number", "chunk_id")
    texts = [chunk["text"] for chunk in chunks]
    metadatas = [{key: chunk[key] for key in metadata_keys} for chunk in chunks]

    return FAISS.from_texts(texts, embedding_model, metadatas=metadatas)

# Build the index over all chunks; this embeds every chunk text.
vectorstore = build_faiss_vectorstore(chunks)

In [None]:
# Persist the index to the local "faiss_index" folder, then reload it to
# confirm the saved copy is usable.
vectorstore.save_local("faiss_index")
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# the saved store (pickle-based per the LangChain docs; confirm). That is
# acceptable here only because the folder was written by the line above.
# Do not load an index from an untrusted source this way.
vectorstore = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

In [None]:
# Check whether the retrieved documents are relevant to a sample query.
query = "What is stance detection?"
results = vectorstore.similarity_search(query, k=10)

# Show the first 200 characters and the source metadata of each hit.
for res in results:
    print(res.page_content[:200])
    print(res.metadata)

In [None]:
# Check the embedding dimension (to deal with an AssertionError seen at
# deploy time).
sample_vector = embedding_model.embed_query("test query")
print("Embedding dimension:", len(sample_vector))

In [None]:
import faiss
# Dimension stored in the FAISS index, for comparison with the embedding
# model's output length.
print("FAISS index dimension:", vectorstore.index.d)

## 5. Query ke LLM
Generate jawaban dari query berdasarkan chunks

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain_huggingface import HuggingFacePipeline

from transformers import pipeline

import torch

# Text-to-text generation pipeline used to produce the answers.
# Use the first GPU when CUDA is available and fall back to CPU (device=-1)
# otherwise, so the notebook also runs on machines without a GPU.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1,
)

# Wrap the transformers pipeline so it can serve as the LLM of a LangChain chain.
llm = HuggingFacePipeline(pipeline=qa_pipeline)

# Prompt text sent to the LLM. {context} and {question} are the two
# placeholders filled in for each query.
template = """
You are an academic assistant helping summarize research papers.
Use the provided CONTEXT to answer the QUESTION clearly and concisely.
- Write the answer in well-formed sentences, even if the context has fragmented text.
- Do not copy broken words or incomplete phrases directly from the context.
- If the question is about methods or models, list them clearly and EXPLAIN their purpose.
- If needed, cite authors or papers mentioned in the context.
- If the answer cannot be found in the context, say "The context does not provide enough information."

CONTEXT:
{context}

QUESTION: {question}

ANSWER:
"""

def format_doc(doc):
    """Render a retrieved document as its text followed by a source line.

    Args:
        doc: Object with a ``metadata`` mapping and a ``page_content``
            attribute.

    Returns:
        The page content, a newline, then
        "(Source: <filename>, page <page_number>)". A missing filename is
        shown as "unknown" and a missing page number as "?".
    """
    info = doc.metadata
    filename = info.get('filename', 'unknown')
    page = info.get('page_number', '?')
    citation = f"(Source: {filename}, page {page})"
    return f"{doc.page_content}\n{citation}"

# Bind the template to its two input variables and chain it with the LLM.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])
qa_chain = LLMChain(llm=llm, prompt=prompt)

def answer_query(query, vectorstore, k=10, context_k=3, fetch_k=20):
    """Answer a question from the documents retrieved for it.

    Args:
        query: The user's question.
        vectorstore: Vector store providing ``max_marginal_relevance_search``.
        k: Number of documents requested from the MMR search.
        context_k: How many of the returned documents, in the order
            returned, are placed in the prompt. Defaults to 3, the value
            that used to be hard-coded.
        fetch_k: Candidate pool size passed to the MMR search. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The text produced by the module-level ``qa_chain``.
    """
    results = vectorstore.max_marginal_relevance_search(
        query, k=k, fetch_k=fetch_k
    )
    # Only the first `context_k` hits reach the prompt; each carries its
    # "(Source: ...)" line so the model can see where the text came from.
    context = "\n\n".join(format_doc(doc) for doc in results[:context_k])
    return qa_chain.run({"context": context, "question": query})

### Q&A

In [None]:
def clean_answer(text: str) -> str:
    """Tidy up generated text before it is displayed.

    Removes every hyphen that is followed by whitespace, together with that
    whitespace (e.g. "detec- tion" becomes "detection"). Then collapses runs
    of two or more whitespace characters into one space and trims both ends.
    """
    dehyphenated = re.sub(r"-\s+", "", text)
    single_spaced = re.sub(r"\s{2,}", " ", dehyphenated)
    return single_spaced.strip()

In [None]:
# Ask one question and print the cleaned answer with a capitalized first letter.
query = "What is framing analysis in computational media studies??"
answer = clean_answer(answer_query(query, vectorstore))
# answer[:1] instead of answer[0]: an empty answer prints a blank line
# rather than raising IndexError.
print(answer[:1].upper() + answer[1:])

In [None]:
# Ask one question and print the cleaned answer with a capitalized first letter.
query = "How is propaganda detection defined in computational linguistics?"
answer = clean_answer(answer_query(query, vectorstore))
# answer[:1] instead of answer[0]: an empty answer prints a blank line
# rather than raising IndexError.
print(answer[:1].upper() + answer[1:])

In [None]:
# Ask one question and print the cleaned answer with a capitalized first letter.
query = "What deep learning models are commonly applied to propaganda detection?"
answer = clean_answer(answer_query(query, vectorstore))
# answer[:1] instead of answer[0]: an empty answer prints a blank line
# rather than raising IndexError.
print(answer[:1].upper() + answer[1:])

In [None]:
# Ask one question and print the cleaned answer with a capitalized first letter.
query = "How can Twitter data be preprocessed for misinformation detection?"
answer = clean_answer(answer_query(query, vectorstore))
# answer[:1] instead of answer[0]: an empty answer prints a blank line
# rather than raising IndexError.
print(answer[:1].upper() + answer[1:])

In [None]:
# Ask one question and print the cleaned answer with a capitalized first letter.
query = "How is framing analysis applied to coverage of international conflicts?"
answer = clean_answer(answer_query(query, vectorstore))
# answer[:1] instead of answer[0]: an empty answer prints a blank line
# rather than raising IndexError.
print(answer[:1].upper() + answer[1:])

# Interactive Q&A Chatbot

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Single-line text input in which the user types a question.
text_box = widgets.Text(
    value='',
    placeholder='Question on computational media analysis...',
    description='Question:',
    layout=widgets.Layout(width='500px')
)

# Render the widget in the notebook output.
display(text_box)

In [None]:
# Answer whatever is currently typed in the text box.
query = text_box.value
answer = clean_answer(answer_query(query, vectorstore))
# answer[:1] instead of answer[0]: an empty answer prints a blank line
# rather than raising IndexError.
print(answer[:1].upper() + answer[1:])