In [1]:
import datetime
import json
import os
import re

import pandas as pd
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
from tqdm.auto import tqdm

from extractor import Store

# Never truncate column contents when pandas objects are displayed.
pd.set_option("display.max_colwidth", None)
# Enable tqdm's pandas support.
tqdm.pandas()

# Presumably loads variables from a local .env file so that later
# os.environ lookups (e.g. HUGGINGFACEHUB_API_TOKEN) succeed; kept as the
# cell's last expression so its return value is displayed.
load_dotenv()

True

In [None]:
# Repo id of the model used to generate questions.
QUESTION_GEN_LLM = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
# Generation settings; unpacked as keyword arguments where the endpoint is built.
QUESTION_GEN_LLM_CONFIG = {
    "max_new_tokens": 256,
    "top_k": 10,
    "top_p": 0.95,
    "typical_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
}

# `datetime` is imported as a module, so the class has to be reached as
# `datetime.datetime`; a bare `datetime.now()` raises AttributeError.
now = datetime.datetime.now()
# Run timestamp (day.month.year_hour.minute) used in the output file name.
date_time_str = now.strftime("%d.%m.%Y_%H.%M")

In [2]:
# Vector store handle named "main", pointed at ./chroma_langchain_db.
# NOTE(review): `presist_dir` looks like a misspelling of "persist_dir", but it
# must match the keyword declared by extractor.Store - confirm there before renaming.
vec_store = Store("main",presist_dir="./chroma_langchain_db")
# Prepare the store for use; what setup() does lives in extractor.Store (not visible here).
vec_store.setup()

## Dumbassery
Currently we use a rather crude method: we manually pick the docs that serve as context, and the model generates questions from that context.

In the future maybe we could use something like [AutoRAG](https://docs.auto-rag.com/)

### Page context tuple generation from doc

In [3]:
def divide_chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Args:
        l: Any object supporting ``len()`` and slicing (list, tuple, str, ...).
        n: Slice length, also used as the step between slice starts.

    Yields:
        ``l[start:start + n]`` for ``start`` in ``0, n, 2n, ...``; the last
        slice may be shorter than ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [4]:
# Number of page ids grouped into one context tuple.
TUP_PAGE_COUNT = 2
# Maximum number of questions kept per context.
QUESTION_PER_CTX_COUNT = 5
context_doc_tups = []

all_docs_pages = vec_store.store.get()
# Ids are split on "_"; the first component is used as the document name.
all_docs_pages_ids = [tuple(doc.split("_")) for doc in all_docs_pages["ids"]]
doc_names = {doc_id[0] for doc_id in all_docs_pages_ids}

# Group page ids by document in a single pass. Keys follow first-seen order in
# the store output, so the resulting context order is the same on every run
# (iterating the `doc_names` set would not guarantee that).
pages_by_doc = {}
for page_id in all_docs_pages["ids"]:
    pages_by_doc.setdefault(page_id.split("_")[0], []).append(page_id)

# Per document: list of page-id chunks, each holding at most TUP_PAGE_COUNT ids.
all_docs = {
    doc_name: list(divide_chunks(page_ids, TUP_PAGE_COUNT))
    for doc_name, page_ids in pages_by_doc.items()
}

In [5]:
# Per document: one tuple of page texts for every chunk of page ids.
# NOTE: Maybe add zip with page_doc names?
# NOTE(review): assumes store.get(chunk) returns documents in the order of the
# requested ids - confirm.
all_docs_texts = {
    doc_name: [tuple(vec_store.store.get(chunk)["documents"]) for chunk in chunks]
    for doc_name, chunks in all_docs.items()
}

# Rebuild the flat list here instead of extending a list created in an earlier
# cell, so re-running this cell does not append the same contexts twice.
context_doc_tups = [
    context_tup
    for context_tups in all_docs_texts.values()
    for context_tup in context_tups
]

context_doc_tups[0]

('<SENT 01_01>\nAssociations of polymorphisms of eight muscle- or metabolism-related genes\nwith performance in Mount Olympus marathon runners\nGeorgios I. Tsianos,1 Evangelos Evangelou,1 Arnoud Boot,2 M. Carola Zillikens,2 Joyce B. J. van Meurs,2\nAndre G. Uitterlinden,2,3 and John P. A. Ioannidis1,4\n1Department of Hygiene and Epidemiology, University of Ioannina School of Medicine, Ioannina, Greece; Departments\nof 2Internal Medicine and3Epidemiology, Erasmus Medical Center, Rotterdam, The Netherlands; and4Center for Genetic\nEpidemiology and Modeling, Institute for Clinical Research and Health Policy Studies, Tufts Medical Center, Tufts University\nSchool of Medicine, Boston, Massachusetts\nTsianos GI, Evangelou E, Boot A, Zillikens MC, van Meurs JB,\nUitterlinden AG, Ioannidis JP.Associations of polymorphisms of eight\nmuscle- or metabolism-related genes with performance in Mount Olympus marathon runners.J Appl Physiol108: 567–574,2010.\n</SENT 01_01>\n<SENT 01_02>\nFirst publishe

### Question Generation

In [6]:
# Prompt sent to the question-generation model. `{context}` is the only
# placeholder; the text ends with a "Questions:" cue, and the generation loop
# splits the model's reply on newlines to obtain individual questions.
QUESTION_GEN_PROMPT = """
You are a question-generation assistant. Your sole task is to generate thoughtful, clarifying, and exploratory questions based on the provided context.  

Context: {context}

Provide the questions as follows:
Questions:
1.
2.
3.
...

Generate only questions that aim to explore or refine the given context further. Avoid any assumptions, interpretations, or answers—just ask questions. Seperate different question with a newline

Questions:
"""
# Template object built from the string above; filled with {"context": ...} at invoke time.
prompt = PromptTemplate.from_template(QUESTION_GEN_PROMPT)

In [7]:
# Endpoint client for the model named in QUESTION_GEN_LLM. The API token is
# read from the environment (KeyError if HUGGINGFACEHUB_API_TOKEN is unset) and
# the generation settings are unpacked from QUESTION_GEN_LLM_CONFIG.
llm = HuggingFaceEndpoint(
    repo_id=QUESTION_GEN_LLM,
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    **QUESTION_GEN_LLM_CONFIG
)
# Pipe the prompt template into the model; the chain is invoked with
# {"context": ...} and its result is treated as a string by the caller.
llm_chain = prompt | llm

In [13]:
# One (context_text, questions) pair per context tuple.
question_context_pair = []
for context in tqdm(context_doc_tups, desc="Generating questions"):
    # Page texts of one context tuple, separated by a blank line.
    ctx_text = "\n\n".join(context)
    raw_output = llm_chain.invoke({"context": ctx_text})
    # Strip each line and drop empty ones *before* truncating, so leading or
    # interleaved blank lines in the reply do not use up question slots.
    questions = [
        line.strip() for line in raw_output.split("\n") if line.strip()
    ][:QUESTION_PER_CTX_COUNT]
    question_context_pair.append((ctx_text, questions))

  0%|          | 0/15 [00:00<?, ?it/s]

In [14]:
# Inspect the generated (context, questions) pairs; this displays the whole list.
question_context_pair

[('<SENT 01_01>\nAssociations of polymorphisms of eight muscle- or metabolism-related genes\nwith performance in Mount Olympus marathon runners\nGeorgios I. Tsianos,1 Evangelos Evangelou,1 Arnoud Boot,2 M. Carola Zillikens,2 Joyce B. J. van Meurs,2\nAndre G. Uitterlinden,2,3 and John P. A. Ioannidis1,4\n1Department of Hygiene and Epidemiology, University of Ioannina School of Medicine, Ioannina, Greece; Departments\nof 2Internal Medicine and3Epidemiology, Erasmus Medical Center, Rotterdam, The Netherlands; and4Center for Genetic\nEpidemiology and Modeling, Institute for Clinical Research and Health Policy Studies, Tufts Medical Center, Tufts University\nSchool of Medicine, Boston, Massachusetts\nTsianos GI, Evangelou E, Boot A, Zillikens MC, van Meurs JB,\nUitterlinden AG, Ioannidis JP.Associations of polymorphisms of eight\nmuscle- or metabolism-related genes with performance in Mount Olympus marathon runners.J Appl Physiol108: 567–574,2010.\n</SENT 01_01>\n<SENT 01_02>\nFirst publish

In [15]:
# Persist the generated pairs; the file name carries the run timestamp from
# the configuration cell. `json` is imported in the notebook's import cell.
# Tuples are serialised as JSON arrays.
output_path = f"eval_data_generated_{date_time_str}.json"
# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII characters in the
# context text readable in the file instead of \uXXXX escapes.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(question_context_pair, f, ensure_ascii=False)