In [94]:
!pip3 install langchain-openai langchain-core langchain-community pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import json
import random

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Tuple
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# LLMChain is provided by the `langchain` package, not by `langchain_core`;
# ChatPromptTemplate is already imported above from langchain_core.prompts.
from langchain.chains import LLMChain

In [96]:
# Read the OpenAI API key from the JSON config two directories up.
# `.get` yields None when the key is absent.
with open('../../config.json') as config_file:
    config = json.load(config_file)
openai_api_key = config.get("openai_api_key")

# Read the PDF paths from the shared globals file one directory up.
# Note that `config` now refers to the contents of globals.json.
with open('../globals.json') as config_file:
    config = json.load(config_file)
main_file = config.get("main_pdf")
negative_file = config.get("negative_pdf")

In [97]:
# Load the main PDF; `load()` returns one document object per page.
pdf_loader = PyPDFLoader(main_file)
pdf_pages = pdf_loader.load()

# Build the full text in a single join instead of repeated `+=`.
# Pages are separated by a newline so the last word of one page is not
# fused with the first word of the next before the text is chunked.
pdf_document = "\n".join(page.page_content for page in pdf_pages)

Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 193 0 (offset 0)
Ignoring wrong pointing object 195 0 (offset 0)
Ignoring wrong pointing object 197 0 (offset 0)
Ignoring wrong pointing object 200 0 (offset 0)
Ignoring wrong pointing object 217 0 (offset 0)
Ignoring wrong pointing object 219 0 (offset 0)
Ignoring wrong pointing object 221 0 (offset 0)
Ignoring wrong pointing object 224 0 (offset 0)
Ignoring wrong pointing object 298 0 (offset 0)
Ignoring wrong pointing object 300 0 (offset 0)
Ignoring wrong pointing object 302 0 (offset 0)
Ignoring wrong pointing object 304 0 (offset 0)
Ignoring wrong pointing object 308 0 (offset 0)
Ignoring wrong pointing object 310 0 (offset 0)
Ignoring wrong pointing object 312 0 (offset 0)
Ignoring wrong pointing object 314 0 (offset 0)
Ignoring wrong pointing object 354 0 (offset

In [98]:
# Splitter settings: sizes are passed to the tiktoken-based splitter for the
# "gpt-4" encoding. The overlap is half the chunk size, so consecutive chunks
# share a large part of their content.
splitter_settings = {
    "model_name": "gpt-4",
    "chunk_size": 800,
    "chunk_overlap": 400,
}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_settings)

# Split the concatenated PDF text into a list of chunk strings.
document_chunks = text_splitter.split_text(pdf_document)

In [None]:
def process_chunk(llm_chain, chunk):
    """Generate question entries for a single chunk; errors are reported, not raised.

    Args:
        llm_chain: Object exposing ``run(chunk=...)`` that returns the model reply as text.
        chunk: The document chunk the questions are generated for.

    Returns:
        A list of ``{'question': <text>, 'chunk': chunk}`` dicts, one per non-empty
        string value of the JSON object in the reply. An empty list is returned when
        the call fails or the reply cannot be parsed as a JSON object.
    """
    try:
        result = llm_chain.run(chunk=chunk)
        print(result)

        # The reply may be wrapped in ``` fences or surrounded by extra text even
        # though the prompt forbids it; keep only the outermost {...} span so such
        # replies are still usable instead of discarding the whole chunk.
        payload = result.strip()
        start, end = payload.find("{"), payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start:end + 1]

        questions = json.loads(payload)
        if not isinstance(questions, dict):
            raise ValueError(
                f"expected a JSON object of questions, got {type(questions).__name__}")

        # Skip values that are not usable question text (nested objects, numbers, blanks).
        return [
            {'question': q_text, 'chunk': chunk}
            for q_text in questions.values()
            if isinstance(q_text, str) and q_text.strip()
        ]
    except Exception as e:
        # Deliberate best-effort: one bad chunk must not abort the whole batch.
        print(f"Error processing chunk: {e}")
        return []


def process_chunks_and_split_parallel(document_chunks: List[str],
                                      openai_api_key: str,
                                      train_ratio: float = 0.8,
                                      seed=None,
                                      max_workers=None) -> Tuple[List[Dict], List[Dict]]:
    """
    Generate questions for every document chunk in parallel and split them
    into training and validation sets.

    Args:
        document_chunks: List of text chunks from the document
        openai_api_key: OpenAI API key
        train_ratio: Ratio of data to use for training (default 0.8 for 80/20 split);
            must lie between 0 and 1
        seed: Optional int. When given, the shuffle uses its own random generator
            seeded with this value; when None (default) the module-level
            ``random.shuffle`` is used, as before
        max_workers: Optional int passed to ThreadPoolExecutor to cap the number of
            concurrent requests; None (default) keeps the executor's default

    Returns:
        Tuple of (training_data, validation_data); each element is a list of
        ``{'question': ..., 'chunk': ...}`` dicts. ``([], [])`` is returned for an
        empty chunk list without creating the LLM client.

    Raises:
        ValueError: If train_ratio is outside the range [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")
    if not document_chunks:
        return [], []

    label_template = """
You are an AI assistant tasked with generating twenty similar questions based on a given document. The questions should be something a user might naturally ask when seeking information contained in the document.

Given: {chunk}

Instructions:

Analyze the key topics, facts, and concepts in the given document, and choose one to focus on.
Generate twenty similar questions that a user might ask to find the information in this document that does NOT contain any company name.
Use natural language and occasionally include typos or colloquialisms to mimic real user behavior in the questions.
Ensure the questions are semantically related to the document content WITHOUT directly copying phrases.
Make sure that all of the questions are similar to each other, i.e., all asking about a similar topic or requesting the same information.
Output Format: Return a JSON object with the following structure: {{ "question_1": "Generated question text", "question_2": "Generated question text", ... }}

Be creative, think like a curious user, and generate your 20 similar questions that would naturally lead to the given document in a semantic search. Ensure your response is a valid JSON object containing only the questions.

Example 1: Given: The history of Ancient Egypt includes the building of the pyramids, the reign of pharaohs, and advances in writing, architecture, and medicine. The culture and religion of Egypt evolved over centuries and was marked by reverence for gods such as Ra, Isis, and Osiris. 

Output: {{ "question_1": "What were the main achievements of Ancient Egypt?", "question_2": "Who were some of the prominent gods in Egyptian mythology?", "question_3": "Can you tell me about the pharaohs of Ancient Egypt?" }}
Example 2: Given: The process of photosynthesis in plants involves the absorption of sunlight by chlorophyll, which then converts carbon dioxide and water into glucose and oxygen. This process is crucial for plant growth and contributes to the oxygen supply in Earth's atmosphere. 

Output: {{ "question_1": "How does photosynthesis work in plants?", "question_2": "What role does chlorophyll play in photosynthesis?", "question_3": "Can you explain the steps of photosynthesis?" }}
Using these examples as a guide, analyze the key concepts in the document chunk and generate similar questions in JSON format. Ensure that the output starts with curly braces, and don't include backticks or the word json."""

    label_prompt = ChatPromptTemplate.from_template(label_template)
    llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini",
                     openai_api_key=openai_api_key)
    llm_chain = LLMChain(llm=llm, prompt=label_prompt)

    all_question_entries = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_chunk, llm_chain, chunk)
            for chunk in document_chunks
        ]
        # Gather in submission order rather than completion order so the list
        # handed to the shuffle does not depend on thread timing; the requests
        # themselves still run concurrently.
        for future in futures:
            result = future.result()
            if result:
                all_question_entries.extend(result)

    # NOTE(review): the split is made over individual questions, so questions
    # generated from the same chunk can land in both the training and the
    # validation set.
    if seed is None:
        random.shuffle(all_question_entries)
    else:
        random.Random(seed).shuffle(all_question_entries)

    split_idx = int(len(all_question_entries) * train_ratio)
    train_data = all_question_entries[:split_idx]
    val_data = all_question_entries[split_idx:]

    return train_data, val_data


def write_json_files(train_data: List[Dict],
                     val_data: List[Dict],
                     train_path: str = "../data/train.json",
                     val_path: str = "../data/validation.json"):
    """
    Write the training and validation data to JSON files.

    Missing parent directories of either path are created first, so the call
    does not fail when the output folder does not exist yet. Both files are
    written as UTF-8 with non-ASCII characters kept as-is and a 2-space indent.

    Args:
        train_data: List of training data dictionaries
        val_data: List of validation data dictionaries
        train_path: Path to save training data (default: "../data/train.json")
        val_path: Path to save validation data (default: "../data/validation.json")
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    for data, path in ((train_data, train_path), (val_data, val_path)):
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Written {len(train_data)} examples to {train_path}")
    print(f"Written {len(val_data)} examples to {val_path}")

In [100]:
# Generate questions for every chunk and split them with the default train ratio.
train_data, val_data = process_chunks_and_split_parallel(document_chunks, openai_api_key)

# Persist both splits to the default output paths.
write_json_files(train_data=train_data, val_data=val_data)

{ 
  "question_1": "What are the phases in the evolution of language models?", 
  "question_2": "How do statistical language models differ from neural language models?", 
  "question_3": "What’s the significance of word embeddings in language modeling?", 
  "question_4": "Can you explain the concept of pre-trained language models?", 
  "question_5": "What are some examples of pre-trained language models and their functions?", 
  "question_6": "How did ELMo improve upon earlier language models?", 
  "question_7": "What techniques are used in BERT for language understanding?", 
  "question_8": "Why is the n-gram model important in the history of language modeling?", 
  "question_9": "What limitations do high-order language models face?", 
  "question_10": "How do neural networks enhance language modeling?", 
  "question_11": "What role do LSTM networks play in modern language models?", 
  "question_12": "How does the transformer architecture contribute to language processing?", 
  "quest

In [101]:
# Sanity check: show the first training entry (a dict with 'question' and 'chunk' keys).
print(train_data[0])

{'question': 'How do LLMs interpret visual inputs for navigation?', 'chunk': 'trained vision-language features with a 3D reconstruction of the physical world. VLMaps, \nwhen combined with an LLM, translate spatially organized sequences of open-vocabulary \nnavigation goals (e.g., “between the sofa and the TV”) into natural language commands. \nThese commands can be directly localized on a map and generate new obstacle maps in \nreal-time, facilitated by sharing among various robot types. Extensive experiments \nconducted in both simulated environments (using the Habitat simulator with the \nMatterport3D dataset and the AI2THOR simulator) and real-world settings (with the HSR \nmobile robot for indoor navigation) demonstrated that VLMs can navigate based on more \ncomplex language instructions than previous methods. The reviewed papers in this study \nare summarized in Table 5. \nTable 5. Summary of the reviewed papers in this study. \nName Explanation Ref. \nReward Design in \nRL \n• E