In [1]:
from langchain.document_loaders import PyPDFLoader

# Source PDF, given relative to the notebook's working directory.
pdf_path = "../synthetics-dataset.pdf"
loader = PyPDFLoader(pdf_path)

# load() yields the extracted documents; each one exposes its text as `page_content`.
documents = loader.load()

# Stitch the text of every loaded document into one newline-separated string.
pdf_text = "\n".join(page.page_content for page in documents)

print(f"Total characters extracted: {len(pdf_text)}")

Total characters extracted: 156911


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunks of at most 1000 units as measured by len(),
# 100 units of overlap, and candidate separators listed coarsest first
# (blank line, newline, space).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_size=1000,
    chunk_overlap=100,
)

# Cut the full PDF text into a list of chunk strings.
chunks = text_splitter.split_text(pdf_text)

# Sanity check: print just the first chunk, numbered from 1.
for chunk_number, preview in enumerate(chunks[:1], start=1):
    print(f"Chunk {chunk_number}:\n{preview}\n")

Chunk 1:
Analisis Statistik Wakanda: Ekonomi,
Sosial, dan Teknologi
Author: Tim Analisis Wakanda
Bab 1. Pendahuluan: Wakanda yang Tersembunyi -
Gambaran Statistik
1.1. Wakanda: Anomali Geopolitik dan Statistik
Subbab ini akan membahas posisi geopolitik unik Wakanda sebagai negara yang tersembunyi
dan anomali statistiknya dibandingkan dengan negara lain. Wakanda, sebuah negara yang
kaya akan sumber daya alam dan teknologi maju, telah lama memilih untuk mengisolasi diri
dari dunia luar. Keputusan ini bukan tanpa alasan; perlindungan budaya, sumber daya
Vibranium, dan kemajuan teknologi yang tak tertandingi menjadi prioritas utama. Isolasi ini
memiliki implikasi yang signiﬁkan terhadap data statistik Wakanda, menjadikannya sulit
untuk dibandingkan dengan negara-negara lain yang lebih terbuka dan terintegrasi secara
global.
Wakanda sering dianggap sebagai 'statistical outlier' karena kemajuan teknologi dan sumber
daya uniknya, Vibranium. Vibranium, logam langka dengan sifat luar biasa, tel

In [3]:
from dotenv import load_dotenv, find_dotenv
# Find a .env file with find_dotenv() and load it; binding the result to `_`
# keeps the cell from displaying the return value.
env_file = find_dotenv()
_ = load_dotenv(env_file)

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used for dataset generation; temperature is pinned to 0.
GEMINI_MODEL = "gemini-2.0-flash"
llm = ChatGoogleGenerativeAI(temperature=0, model=GEMINI_MODEL)

In [5]:
from pydantic import BaseModel, Field
from typing import List, Optional

# One instruction-tuning record with `instruction`, `input` and `output` fields.
# NOTE(review): this model is nested inside InstructionFineTuneDataset, which is
# handed to JsonOutputParser; pydantic appears to surface the class docstring and
# the Field descriptions in the generated JSON schema that ends up in the LLM
# prompt. Treat that text as prompt content and confirm before rewording it.
class InstructionFineTuneData(BaseModel):
    """
    Schema for formatting instruction-tuning dataset for fine-tuning LLMs.
    Ensures consistency and validation across different foundational models.
    """
    
    # Required string: the task or question posed to the model.
    instruction: str = Field(description="The prompt or question given to the model.")
    # Optional string: supporting context; defaults to None when not supplied.
    input: Optional[str] = Field(None, description="Optional input context or passage relevant to the instruction.")
    # Required string: the answer the model is expected to produce.
    output: str = Field(description="The expected response from the model.")

# Top-level container passed to JsonOutputParser(pydantic_object=...): the parsed
# LLM reply is read back through its single `data` key in the generation loop.
# NOTE(review): the docstring and Field description below presumably appear in the
# schema embedded in the prompt — confirm before rewording them.
class InstructionFineTuneDataset(BaseModel):
    """
    Schema for a list of instruction fine-tuning data instances.
    """
    # Required list of InstructionFineTuneData records (`...` marks it as having no default).
    data: List[InstructionFineTuneData] = Field(..., description="List of instruction fine-tuning data samples.")

In [6]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate


# Parser that turns the model's reply into Python objects and supplies the
# format instructions derived from the InstructionFineTuneDataset schema.
parser = JsonOutputParser(pydantic_object=InstructionFineTuneDataset)

# Prompt with one runtime variable (`chunk`); `format_instructions` is filled
# in once, up front, from the parser.
# The output-format sentence asks for an object with a "data" key rather than
# a bare JSON list, so it agrees with the schema in the format instructions and
# with the downstream `output["data"]` lookup.
prompt = PromptTemplate(
    template="""
You are an AI assistant trained to create instruction fine-tuning datasets.
Given the following text chunk, generate multiple instruction-response pairs suitable for fine-tuning a language model **only if you are certain about the validity of the generated instructions and responses**.

Text Chunk:
{chunk}

Format the output as a JSON object with a single key "data" whose value is a list where each item contains the following fields:
- instruction: Each instruction must be clear, concise, and directly derived from the text chunk.
- input: The input should include the provided text chunk only if necessary—otherwise, leave it empty. Remove unnecessary newlines from the input.
- output: The output must be a well-structured and accurate response to the instruction.

Ensure that the language of the generated instructions and responses matches the language of the input text chunk.

**Format Instructions:**  
{format_instructions}  
""",
    input_variables=["chunk"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)


# Pipeline: fill the prompt, call the LLM, parse the reply as JSON.
chain = prompt | llm | parser


In [7]:
from tqdm import tqdm

from langchain_core.exceptions import OutputParserException

# Generate instruction/response records chunk by chunk.

# Only the first MAX_CHUNKS chunks are sent to the LLM; set to None to process all of them.
MAX_CHUNKS = 5

dataset = []
# (chunk index, reason) for every chunk whose reply could not be used.
failed_chunks = []

for chunk_index, chunk in enumerate(tqdm(chunks[:MAX_CHUNKS])):
    try:
        output = chain.invoke({"chunk": chunk})
    except OutputParserException as exc:
        # The reply was not parseable JSON: skip this chunk instead of
        # aborting the whole run, but keep a record of it.
        failed_chunks.append((chunk_index, str(exc)))
        continue

    # Expected shape is {"data": [...]}; the model may also answer with a
    # bare JSON list, which is accepted as the list of records.
    if isinstance(output, dict):
        records = output.get("data")
    else:
        records = output

    if isinstance(records, list):
        dataset.extend(records)
    else:
        failed_chunks.append(
            (chunk_index, f"unexpected output shape: {type(output).__name__}")
        )

# Surface skipped chunks explicitly so they are not lost silently.
if failed_chunks:
    skipped = [index for index, _reason in failed_chunks]
    print(f"Skipped {len(failed_chunks)} chunk(s) with unusable output: {skipped}")

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:19<00:00,  3.87s/it]


In [8]:
# Number of records accumulated in `dataset` by the generation loop.
len(dataset)

15

In [9]:
import json

# Persist the collected records as indented UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters as-is instead of \u escapes.
with open('../data.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(dataset, ensure_ascii=False, indent=4))