In [1]:
# Import necessary modules and classes
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

from langchain_google_genai import ChatGoogleGenerativeAI

from langsmith import Client
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate

from langchain_core.tools import Tool
from langchain_experimental.utilities import PythonREPL

from pydantic import BaseModel, Field
from typing import List

from langchain_core.output_parsers import JsonOutputParser

In [2]:
# Shared chat model for the whole notebook: it backs both ReAct agents and is also
# passed directly to the outline and PDF-metadata generation functions.
# Parameters:
# - model: the Gemini model identifier to use, here "gemini-2.0-flash".
# - temperature: sampling randomness; lower values make the output more deterministic.
# NOTE(review): credentials are presumably read from the environment populated by
# load_dotenv() in the first cell — confirm which variable the client requires.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2
)

In [3]:
# Initialize the LangSmith client and pull the ReAct prompt template.
# Only the prompt is requested: the LLM is handed to create_react_agent() separately,
# and that helper needs a plain prompt template. Asking the hub to also include a
# stored model configuration may yield a prompt-plus-model sequence instead, so the
# default (prompt only) is used.
client = Client()
prompt = client.pull_prompt("hwchase17/react")

# Python REPL used by the agent to run the code it writes for the CSV dataset and its metadata
python_repl = PythonREPL()

# Expose the REPL to the agent as a tool: name, description (shown to the LLM), and callable
repl_tool_csv = Tool(
    name="python_repl",
    description="A Python shell for generating a structured CSV dataset along with its metadata. Input must be a valid Python command, and all output should be displayed using print(...).",
    func=python_repl.run,
)

# Tool list for the CSV-generation agent
tools_csv = [repl_tool_csv]

# Create the ReAct agent from the LLM, the tools, and the pulled prompt
agent = create_react_agent(llm, tools_csv, prompt)

# Executor that drives the agent loop; output-parsing errors are handed back to the
# agent (handle_parsing_errors=True) instead of aborting the run
agent_executor_csv = AgentExecutor(agent=agent, tools=tools_csv, verbose=False, handle_parsing_errors=True)

In [4]:
# Define the REPL tool used while writing paragraphs. Its description restricts the agent
# to work on the provided CSV dataset and tells it how to handle plots and image output.
# It wraps the same python_repl instance (and uses the same tool name) as the
# CSV-generation tool; only the description differs.
repl_tool_paragraph = Tool(
    name="python_repl",
    description="A Python shell for executing commands strictly related to the provided CSV dataset, such as retrieving statistics, generating plots, and creating summary tables. Input must be a valid Python command, and all output should be displayed using `print(...)`. If the output includes an image file or raw image data, the agent **must not print base64-encoded content or MIME types like `image/png;base64`**—instead, it should print only a textual description of the image. Matplotlib must be properly closed after generating plots to prevent rendering issues. Any operations outside of interacting with the given CSV dataset are not supported.",
    func=python_repl.run,
)

# Add the REPL tool to the list of tools for generating paragraphs
tools_paragraph = [repl_tool_paragraph]

# Create the ReAct agent for paragraph generation from the shared LLM and prompt.
# Note: this rebinds the name `agent` that the CSV-agent cell also assigns.
agent = create_react_agent(llm, tools_paragraph, prompt)
# Executor for the paragraph agent; parsing errors are handed back to the agent rather than raised
agent_executor_paragraph = AgentExecutor(agent=agent, tools=tools_paragraph, verbose=False, handle_parsing_errors=True)

In [5]:
# Define Pydantic models for chapters, subchapters, and PDF metadata
# Schema for a single chapter entry of the document outline (index, title, outline).
# ChapterList holds a list of these. The Field descriptions are presumably surfaced to
# the LLM through the JSON parser's format instructions, so they read as instructions.
class Chapter(BaseModel):
    chapter_index: int = Field(description="The chapter number according to the topic.")
    chapter_title: str = Field(description="The title of the chapter based on the topic in Bahasa Indonesia.")
    chapter_outline: str = Field(description="An outline or key points that need to be written.")

# Schema for the full chapter outline: an overall summary plus the list of chapters.
# Passed to JsonOutputParser in generate_chapter_outline; callers read the parsed
# result with dict-style access ("chapter", "chapter_outline_summary").
class ChapterList(BaseModel):
    chapter_outline_summary: str = Field(description="A summary of all chapter outlines.")
    # Ordered chapter entries; no Field description is attached to this attribute
    chapter: List[Chapter]

# Schema for a single subchapter entry inside one chapter (index, title, outline).
# SubChapterList holds a list of these.
class SubChapter(BaseModel):
    subchapter_index: int = Field(description="The subchapter number within the chapter. Must be an integer.")
    subchapter_title: str = Field(description="The title of the subchapter based on the topic in Bahasa Indonesia.")
    subchapter_outline: str = Field(description="A structured outline or key points to be covered in the subchapter.")

# Schema for the subchapter outline of one chapter. It repeats the parent chapter's
# index, title and outline next to the subchapter list, so the parsed result carries
# the chapter context that generate_paragraph reads ("chapter_index", "chapter_title",
# "subchapter_outline_summary"). Passed to JsonOutputParser in generate_subchapter_outline.
class SubChapterList(BaseModel):
    chapter_index: int = Field(description="The chapter number as an integer.")
    chapter_title: str = Field(description="The title of the chapter in Bahasa Indonesia.")
    chapter_outline: str = Field(description="An outline or key points that need to be written.")
    subchapter_outline_summary: str = Field(description="A summarized overview of all subchapter outlines within the chapter.")
    chapter_outline_summary: str = Field(description="A summary of all chapter outlines.")
    # Ordered subchapter entries; no Field description is attached to this attribute
    subchapter: List[SubChapter]

# Schema for the PDF document properties generated by generate_pdf_metadata.
# Note the singular field name "keyword": it holds a comma-separated string and is the
# value later copied into the PDF's "keywords" metadata entry.
class PdfMetadata(BaseModel):
    title: str = Field(description="The title of the PDF document, extracted from the provided context.")
    author: str = Field(description="The author of the PDF document, as specified in the provided context.")
    subject: str = Field(description="The subject or main topic of the PDF document, derived from the context.")
    keyword: str = Field(description="Keywords associated with the PDF document, extracted from the context. Multiple keywords should be separated by commas.")


In [6]:
def generate_dataset_csv(topic, agent_executor):
    """Ask the agent to produce the dataset CSV and its metadata file for `topic`.

    The topic is substituted into a fixed instruction text, which is sent to the
    executor as a single {"input": ...} request. The instruction tells the agent to
    create synthetic-dataset.csv and metadata-synthetic-dataset.txt.

    Args:
        topic: Subject of the dataset; inserted verbatim into the instruction.
        agent_executor: Object exposing invoke(dict); its result is discarded.

    Returns:
        None.
    """
    prompt_template = """
    TOPIC:  
    {topic}  

    Task Overview:  
    Generate a structured synthetic dataset based on the given topic. The dataset should be coherent, consistent, and realistic while containing no personally identifiable information or real-world factual data.  

    Step 1: Generate a Synthetic CSV Dataset  
    Create a CSV file named synthetic-dataset.csv with the following specifications:  

    - At least 10 distinct and meaningful columns relevant to the topic.
    - 1,000 rows of structured synthetic data.  
    - Daily data entries if applicable.  
    - If the dataset represents time-series data, ensure it is sorted in descending order, with the most recent date first.  
    - Maintain coherence and consistency to ensure realism.  

    Step 2: Create Dataset Metadata  
    Generate a detailed metadata file named metadata-synthetic-dataset.txt that includes:  

    - A comprehensive overview of the dataset.  
    - Clear descriptions for each column, explaining its purpose and meaning.  
    - Potential use cases to highlight practical applications.  
    - Structural insights to enhance understanding of the dataset.  

    Do not mention that the data is synthetic or artificially generated. The dataset should appear as a professionally structured, realistic dataset.

    """

    # Fill in the topic, then hand the finished instruction to the agent
    agent_request = {"input": prompt_template.format(topic=topic)}
    agent_executor.invoke(agent_request)

In [7]:
def generate_chapter_outline(llm, topic, metadata_dataset):
    """Generate the chapter-level outline of the document.

    Builds a prompt from the topic and the dataset metadata, pipes it through the LLM,
    and parses the reply with a JSON parser configured with the ChapterList model.

    Args:
        llm: Chat model placed in the middle of the prompt | llm | parser chain.
        topic: Topic of the document.
        metadata_dataset: Text describing the dataset; the prompt names it as the
            only reference for the outline.

    Returns:
        The parser's output. Callers in this notebook index it like a dict, reading
        "chapter" and "chapter_outline_summary".
    """
    outline_parser = JsonOutputParser(pydantic_object=ChapterList)

    template_text = """
    Goal:
    Create a structured outline for each chapter of a synthetic (non-factual) document with a total length of 50 pages, based on the given topic.

    Content Details:

    The document must have a realistic structure but contain fully fabricated information.

    This outline will serve as the foundation for developing subchapter outlines in the next stage.

    The outline should incorporate a mix of elements such as:

    Narrative text

    Graphs, charts, and diagrams

    Workflows and illustrations

    Document Structure & Length:

    Total document length: 50 pages.

    The number of chapters should be structured proportionally, ensuring each chapter has enough subchapters to meet the total required length.

    Each subchapter in a chapter must include at least:

    700 words

    7 paragraphs

    Technical Instructions:

    Use only the provided "Dataset Metadata" as a reference when constructing the outline.

    Ensure the content remains cohesive and contextually relevant, even though it is entirely synthetic.

    Avoid nonsensical or meaningless content.

    Output Format:
    {format_instructions}

    Dataset Metadata:
    {metadata_dataset}

    Topic:
    {topic}
    """

    # The format instructions are bound up front; topic and metadata arrive at invoke time
    outline_prompt = PromptTemplate(
        template=template_text,
        input_variables=["topic", "metadata_dataset"],
        partial_variables={"format_instructions": outline_parser.get_format_instructions()},
    )

    chain_inputs = {"topic": topic, "metadata_dataset": metadata_dataset}
    return (outline_prompt | llm | outline_parser).invoke(chain_inputs)

In [8]:
def generate_subchapter_outline(llm, topic, metadata_dataset, chapter_output, chapter_outputs):
    """Generate the subchapter outline for one chapter.

    Args:
        llm: Chat model placed in the middle of the prompt | llm | parser chain.
        topic: Topic of the document.
        metadata_dataset: Text describing the dataset; the prompt names it as the
            only reference for the outline.
        chapter_output: Mapping for the current chapter; "chapter_index",
            "chapter_title" and "chapter_outline" are read from it.
        chapter_outputs: Mapping for the whole chapter outline; only
            "chapter_outline_summary" is read from it.

    Returns:
        The parser's output for the SubChapterList model. Callers in this notebook
        index it like a dict (for example "subchapter", "chapter_index").
    """
    subchapter_parser = JsonOutputParser(pydantic_object=SubChapterList)

    template_text = """
    Goal:  
    This prompt aims to generate a structured outline for each subchapter of a synthetic PDF document with a total length of 50 pages. The subchapter outline will later be used as a foundation for writing the final draft paragraphs.  

    You must carefully consider the Chapter Details, Dataset Metadata, and Topic to ensure the outline remains cohesive, contextually relevant, and well-structured, even though the content is entirely synthetic.  

    Content Requirements:  
    - The document must have a realistic structure but contain fabricated (non-factual) information.  
    - The content should include a mix of text, graphs, charts, workflows, and images for a professional and structured presentation.  
    - Ensure the text is cohesive, contextually relevant, and well-organized, simulating real-world data while remaining clearly synthetic.  
    - Avoid meaningless or nonsensical content.  

    Clarification:  
    - 50 pages refer to the total length of the document, not the number of subchapters.  
    - Each subchapter should contain at least 700 words and 7 paragraphs to help ensure the total document reaches 50 pages.  
    - Subchapter indexes must be strictly integers (e.g., 1, 2, 3, etc.). Do not use non-integer values (e.g., 1.1, 2a, etc.).  

    Source of Knowledge:  
    The only reference for generating the outline and content is the provided Dataset Metadata.  

    Chapter Details:  
    - Chapter Index: {chapter_index}  
    - Chapter Title: {chapter_title}  
    - Chapter Outline: {chapter_outline}  
    - Overall Chapter Outline: {chapter_outline_summary}  

    Format Instructions:  
    {format_instructions}  

    Dataset Metadata:  
    {metadata_dataset}  

    Topic:  
    {topic}
    """

    # The format instructions are bound up front; everything else arrives at invoke time
    subchapter_prompt = PromptTemplate(
        template=template_text,
        input_variables=["topic", "metadata_dataset", "chapter_index", "chapter_title", "chapter_outline", "chapter_outline_summary"],
        partial_variables={"format_instructions": subchapter_parser.get_format_instructions()},
    )

    # Chapter-specific fields come from the current chapter; the overall summary
    # comes from the full chapter outline
    chain_inputs = {
        "topic": topic,
        "metadata_dataset": metadata_dataset,
        "chapter_index": chapter_output["chapter_index"],
        "chapter_title": chapter_output["chapter_title"],
        "chapter_outline": chapter_output["chapter_outline"],
        "chapter_outline_summary": chapter_outputs["chapter_outline_summary"],
    }
    return (subchapter_prompt | llm | subchapter_parser).invoke(chain_inputs)

In [9]:
def generate_paragraph(agent_executor, topic, metadata_dataset, subchapter_output, subchapter_outputs):
  """Have the agent write the Markdown content of one subchapter.

  Args:
    agent_executor: Object exposing invoke(dict); receives {"input": <instruction>}.
    topic: Topic of the document.
    metadata_dataset: Text describing the dataset; embedded in the instruction.
    subchapter_output: Mapping for the current subchapter; "subchapter_index",
      "subchapter_title" and "subchapter_outline" are read from it.
    subchapter_outputs: Mapping for the parent chapter's subchapter outline;
      "chapter_index", "chapter_title" and "subchapter_outline_summary" are read.

  Returns:
    Whatever agent_executor.invoke returns (the notebook reads its "output" key).
  """
  instruction_template = """
Goal:  
Generate a detailed and structured subchapter for a 50-page PDF document, ensuring the content is based on the provided chapter details, subchapter details, synthetic dataset, and topic. The output must maintain a realistic format with proper Markdown structuring for headings, tables, and tool-generated images while ensuring the content is coherent, well-organized, and written in Bahasa Indonesia.  

Content Requirements:  
- The document must have a clear structure and include a mix of text, graphs, charts, workflows, and tables for a structured and professional presentation.  
- The writing must be cohesive, contextually relevant, and well-organized, presenting data and insights effectively.  
- Use tools **only if you are certain** and can generate code without errors to ensure effectiveness.  
- Use tools only when necessary to generate graphs or retrieve data from synthetic-dataset.csv.  
- Only include a portion of the synthetic dataset in the content—do not include the entire dataset.  

Graph and Chart Guidelines:  
- Ensure generated graphs are human-readable by:  
  - Using clear titles, labels, and legends.  
  - Formatting axes properly for better comprehension.  
  - Using appropriate colors and contrast.  
  - Adding grid lines if necessary.  
- Ensure that date and time axes are properly formatted and clearly visible by:  
  - Using rotated labels (e.g., 45° or 90°) to prevent overlap.  
  - Formatting dates in an easy-to-read manner (e.g., YYYY-MM-DD or HH:MM).  
  - Ensuring adequate spacing between labels.  
  - Using grid lines and tick marks to improve readability.  
  - Adjusting the axis limit to avoid cluttered or compressed labels.  
  - Applying Matplotlib’s auto-formatting for time-based data using fig.autofmt_xdate().  

Table Guidelines:  
- Tables should be used where appropriate to clearly present structured data.  
- Only include a relevant portion of the dataset—do not display the entire synthetic dataset.  
- Ensure that all generated tables are fully formatted and do not contain unfinished or incomplete structures.  
- To prevent truncation when exporting to PDF, tables should have a maximum of **5 columns per table**. If necessary, split larger tables into multiple smaller tables to maintain readability.  

Image and File Handling:  
- Do not include raw images or base64-encoded images in the output.  
- Only include images that are properly saved as PNG files in the /images folder and referenced correctly in Markdown.  

Language and Structure:  
- The output must be written in Bahasa Indonesia while maintaining the correct structure.  

Minimum Requirements:  
- At least 5 paragraphs with a minimum of 400 words.  
- Graphs (if needed):  
  - Generate using Matplotlib **only if you are certain the code will run without errors**.  
  - Ensure they are human-readable and properly formatted.  
  - Save as PNG files in the /images folder.  
  - Embed in Markdown using:  
  ![alt text](images/image.png "Title")  
  - Do not insert images as raw files or base64-encoded data.  
- Tables (if needed):  
  - Properly format them in Markdown, ensuring completeness. Example:  
  | Column 1 | Column 2 | Column 3 | Column 4 | Column 5 |  
  |----------|----------|----------|----------|----------|  
  | Data 1   | Data 2   | Data 3   | Data 4   | Data 5   |  
  | Data 6   | Data 7   | Data 8   | Data 9   | Data 10  |  
  - Ensure tables **do not exceed 5 columns per table** to prevent truncation in PDF export.  
  - If necessary, split wider tables into multiple smaller tables.  
- Tables and images must be properly embedded within the content without being left incomplete or improperly structured.  

Format and Output:  
- Markdown should only be applied to headings, tables, and correctly stored images.  
- Do not start the output with "Markdown".  
- Subchapter Title Format:  
  ### {chapter_index}.{subchapter_index}. {subchapter_title}  
- Chapter Title Handling:  
  - If the subchapter is the first in the chapter (subchapter_index = 1), prepend the chapter title in the following format:  
  ## Bab {chapter_index}. {chapter_title}  
  - If the subchapter is not the first one, do not include the chapter title in the output.  
- The main body of the text should remain plain text in Bahasa Indonesia.  

Source of Knowledge:  
The content must be based only on the following sources:  
1. Synthetic Dataset (synthetic-dataset.csv) – A dataset used for generating structured information in the document. Only a portion of this dataset should be included in the content.  
2. Synthetic Dataset Metadata ({metadata_dataset}) – Metadata that describes the structure, variables, and attributes of the dataset.  

Subchapter Details:  
- Subchapter Index: {subchapter_index}  
- Subchapter Title: {subchapter_title}  
- Subchapter Outline: {subchapter_outline}  

Chapter Context:  
- Chapter Index: {chapter_index}  
- Chapter Title: {chapter_title} (Only included when subchapter_index = 1)  
- Summarized Overview of Subchapter Outlines: {subchapter_outline_summary}  
- Topic: {topic}  
  """

  # Gather the placeholder values: subchapter fields from the current entry,
  # chapter-level fields from the parent outline
  placeholder_values = {
    "topic": topic,
    "metadata_dataset": metadata_dataset,
    "subchapter_index": subchapter_output["subchapter_index"],
    "subchapter_title": subchapter_output["subchapter_title"],
    "subchapter_outline": subchapter_output["subchapter_outline"],
    "chapter_index": subchapter_outputs["chapter_index"],
    "chapter_title": subchapter_outputs["chapter_title"],
    "subchapter_outline_summary": subchapter_outputs["subchapter_outline_summary"],
  }
  filled_instruction = instruction_template.format(**placeholder_values)

  # Run the agent on the finished instruction and pass its result straight back
  return agent_executor.invoke({"input": filled_instruction})

In [10]:
def generate_pdf_metadata(llm, chapter_outputs):
    """Generate title, author, subject and keywords for the PDF from the chapter outline.

    Args:
        llm: Chat model placed in the middle of the prompt | llm | parser chain.
        chapter_outputs: Chapter outline inserted into the prompt as context.

    Returns:
        The parser's output for the PdfMetadata model. The notebook reads "title",
        "author", "subject" and "keyword" from it.
    """
    metadata_parser = JsonOutputParser(pydantic_object=PdfMetadata)

    template_text = """
Based on the provided context, generate the title, author, subject, and keywords for the PDF document metadata in Bahasa Indonesia.
Ensure the metadata is relevant to the content without indicating that it is fictional.

Format Instructions:  
{format_instructions}  

Context:
{chapter_outputs}
    """

    # The format instructions are bound up front; the outline arrives at invoke time
    metadata_prompt = PromptTemplate(
        template=template_text,
        input_variables=["chapter_outputs"],
        partial_variables={"format_instructions": metadata_parser.get_format_instructions()},
    )

    return (metadata_prompt | llm | metadata_parser).invoke({"chapter_outputs": chapter_outputs})

In [11]:
# Define the topic that drives every later generation step, then let the CSV agent build
# the dataset. The call returns nothing; its effect is the work the agent performs. The
# instruction asks it to create synthetic-dataset.csv and metadata-synthetic-dataset.txt,
# which later cells read from the working directory.
topic = "Wakanda Country Statistical Report - The Richest Country Nobody Knows About"
generate_dataset_csv(topic, agent_executor_csv)

Python REPL can execute arbitrary code. Use with caution.


In [12]:
import pandas as pd


def _sql_identifier(name):
    """Return `name` as a SQL identifier, double-quoting it only when it is not a plain word."""
    text = str(name)
    if text.isidentifier():
        # Plain names are emitted unchanged
        return text
    # Names with spaces, punctuation, etc. are quoted; embedded quotes are doubled
    return '"' + text.replace('"', '""') + '"'


def _sql_literal(value):
    """Render one cell as a SQL literal: NULL for missing values, else a quoted string."""
    if pd.isna(value):
        return "NULL"
    # Double any single quote so values such as "King's Guard" stay valid SQL
    return "'" + str(value).replace("'", "''") + "'"


# Load CSV file into DataFrame
csv_file = "synthetic-dataset.csv"  # Change this to your actual CSV file path
table_name = "my_table"  # Change this to your desired table name

df = pd.read_csv(csv_file)

# Generate SQL statements: one CREATE TABLE (all columns TEXT) plus one INSERT per row
sql_file = "../synthetic-dataset.sql"
with open(sql_file, "w", encoding="utf-8") as f:
    column_defs = ",\n".join(f"    {_sql_identifier(col)} TEXT" for col in df.columns)
    f.write(f"CREATE TABLE {table_name} (\n{column_defs}\n);\n")

    # itertuples keeps each column's own type (iterrows can upcast ints to floats)
    for row in df.itertuples(index=False, name=None):
        values = ", ".join(_sql_literal(value) for value in row)
        f.write(f"INSERT INTO {table_name} VALUES ({values});\n")

print(f"SQL file generated: {sql_file}")

SQL file generated: ../synthetic-dataset.sql


In [13]:
# Read the dataset metadata text into 'metadata_dataset'; it is passed to every
# outline and paragraph generation call below as the description of the dataset.
# NOTE(review): no encoding is given, so the platform default is used — confirm it
# matches how the metadata file was written.
with open('metadata-synthetic-dataset.txt', 'r') as file:
    metadata_dataset = file.read()

In [14]:
# Build the chapter-level outline from the topic and the dataset metadata
chapter_outputs = generate_chapter_outline(llm, topic, metadata_dataset)

# List every chapter as "<index> <title>" for a quick visual check of the outline
for chapter in chapter_outputs["chapter"]:
    print(str(chapter["chapter_index"]) + " " + chapter["chapter_title"])

1 Pendahuluan: Wakanda yang Tersembunyi - Gambaran Statistik
2 Ekonomi Wakanda: Analisis PDB dan Produksi Vibranium
3 Investasi Sosial: Pendidikan dan Kesehatan di Wakanda
4 Kemajuan Teknologi: Investasi dan Dampak di Wakanda
5 Manajemen Sumber Daya: Produksi Energi dan Keberlanjutan di Wakanda
6 Tantangan dan Peluang: Masa Depan Statistik Wakanda


In [15]:
# Generated Markdown, one entry per subchapter, kept in document order
markdown_content_list = []

# Walk the outline chapter by chapter
for chapter in chapter_outputs["chapter"]:
    # Outline the subchapters of this chapter before writing any of them
    subchapter_outputs = generate_subchapter_outline(llm, topic, metadata_dataset, chapter, chapter_outputs)

    # Progress line for the chapter: "<index> <title>"
    print(str(chapter["chapter_index"]) + " " + chapter["chapter_title"])

    for subchapter in subchapter_outputs["subchapter"]:
        # Let the paragraph agent write this subchapter
        paragraph_result = generate_paragraph(agent_executor_paragraph, topic, metadata_dataset, subchapter, subchapter_outputs)

        # Progress line for the subchapter, indented under its chapter
        print("\t" + str(subchapter["subchapter_index"]) + " " + subchapter["subchapter_title"])

        # Keep only the agent's final text
        markdown_content_list.append(paragraph_result["output"])

1 Pendahuluan: Wakanda yang Tersembunyi - Gambaran Statistik
	1 Wakanda: Anomali Geopolitik dan Statistik
	2 Tinjauan Ekonomi Wakanda: PDB dan Dominasi Vibranium
	3 Indikator Pembangunan Sosial: Pendidikan dan Kesehatan di Wakanda
	4 Kemajuan Teknologi: Dampak Vibranium pada Inovasi Wakanda
	5 Metodologi dan Sumber Data: Memahami Statistik Wakanda
2 Ekonomi Wakanda: Analisis PDB dan Produksi Vibranium
	1 Tinjauan Umum PDB Wakanda: Pertumbuhan dan Tren Historis
	2 Peran Produksi Vibranium dalam Ekonomi Wakanda
	3 Distribusi Kekayaan dan Tingkat Kemiskinan di Wakanda
	4 Perbandingan Ekonomi Wakanda dengan Negara Lain
	5 Model Ekonomi Hipotetis: Dampak Vibranium pada Ekonomi Wakanda
	6 Investasi Asing Langsung (FDI) dan Pengaruhnya terhadap PDB Wakanda
	7 Diversifikasi Ekonomi Wakanda: Menuju Keberlanjutan di Luar Vibranium
3 Investasi Sosial: Pendidikan dan Kesehatan di Wakanda
	1 Sistem Pendidikan Wakanda: Struktur dan Investasi
	2 Sistem Kesehatan Wakanda: Akses dan Kualitas
	3 Dispari

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


	2 Tren Investasi Teknologi Saat Ini di Wakanda
	3 Dampak Teknologi pada Sektor Manufaktur Wakanda
	4 Peran Teknologi dalam Memajukan Pertanian Wakanda
	5 Teknologi dalam Produksi dan Distribusi Energi Wakanda
	6 Pertimbangan Etis dalam Kemajuan Teknologi di Wakanda
	7 Peta Jalan Teknologi untuk Masa Depan Wakanda
5 Manajemen Sumber Daya: Produksi Energi dan Keberlanjutan di Wakanda
	1 Tren Historis Produksi Energi di Wakanda
	2 Sumber Energi Wakanda: Diversifikasi dan Inovasi
	3 Praktik Keberlanjutan: Inisiatif dan Kebijakan Wakanda
	4 Dampak Lingkungan dari Manajemen Sumber Daya


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


	5 Peran Vibranium dalam Produksi Energi Berkelanjutan
	6 Analisis Tren Produksi Energi: Grafik dan Bagan
	7 Rencana Keberlanjutan Wakanda: Visi untuk Masa Depan
6 Tantangan dan Peluang: Masa Depan Statistik Wakanda
	1 Analisis Risiko Ekonomi Wakanda
	2 Peluang Pengembangan Sosial dan Kesejahteraan Warga
	3 Manajemen Sumber Daya Vibranium yang Berkelanjutan
	4 Peran Data dan Statistik dalam Kebijakan Publik
	5 Skenario Masa Depan Wakanda: Analisis Hipotesis


In [16]:
import re

# Matches a line made up solely of a code fence, optionally followed by a language tag
# (e.g. "```markdown"). Removing the whole line keeps the tag from being left behind
# as stray text once the backticks are gone.
_FENCE_LINE = re.compile(r"^[ \t]*```[\w+-]*[ \t]*$", flags=re.MULTILINE)

# Combine all markdown content into a single string and drop code block markers:
# whole fence lines first (including their language tag), then any remaining ```
markdown_content = _FENCE_LINE.sub("", "\n".join(markdown_content_list)).replace("```", "")

In [17]:
# Write the combined markdown content to "synthetic-dataset.md" in the working directory
# (UTF-8, overwriting any existing file)
with open("synthetic-dataset.md", "w", encoding="utf-8") as file:
    file.write(markdown_content)

In [18]:
# Generate the PDF metadata (title, author, subject, keyword) from the chapter outline
# using the LLM; the result is read with dict-style access when the PDF is built
pdf_metadata = generate_pdf_metadata(llm, chapter_outputs)

In [19]:
from markdown_pdf import MarkdownPdf, Section

# Create the PDF document and copy the generated metadata into its properties.
# Each pair is (PDF property, key in pdf_metadata); note "keywords" is fed from "keyword".
pdf = MarkdownPdf()
for meta_key, source_key in (
    ("title", "title"),
    ("author", "author"),
    ("subject", "subject"),
    ("keywords", "keyword"),
):
    pdf.meta[meta_key] = pdf_metadata[source_key]

# First section: title and author (added without custom CSS)
title_author_section = Section(f"# {pdf_metadata['title']}\n\n**Author:** {pdf_metadata['author']}\n\n")
pdf.add_section(title_author_section)

# CSS for the main content: justified text, centered and bordered tables
css = """
body { text-align: justify; }
table { border: 1px solid black; border-collapse: collapse; margin: auto; }
th, td { border: 1px solid black; padding: 5px; text-align: center; }
"""

# Second section: the generated document body, styled with the CSS above
pdf.add_section(Section(markdown_content), user_css=css)

# Write the PDF next to the notebook's parent directory
pdf.save("../synthetics-dataset.pdf")
