# Create a thousand articles for one topic

In [None]:
!pip install -q -U google-genai python-dotenv

In [None]:
from google import genai
import os
from dotenv import load_dotenv
import json
from tqdm.notebook import tqdm

# Make sure you create a .env.local file and put your GEMINI_API_KEY in there.


In [None]:
# Read the variables defined in .env.local so os.getenv can see them.
load_dotenv(dotenv_path=".env.local")

In [None]:
# Look up the key in the process environment and pass it to the client
# explicitly.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
client = genai.Client(api_key=gemini_api_key)

# Generate topics

In [None]:
# Load the prompt text that is sent to the model to generate the topics.
# The encoding is explicit so the prompt is decoded the same way on every
# platform instead of depending on the locale default.
with open("prompts/topic_generation.txt", "r", encoding="utf-8") as f:
    topic_prompt = f.read()

In [None]:
# Send the topic prompt to Gemini; the raw text of the reply is kept in
# `topics` and parsed as JSON later.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=topic_prompt,
)
topics = response.text

# Save the list of topics

In [None]:
# The model may wrap its JSON answer in a Markdown ```json fence; remove a
# leading/trailing fence before parsing so such a reply still decodes.
topics_text = topics.strip()
if topics_text.startswith("```json"):
    topics_text = topics_text[len("```json"):]
    if topics_text.endswith("```"):
        topics_text = topics_text[:-len("```")]
topics_list = json.loads(topics_text)

In [None]:
# Persist the already-parsed `topics_list` rather than decoding the raw
# response text a second time.
with open("articles/machine_learning/list_of_topics.json", "w") as f:
    json.dump(topics_list, f, indent=2)

# Generate Articles

In [None]:
# Load the article prompt template; it is filled in per topic with
# prompt.format(topic=...).
# The encoding is explicit so the template is decoded the same way on every
# platform instead of depending on the locale default.
# NOTE(review): this file uses the extension ".text" while the topic prompt
# uses ".txt" - confirm the file name on disk.
with open("prompts/article_creation.text", "r", encoding="utf-8") as f:
    prompt = f.read()

In [None]:
def save_article(topic, article, idx):
    """Write one generated article to ``articles/machine_learning/<title>.json``.

    The file name is derived from the topic: it is lowercased, split on
    spaces and joined with underscores
    (example: "Machine Learning" -> "machine_learning").
    Path separators are replaced with underscores as well, so a topic such as
    "A/B Testing" yields a file name instead of a path into a sub-directory
    that does not exist.

    Args:
        topic: Topic string the article was generated for.
        article: JSON-serialisable article content.
        idx: Zero-based position of the topic; only used in the log message.
    """
    title = "_".join(topic.lower().split(" "))
    # "/" (and "\\" on Windows) would otherwise be treated as a directory
    # separator by open() and make the write fail.
    title = title.replace("/", "_").replace("\\", "_")
    with open(f"articles/machine_learning/{title}.json", "w") as f:
        json.dump(article, f, indent=2)
    print(f"Saved article {idx + 1} for topic: {title}")

In [None]:
# A while loop is used instead of a for loop because the LLM can return
# non-JSON responses: the index only advances once a topic is finished, so a
# failed generation is retried on the next iteration.
JSON_FENCE_OPEN = "```json"
JSON_FENCE_CLOSE = "```"
# Cap on generations per topic, so a topic the model never answers with valid
# JSON cannot keep the loop (and the API usage) running forever.
MAX_ATTEMPTS_PER_TOPIC = 5


def _parse_article_json(text):
    """Decode an LLM response as JSON, tolerating a Markdown ```json fence.

    Args:
        text: Raw response text, or None when the response carried no text.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If there is no text, or the text is not valid JSON even
            after removing a surrounding fence (json.JSONDecodeError is a
            subclass of ValueError).
    """
    if text is None:
        raise ValueError("response contained no text")
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        print("Response is not a valid JSON, trying to fix it.")
    # Only a fence at the very start is stripped: a ```json marker in the
    # middle of the text means there is prose around it that cannot be parsed.
    candidate = text.strip()
    if not candidate.startswith(JSON_FENCE_OPEN):
        raise ValueError("response is not JSON and does not start with a json fence")
    candidate = candidate[len(JSON_FENCE_OPEN):]
    if candidate.endswith(JSON_FENCE_CLOSE):
        candidate = candidate[:-len(JSON_FENCE_CLOSE)]
    return json.loads(candidate.strip())


index = 0
attempts = 0
# This could be long. Reduce the number of topics to generate articles for and do it in batches.
shortened_topics = topics_list[0:5]  # Adjust the range as needed
total = len(shortened_topics)
pbar = tqdm(total=total)
# Checking the bound up front also covers an empty topic list.
while index < total:
    topic = shortened_topics[index]
    final_prompt = prompt.format(topic=topic)
    print(f"Generating article for topic: {topic}")
    article_response = client.models.generate_content(
        model="gemini-2.5-flash", contents=final_prompt,
    )
    attempts += 1
    try:
        article_dict = _parse_article_json(article_response.text)
    except ValueError:
        if attempts < MAX_ATTEMPTS_PER_TOPIC:
            # Leave the index untouched so the same topic is generated again.
            print("Response is not a valid JSON, redo generation.")
            continue
        print(f"Giving up on topic after {attempts} attempts: {topic}")
    else:
        save_article(topic, article_dict, index)
    # The topic is finished (saved or skipped): move on and advance the bar
    # by exactly one step.
    index += 1
    attempts = 0
    pbar.update(1)
pbar.close()