# Create a thousand articles for one topic

In [2]:
!pip install -q -U google-genai python-dotenv

    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio-client 0.7.1 requires websockets<12.0,>=10.0, but you have websockets 15.0.1 which is incompatible.
llama-index-readers-file 0.4.9 requires pandas<2.3.0, but you have pandas 2.3.0 which is incompatible.
selenium 4.33.0 requires urllib3[socks]~=2.4.0, but you have urllib3 1.26.20 which is incompatible.
tensorboard 2.12.3 requires google-auth-oauthlib<1.1,>=0.5, but you have google-auth-oauthlib 1.2.2 which is incompatible.
tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.26.4 which is incompatible.
tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.5 which is incompatible.[0m[31m
[0m

In [48]:
from google import genai
import os
from dotenv import load_dotenv
import json
from tqdm.notebook import tqdm

# Make sure you create a .env.local file and put your GEMINI_API_KEY in there.


In [6]:
# Load variables from the local .env.local file into the process environment.
# The call's return value is the cell output.
load_dotenv(".env.local")

True

In [7]:
# Look up the Gemini API key in the process environment
# (os.getenv returns None when the variable is not set).
gemini_api_key = os.getenv("GEMINI_API_KEY")
# The key is handed to the client explicitly via `api_key`.
client = genai.Client(api_key=gemini_api_key)

# Generate topics

In [9]:
# Read the topic-generation prompt. The encoding is given explicitly so the
# text is decoded the same way on every platform instead of using the
# locale-dependent default.
with open("prompts/topic_generation.txt", "r", encoding="utf-8") as f:
    topic_prompt = f.read()

In [None]:
# Ask the model for the topic list.
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=topic_prompt,
)
topics = response.text
if not topics:
    # Fail here with a clear message rather than later inside json.loads.
    raise ValueError("The model returned an empty response for the topic prompt.")
# `topics` is parsed with json.loads further down. The model can wrap its JSON
# in a ```json ... ``` fence, so keep only the fenced payload when present.
if '```json' in topics:
    topics = topics.split('```json')[1].split('```')[0].strip()

["Model deployment", "Data pipelines", "MLOps principles", "Active learning", "Online learning", "Transfer learning", "Set theory", "Vector space", "Convex optimization", "Causal inference"]


# Save the list of topics

In [16]:
# Parse the model's reply text as JSON; the recorded output above is a JSON
# array of topic names, so this is expected to yield a list of strings.
topics_list = json.loads(topics)

In [17]:
# Write the parsed topic list to disk, reusing `topics_list` instead of
# parsing the raw reply a second time.
topics_dir = "articles/machine_learning"
# open(..., "w") does not create missing directories, so make sure it exists.
os.makedirs(topics_dir, exist_ok=True)
with open(f"{topics_dir}/list_of_topics.json", "w", encoding="utf-8") as f:
    json.dump(topics_list, f, indent=2)

# Generate Articles

In [28]:
# Read the article-creation prompt template; it is later filled in with
# `prompt.format(topic=...)`. The encoding is explicit so decoding does not
# depend on the platform default.
# NOTE(review): this file uses the extension ".text" while the topic prompt
# uses ".txt" — confirm the file name is intentional.
with open("prompts/article_creation.text", "r", encoding="utf-8") as f:
    prompt = f.read()

In [None]:
# Sample of a fenced reply, used to try out the fence-stripping expression.
# The language tag must sit on the same line as the opening backticks
# (```json); otherwise splitting on '```json' finds no match.
tst_str = """```json\n{"topics": topics_list, "num_articles": 1000}\n```"""

In [54]:
# Keep only the text between the opening ```json fence and the next ``` fence.
after_opening_fence = tst_str.split('```json')[1]
after_opening_fence.split('```')[0].strip()

'{"topics": topics_list, "num_articles": 1000}'

In [65]:
def save_article(topic, article, idx):
    """Write one generated article to articles/machine_learning/<title>.json.

    Args:
        topic: Topic name, e.g. "Machine Learning".
        article: JSON-serialisable object holding the article.
        idx: Zero-based position of the topic; only used in the log message.

    The file name is the lower-cased topic with whitespace runs replaced by a
    single underscore ("Machine Learning" -> "machine_learning"). Path
    separators in the topic are replaced too, so a topic such as "A/B testing"
    cannot point into a sub-directory.
    """
    # Lowercase the topic, split on whitespace, and join with underscores.
    title = "_".join(topic.lower().split())
    title = title.replace("/", "_").replace("\\", "_")
    out_dir = "articles/machine_learning"
    # open(..., "w") does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/{title}.json", "w", encoding="utf-8") as f:
        json.dump(article, f, indent=2)
    print(f"Saved article {idx + 1} for topic: {title}")

In [67]:
# Define the batch before displaying it so this cell also works on a fresh
# kernel; the generation loop further down uses the same slice.
shortened_topics = topics_list[0:5]
shortened_topics

['Model deployment',
 'Data pipelines',
 'MLOps principles',
 'Active learning',
 'Online learning']

In [None]:
# A while loop is used instead of a for loop because the LLM can return
# non-JSON responses; when that happens the same topic is generated again.
# Retries are capped so a topic that never yields valid JSON cannot loop
# (and call the API) forever.
MAX_ATTEMPTS_PER_TOPIC = 5


def _parse_article_json(text):
    """Return the article parsed from `text`, or None when it is unusable.

    The text is first parsed as-is. If that fails and it contains a
    ```json ... ``` fence, the fenced payload is parsed instead. A reply that
    is empty, or that parses to JSON `null`, is treated as unusable.
    """
    if not text:
        print("Response is not a valid JSON, redo generation.")
        return None
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        print("Response is not a valid JSON, trying to fix it.")
    if '```json' not in text:
        print("Response is not a valid JSON, redo generation.")
        return None
    # Remove the ```json and ``` fences from the response.
    clean_text = text.split('```json')[1].split('```')[0].strip()
    try:
        return json.loads(clean_text)
    except json.JSONDecodeError:
        print("Still not a valid JSON, redo generation.")
        return None


# This could be long. Reduce the number of topics to generate articles for and do it in batches.
shortened_topics = topics_list[0:5]  # Adjust the range as needed
total = len(shortened_topics)
index = 0
attempts = 0  # generations tried so far for the current topic
pbar = tqdm(total=total)
try:
    # Checking the bound up front also covers an empty topic list.
    while index < total:
        topic = shortened_topics[index]
        final_prompt = prompt.format(topic=topic)
        print(f"Generating article for topic: {topic}")
        article_response = client.models.generate_content(
            model="gemini-2.5-flash", contents=final_prompt,
        )
        attempts += 1
        article_dict = _parse_article_json(article_response.text)
        if article_dict is not None:
            save_article(topic, article_dict, index)
        elif attempts < MAX_ATTEMPTS_PER_TOPIC:
            # Same index on the next pass: regenerate this topic.
            continue
        else:
            print(f"Giving up on topic after {attempts} attempts: {topic}")
        # Topic finished (saved or abandoned): advance by exactly one step.
        index += 1
        attempts = 0
        pbar.update(1)
finally:
    # Close the bar even if the API call or the file write raises.
    pbar.close()

  0%|          | 0/5 [00:00<?, ?it/s]

Generating article for topic: Model deployment
Response is not a valid JSON, trying to fix it.
Still not a valid JSON, redo generation.
Generating article for topic: Model deployment
Response is not a valid JSON, trying to fix it.
Still not a valid JSON, redo generation.
Generating article for topic: Model deployment
Saved article 1 for topic: model_deployment
Generating article for topic: Data pipelines
Response is not a valid JSON, trying to fix it.
Response is not a valid JSON, redo generation.
Generating article for topic: Data pipelines
Saved article 2 for topic: data_pipelines
Generating article for topic: MLOps principles
Saved article 3 for topic: mlops_principles
Generating article for topic: Active learning
Response is not a valid JSON, trying to fix it.
Still not a valid JSON, redo generation.
Generating article for topic: Active learning
Saved article 4 for topic: active_learning
Generating article for topic: Online learning
Saved article 5 for topic: online_learning


# Save all the articles in a json format for further processing later

In [58]:
# Spot-check: display the third entry of the parsed topic list.
topics_list[2]

'MLOps principles'

In [None]:
# NOTE(review): `list_articles` is not defined in any cell visible here —
# confirm where it comes from, and that its order matches `topics_list`.
for idx, article in enumerate(list_articles):
    # File name: the lower-cased topic with each space turned into an
    # underscore, e.g. "Machine Learning" -> "machine_learning".
    title = topics_list[idx].lower().replace(" ", "_")
    article_path = f"articles/machine_learning/{title}.json"
    with open(article_path, "w") as f:
        json.dump(article, f, indent=2)