In [18]:
import ollama
import json
import re
from tqdm import tqdm

In [2]:
# Ollama model tag passed as `model=` to every `ollama.generate` call below.
model_name = "mistral-nemo:latest"

In [3]:
# Prompt template for article generation; the generation loop fills it in with
# `.format(topic=...)`. Read it explicitly as UTF-8 so the text does not depend
# on the platform's default locale encoding.
with open("prompts/article_creation.txt", "r", encoding="utf-8") as f:
    article_creation_prompt = f.read()

In [7]:
# Topics to generate articles for; the generation loop indexes this by position.
# Decode explicitly as UTF-8 rather than relying on the platform's locale default.
with open(
    "articles/machine_learning/sanitize_topics_dict_last_formatted.json",
    "r",
    encoding="utf-8",
) as json_file:
    topics_list = json.load(json_file)

In [22]:
def save_article(topic, article, idx):
    """Write one generated article to ``articles/machine_learning/<slug>.json``.

    The slug is the topic lowercased, with every run of non-alphanumeric
    characters collapsed into a single underscore and no leading or trailing
    underscore, e.g. "Machine Learning" -> "machine_learning" and
    "K-Means (Clustering)" -> "k_means_clustering".

    Args:
        topic: Topic title the file name is derived from.
        article: JSON-serialisable object to store.
        idx: Zero-based position of the topic; used in the log line and as a
            fallback file name when the topic has no alphanumeric characters.
    """
    # replace non alphanumeric characters with a space
    topic_alphanumeric = re.sub(r'[^a-zA-Z0-9]', ' ', topic)
    # lowercase the topic and split on whitespace; split() with no argument
    # drops the empty strings that split(" ") yields for consecutive, leading
    # or trailing spaces, which previously produced names like "k_means__clustering_"
    words = topic_alphanumeric.lower().split()
    # an empty slug would create the hidden file ".json", so fall back to the index
    title = "_".join(words) or f"untitled_{idx + 1}"
    with open(f"articles/machine_learning/{title}.json", "w", encoding="utf-8") as f:
        json.dump(article, f, indent=2)
    print(f"Saved article {idx + 1} for topic: {title}")

In [None]:
# Upper bound on generations per topic. Without it, a topic the model never
# renders as valid JSON would keep this cell running forever.
MAX_ATTEMPTS_PER_TOPIC = 5


def _parse_article_response(article_text):
    """Decode the model output as JSON, tolerating a Markdown ```json fence.

    The raw text is tried first. If that fails and the text contains a
    "```json" fence, the content between that opening fence and the last
    closing "```" is tried instead, so any prose before or after the fence
    is ignored.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: if no valid JSON could be recovered
            (json.JSONDecodeError is a subclass of ValueError).
    """
    try:
        return json.loads(article_text)
    except json.JSONDecodeError:
        print("Response is not valid JSON, trying to fix it.")

    opening_fence = "```json"
    fence_start = article_text.find(opening_fence)
    if fence_start == -1:
        print("Response is not in markdown.")
        raise ValueError("response contains neither valid JSON nor a ```json fence")

    body_start = fence_start + len(opening_fence)
    # Use the LAST fence as the closing one so backticks inside the JSON
    # strings (e.g. code samples in the article) do not truncate the payload.
    body_end = article_text.rfind("```")
    if body_end < body_start:
        # No closing fence: the output was probably cut off; try what we have.
        body_end = len(article_text)

    try:
        return json.loads(article_text[body_start:body_end].strip())
    except json.JSONDecodeError:
        print("Still not a valid JSON, redo generation.")
        raise


shortened_topics = topics_list  # should run this in batches
total = len(shortened_topics)
failed_topics = []  # topics abandoned after MAX_ATTEMPTS_PER_TOPIC generations
index = 0
attempts = 0  # generations made so far for the current topic
pbar = tqdm(total=total)
try:
    # Checking the bound before indexing also makes an empty topic list a no-op.
    while index < total:
        topic = shortened_topics[index]
        final_creation_prompt = article_creation_prompt.format(topic=topic)
        print(f"Generating article for topic: {topic}")
        response = ollama.generate(model=model_name, prompt=final_creation_prompt)
        attempts += 1
        try:
            article_dict = _parse_article_response(response['response'])
        except ValueError:
            if attempts < MAX_ATTEMPTS_PER_TOPIC:
                continue  # regenerate the same topic
            print(f"Giving up on topic after {attempts} attempts: {topic}")
            failed_topics.append(topic)
        else:
            save_article(topic, article_dict, index)
        # The topic is finished (saved or abandoned): move on to the next one.
        index += 1
        attempts = 0
        pbar.update(1)  # update by 1 for each processed topic
finally:
    # Close the bar even if generation or saving raises.
    pbar.close()

if failed_topics:
    print(f"Topics without a saved article: {failed_topics}")

