In [2]:
import json

# Load the word lists JSON file.
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform's default (which differs between operating systems).
with open('src/data/word_lists.json', 'r', encoding='utf-8') as file:
    word_lists = json.load(file)

# Extract levels and metadata; a missing key falls back to an empty dict
levels = word_lists.get("levels", {})
metadata = word_lists.get("metadata", {})

In [39]:
import requests
import re
import random

# Ollama API parameters shared by every generation request:
# the text-generation endpoint on the local machine ...
url = "http://localhost:11434/api/generate"
# ... and the header declaring that request bodies are JSON
headers = {"Content-Type": "application/json"}

def generate_sentences(model, word, max_tokens=250, timeout=120):
    """Ask the Ollama model for example sentences that contain ``word``.

    One of ten equivalent prompt phrasings is picked at random and posted to
    the endpoint held in the module-level ``url`` (with ``headers``). The reply
    is then cleaned line by line and filtered down to lines that really
    contain the word.

    Args:
        model: Name of the Ollama model to query (e.g. "openhermes").
        word: Word every returned sentence must contain. It is matched
            case-insensitively and as a whole word; the word is treated as
            literal text, so punctuation inside it is safe.
        max_tokens: Upper bound on generated tokens, sent as Ollama's
            ``num_predict`` option.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of at most five cleaned sentences. The list is empty when the
        request fails, the body is not valid JSON, or no line qualifies.
    """
    # Few-shot prompt
    prompts = [
        f"Role: You are an expert educational sentence creator. Objective: Generate 6 concise example sentences, each on a new line, using the word '{word}' exactly as given, without changing its form. Ensure each sentence is educational and student-friendly, subtly hinting at the word's meaning:\n\nWord: {word}\n1. ",
        f"Write 6 concise example sentences, each on a new line, using the word '{word}' exactly as provided, without altering its form. Make sure each sentence is educational and student-friendly, with hints about the word's meaning:\n\nWord: {word}\n1. ",
        f"You are a teacher. Write 6 concise example sentences, each on a new line, that use the word '{word}' exactly as it is written, without any changes in form. The sentences should be educational and student-friendly, subtly revealing the word's meaning:\n\nWord: {word}\n1. ",
        f"Generate 6 example sentences that each use the word '{word}' in its exact form. The sentences should be concise, educational, and student-friendly, and should hint at the word's meaning without changing its form:\n\nWord: {word}\n1. ",
        f"As a language expert, create 6 concise example sentences that use the word '{word}' verbatim, keeping the word in its exact form. Each sentence should be educational and accessible to students, giving clues to the word's meaning:\n\nWord: {word}\n1. ",
        f"Create 6 concise example sentences using the word '{word}' exactly as provided, without any variations like plurals or different tenses (e.g., '{word}s', '{word}ly'). Each sentence should be educational, student-friendly, and hint at the word's meaning:\n\nWord: {word}\n1. ",
        f"Generate 6 concise sentences where the word '{word}' is used exactly as given, with no changes in spelling, tense, or form. Ensure each sentence is educational and appropriate for students, providing hints to the word's meaning:\n\nWord: {word}\n1. ",
        f"Write 6 concise sentences using the word '{word}' exactly as provided, ensuring the word appears only once in each sentence, without any variations in form. Each sentence should be educational and suitable for students, subtly indicating the word's meaning:\n\nWord: {word}\n1. ",
        f"As a sentence creator, generate 6 example sentences where the word '{word}' is used exactly as given, with no alterations (e.g., no plurals, no derivations). Each sentence should be clear, educational, and student-friendly, with a hint towards the word's meaning:\n\nWord: {word}\n1. ",
        f"Create 6 sentences where the word '{word}' is used exactly as provided, without any changes. Do not use variations like '{word}s' or '{word}ly'. Ensure the sentences are educational, student-friendly, and subtly reveal the meaning of the word:\n\nWord: {word}\n1. "
    ]

    # API request
    payload = {
        "model": model,
        "prompt": random.choice(prompts),
        "stream": False,
        # Ollama reads generation settings from the nested "options" object;
        # its name for the token limit is "num_predict".
        "options": {
            "num_predict": max_tokens,
            "temperature": 0.4,
        },
    }
    try:
        # The timeout keeps a stalled server from blocking the notebook forever
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        # Decoding happens inside the try so a malformed body is handled here too
        body = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Best effort: report the problem and let the caller decide whether to retry
        print(e)
        return []
    response_text = (body.get("response") if isinstance(body, dict) else None) or ""

    # Whole-word, case-insensitive match on the literal word. Lookarounds are
    # used instead of \b so a word that ends in punctuation still matches.
    word_pattern = re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)", re.IGNORECASE)

    valid_sentences = []
    # The first line of the reply is always thrown away
    # (presumably a preamble or a fragment continuing the prompt - TODO confirm)
    for line in response_text.split('\n')[1:]:
        line = line.strip()
        if not line:
            continue
        # Drop a leading list number such as "1. " or "10. "
        line = re.sub(r"^\d+\.\s*", "", line)
        # Drop a leading bullet dash together with the spaces after it, so the
        # quote check below still sees the opening quote
        line = re.sub(r"^-\s*", "", line)
        # `sentence (definition)` -> `sentence`
        line = re.sub(r"^(.*)\s+\(.*\)$", r"\1", line)
        # Unwrap a sentence that is entirely enclosed in double quotes
        line = re.sub(r"^\"(.*)\"$", r"\1", line)
        line = line.strip()
        # Keep only reasonably long lines that actually use the word
        if len(line) >= 10 and word_pattern.search(line):
            valid_sentences.append(line)

    return valid_sentences[:5]  # Return the first 5 valid sentences

def generate_sentences_recursive(model, word, n=5, max_attempts=10):
    """Collect ``n`` sentences for ``word``, calling the model again when short.

    The name is kept for existing callers, but the retries are now a bounded
    loop: when ``generate_sentences`` keeps coming back empty (server down,
    model never uses the word) the function stops after ``max_attempts`` calls
    instead of recursing until Python raises ``RecursionError``.

    Args:
        model: Name of the Ollama model to query.
        word: Word the sentences must contain.
        n: Number of sentences wanted.
        max_attempts: Maximum number of generation calls; at least one call is
            always made.

    Returns:
        A list of up to ``n`` sentences. It is shorter than ``n`` when the
        attempts run out first, so callers should check its length.
    """
    collected = []
    for _ in range(max(1, max_attempts)):
        collected += generate_sentences(model, word)
        if len(collected) >= n:
            break
    return collected[:n]

def generate_sentences_recursive_with_max_loop_count(model, word, n=5, loop=10):
    """Gather up to ``n`` sentences for ``word`` using at most ``loop`` calls.

    ``generate_sentences`` is always called once. Further calls are made only
    while fewer than ``n`` sentences have been gathered and the total number
    of calls is still below ``loop``.

    Returns:
        The first ``n`` gathered sentences; fewer if the calls ran out.
    """
    gathered = generate_sentences(model, word)
    # One call has been spent already; this is how many more are allowed
    calls_left = loop - 1
    while calls_left > 0 and len(gathered) < n:
        gathered.extend(generate_sentences(model, word))
        calls_left -= 1
    return gathered[:n]

In [None]:
# Dictionary to store the generated sentences
output_data = {}

# Number of sentences a word needs before it is accepted; used for the
# request, the check and the warning so the three cannot drift apart
sentences_needed = 5

# Iterate through each level
for level, data in levels.items():
    words = data.get("words", [])
    bonus_words = data.get("bonus", [])

    # Process each word (regular words first, then the bonus words)
    for word in words + bonus_words:
        sentences = generate_sentences_recursive(model="openhermes", word=word, n=sentences_needed)
        if len(sentences) >= sentences_needed:
            output_data[word] = sentences
            print(f"{word}:")
            for sentence in sentences:
                print(f"\t{sentence}")
        else:
            # Words that fall short are reported and left out of output_data
            print(f"Warning: Less than {sentences_needed} valid sentences generated for word '{word}'")


In [13]:
# Save the output data to a new JSON file
# with open('src/data/word_sentences.json', 'w') as outfile:
#     json.dump(output_data, outfile, indent=2)

# print("Sentences generated and saved successfully!")

Sentences generated and saved successfully!


In [57]:
import os
import json

# File path where the output will be saved
file_path = 'src/data/word_sentences.json'

# Load existing data if the file exists, so earlier results are kept
if os.path.exists(file_path):
    # JSON text is UTF-8; don't depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as infile:
        existing_data = json.load(infile)
    print(f"{len(existing_data)} existing entries loaded.")
    print("Existing data loaded successfully!")
else:
    # Nothing on disk yet: start empty and say so, rather than reporting a load
    existing_data = {}
    print(f"No existing data found at '{file_path}'; starting with an empty dataset.")

4926 existing entries loaded.
Existing data loaded successfully!


In [58]:
# Load the word list data

import json

# Load the word lists JSON file.
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform's default (which differs between operating systems).
with open('src/data/word_lists.json', 'r', encoding='utf-8') as file:
    word_lists = json.load(file)

# Extract levels and metadata; a missing key falls back to an empty dict
levels = word_lists.get("levels", {})
metadata = word_lists.get("metadata", {})

In [59]:
import time

start_time = time.time()

# Number of sentences a word needs before it counts as done; used for the
# skip check, the request, the acceptance check and the warning
sentences_needed = 5
# Print a progress line every this many words (1 = every word)
progress_every = 1
# Words attempted in this run across all levels; drives the throughput estimate
total_words_processed = 0

# Iterate through each level
for level, data in levels.items():
    level_start_time = time.time()
    words = data.get("words", [])
    bonus_words = data.get("bonus", [])
    level_words = words + bonus_words

    # For new levels, print the level name
    print(f"\n\n\nProcessing level {level}...\n\n")

    # Process each word
    words_processed = []
    for i, word in enumerate(level_words):

        # Skip words that already have enough sentences from an earlier run
        if len(existing_data.get(word, [])) >= sentences_needed:
            continue

        if (i+1) % progress_every == 0:
            print(f"Processing {word} {i+1}/{len(level_words)}...")

        sentences = generate_sentences_recursive_with_max_loop_count(model="openhermes", word=word, n=sentences_needed, loop=20)
        if len(sentences) >= sentences_needed:
            existing_data[word] = sentences
            print(f"{word}:")
            for sentence in sentences:
                print(f"\t{sentence}")

            # Save right away so an interrupted run keeps everything finished
            # so far. The whole dataset is written to a temporary file that is
            # then swapped in, so stopping the cell mid-write cannot truncate
            # the results already on disk. (os is imported in the cell that
            # defines file_path.)
            temp_path = file_path + '.tmp'
            with open(temp_path, 'w') as outfile:
                json.dump(existing_data, outfile, indent=2)
            os.replace(temp_path, file_path)
        else:
            print(f"Warning: Less than {sentences_needed} valid sentences generated for word '{word}'")

        # Counted as processed whether or not enough sentences came back
        words_processed.append(word)
        total_words_processed += 1

    level_end_time = time.time()
    level_elapsed = level_end_time - level_start_time
    run_elapsed = level_end_time - start_time
    print(f"Level {level} processed {len(words_processed)} in {level_elapsed:.2f} seconds.")
    # max(..., 1) only guards the division when nothing was processed
    print(f"Average time per word: {level_elapsed / max(len(words_processed), 1):.2f} seconds.")
    print(f"Total time elapsed: {run_elapsed:.2f} seconds.")
    # Throughput is based on words handled in this run, not on entries loaded from disk
    print(f"Average time per 200 words: {run_elapsed / max(total_words_processed, 1) * 200 / 60:.2f} minutes.")

print("Processing complete! New sentences appended to the file.")




Processing level 1...


Level 1 processed 0 in 0.00 seconds.
Average time per word: 0.00 seconds.
Total time elapsed: 0.00 seconds.
Average time per 200 words: 0.00 minutes.



Processing level 2...


Level 2 processed 0 in 0.00 seconds.
Average time per word: 0.00 seconds.
Total time elapsed: 0.00 seconds.
Average time per 200 words: 0.00 minutes.



Processing level 3...


Level 3 processed 0 in 0.00 seconds.
Average time per word: 0.00 seconds.
Total time elapsed: 0.00 seconds.
Average time per 200 words: 0.00 minutes.



Processing level 4...


Level 4 processed 0 in 0.00 seconds.
Average time per word: 0.00 seconds.
Total time elapsed: 0.00 seconds.
Average time per 200 words: 0.00 minutes.



Processing level 5...


Level 5 processed 0 in 0.00 seconds.
Average time per word: 0.00 seconds.
Total time elapsed: 0.00 seconds.
Average time per 200 words: 0.00 minutes.



Processing level 6...


Level 6 processed 0 in 0.00 seconds.
Average time per word: 0.00 seconds.
Total time ela