In [1]:
import json
import os
from os.path import basename, splitext
import asyncio
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
import openai
import instructor
from tqdm.auto import tqdm
from tqdm.asyncio import tqdm as atqdm
from typing import List, Literal, Optional

import sys

if "../../src" not in sys.path:
    sys.path.append("../../src")

from api.utils.concurrency import async_batch_gather, async_index_wrapper

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["GEMINI_API_KEY"] = "AIzaSyChUY3sq8aEDCAE5nM0hB-7x-3VvmsBPg8"
os.environ["OPENAI_API_KEY"] = (
    "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"
)

In [3]:
gemini_client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY"),
)

In [4]:
openai_client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [5]:
# file = gemini_client.files.upload(file="WorkSmart - HV March 2025 For Sharing.pdf")
# file = gemini_client.files.upload(file="A new paradigm for setting objectives.pdf")
# file = gemini_client.files.upload(file="PromptEngineering.pdf")

# openai
file = openai_client.files.create(
    file=open("A new paradigm for setting objectives.pdf", "rb"), purpose="user_data"
)

In [23]:
class BlockProps(BaseModel):
    level: Optional[Literal[2, 3]] = Field(description="The level of a heading block")
    checked: Optional[bool] = Field(
        description="Whether the block is checked (for a checkListItem block)"
    )
    language: Optional[str] = Field(
        description="The language of the code block (for a codeBlock block); always the full name of the language in lowercase (e.g. python, javascript, sql, html, css, etc.)"
    )


class BlockContentStyle(BaseModel):
    bold: Optional[bool] = Field(description="Whether the text is bold")
    italic: Optional[bool] = Field(description="Whether the text is italic")
    underline: Optional[bool] = Field(description="Whether the text is underlined")


class BlockContent(BaseModel):
    text: str = Field(
        description="The text of the block; if the block is a code block, this should contain the code with newlines and tabs as appropriate"
    )
    styles: BlockContentStyle | dict = Field(
        default={}, description="The styles of the block content"
    )


class Block(BaseModel):
    type: Literal[
        "heading",
        "paragraph",
        "bulletListItem",
        "numberedListItem",
        "codeBlock",
        "checkListItem",
    ] = Field(description="The type of block")
    props: Optional[BlockProps | dict] = Field(
        default={}, description="The properties of the block"
    )
    content: Optional[List[BlockContent]] = Field(
        description="The content of the block"
    )


class LearningMaterial(BaseModel):
    blocks: List[Block] = Field(
        description="The content of the learning material as blocks"
    )


class Criterion(BaseModel):
    name: str = Field(
        description="The name of the criterion (e.g. grammar, relevance, clarity, confidence, pronunciation, brevity, etc.), keep it to 1-2 words unless absolutely necessary to extend beyond that"
    )
    description: str = Field(
        description="The description/rubric for how to assess this criterion - the more detailed it is, the better the evaluation will be, but avoid making it unnecessarily big - only as descriptive as it needs to be but nothing more"
    )
    min_score: int = Field(
        description="The minimum score possible to achieve for this criterion (e.g. 0)"
    )
    max_score: int = Field(
        description="The maximum score possible to achieve for this criterion (e.g. 5)"
    )


class Scorecard(BaseModel):
    title: str = Field(
        description="what does the scorecard assess (e.g. written communication, interviewing skills, product pitch, etc.)"
    )
    criteria: List[Criterion] = Field(
        description="The list of criteria for the scorecard."
    )


class Question(BaseModel):
    question_type: Literal["objective", "subjective", "coding"] = Field(
        description='The type of question; "objective" means that the question has a fixed correct answer and the learner\'s response must precisely match it. "subjective" means that the question is subjective, with no fixed correct answer. "coding" - a specific type of "objective" question for programming questions that require one to write code.'
    )
    answer_type: Optional[Literal["text", "audio"]] = Field(
        description='The type of answer; "text" means the student has to submit textual answer where "audio" means student has to submit audio answer. Ignore this field for questionType = "coding".',
    )
    coding_languages: Optional[
        List[Literal["HTML", "CSS", "JS", "Python", "React", "Node", "SQL"]]
    ] = Field(
        description='The languages that a student need to submit their code in for questionType=coding. It is a list because a student might have to submit their code in multiple languages as well (e.g. HTML, CSS, JS). This should only be included for questionType = "coding".',
    )
    blocks: List[Block] = Field(
        description="The actual question details as individual blocks. Every part of the question should be included here. Do not assume that there is another field to capture different parts of the question. This is the only field that should be used to capture the question details. This means that if the question is an MCQ, all the options should be included here and not in another field. Extend the same idea to other question types."
    )
    correct_answer: Optional[List[Block]] = Field(
        description='The actual correct answer to compare a student\'s response with. Ignore this field for questionType = "subjective".',
    )
    scorecard: Optional[Scorecard] = Field(
        description='The scorecard for subjective questions. Ignore this field for questionType = "objective" or "coding".',
    )
    context: List[Block] = Field(
        description="A short text that is not the question itself. This is used to add instructions for how the student should be given feedback or the overall purpose of that question. It can also include the raw content from the reference material to be used for giving feedback to the student that may not be present in the question content (hidden from the student) but is critical for providing good feedback."
    )


class Quiz(BaseModel):
    questions: List[Question] = Field(description="A list of questions for the quiz")


class Output(BaseModel):
    task: LearningMaterial | Quiz = Field(description="The task to be generated")

In [39]:
def get_system_prompt_for_task_generation(task_type):
    schema = (
        LearningMaterial.model_json_schema()
        if task_type == "learning_material"
        else Quiz.model_json_schema()
    )

    quiz_prompt = """Each quiz/exam contains multiple questions for testing the understanding of the learner on the actual concept.

Important Instructions for Quiz Generation:
- For a quiz, each question must add a strong positive value to the overall learner's understanding. Do not unnecessarily add questions simply to increase the number of questions. If a quiz merits only a single question based on the reference material provided or your asseessment of how many questions are necessary for it, keep a single question itself. Only add multiple questions when the quiz merits so. 
- The `content` for each question is the only part of the question shown directly to the student. Add everything that the student needs to know to answer the question inside the `content` field for that question. Do not add anything there that should not be shown to the student (e.g. what is the correct answer). To add instructions for how the student should be given feedback or the overall purpose of that question or raw content from the reference material required as context to give adequate feedback, add it to the `context` field instead. 
- While testing theoretical understanding is important, a quiz should go beyond that and produce practical challenges for the students to apply what they have learnt. If the reference material already has examples/sample problems, include them in the a quiz for the students to practice. If no examples are present in the reference material, generate a few relevant problem statements to test the real-world understanding of each concept for the students.
- If a question references a set of options that must be shown to the student, always make sure that those options are actually present in the `content` field for that question. THIS IS SUPER IMPORTANT. As mentioned before, if the reference material does not have the options or data required for the question, generate it based on your understanding of the question and its purpose.
- Use appropriate formatting for the `blocks` in each question. Make use of all the block types available to you to make the content of each question as engaging and readable as possible.
- Do not use the name of the quiz as a heading to mark the start of a question in the `blocks` field for each question. The name of the quiz will already be visible to the student."""

    learning_material_prompt = """A learning material is used for learning about a specific concept. 
    
Make the \"content\" field in learning material contain as much detail as present in the reference material relevant to it. Do not try to summarise it or skip any point.

Use appropriate formatting for the `blocks` in the learning material. Make use of all the block types available to you to make the content as engaging and readable as possible.

Do not use the name of the learning material as a heading to mark the start of the learning material in the `blocks`.  The name of the learning material will already be visible to the student."""

    task_type_prompt = quiz_prompt if task_type == "quiz" else learning_material_prompt

    system_prompt = f"""You are an expert course creator. The user will give you an outline for a concept in a course they are creating along with the reference material to be used as the source for the course content and the name of one of the tasks from the outline.

You need to generate the content for the single task whose name is provided to you out of all the tasks in the outline. The outline contains the name of a concept in the course, its description and a list of tasks in that concept. Each task can be either a learning material, quiz or exam. You are given this outline so that you can clearly identify what part of the reference material should be used for generating the specific task you need to generate and for you to also understand what should not be included in your generated task. For each task, you have been given a description about what should be included in that task. 

{task_type_prompt}

The final output should be a JSON in the following format:

{schema}"""

    return system_prompt

In [40]:
async def generate_task(
    task_generation_prompt: str,
    task_type: str,
    reference_file,
    # model: str = "gemini-2.0-flash",
    model: str = "gpt-4o-2024-11-20",
):
    # client = genai.Client(
    #     api_key=os.environ.get("GEMINI_API_KEY"),
    # )

    client = instructor.from_openai(openai_client)

    # contents = [
    #     types.Content(
    #         role="user",
    #         parts=[
    #             types.Part.from_uri(
    #                 file_uri=reference_file.uri,
    #                 mime_type=reference_file.mime_type,
    #             ),
    #             types.Part.from_text(text=task_generation_prompt),
    #         ],
    #     ),
    # ]
    # generate_content_config = types.GenerateContentConfig(
    #     response_mime_type="application/json",
    #     system_instruction=get_system_prompt_for_task_generation(task_type),
    #     response_schema=Output,
    # )

    # return await client.aio.models.generate_content(
    #     model=model, contents=contents, config=generate_content_config
    # )

    messages = [
        {"role": "system", "content": get_system_prompt_for_task_generation(task_type)},
        {
            "role": "user",
            "content": [
                {
                    "type": "file",
                    "file": {
                        "file_id": file.id,
                    },
                },
            ],
        },
        # separate into 2 user messages for prompt caching to work
        {"role": "user", "content": task_generation_prompt},
    ]

    response_model = LearningMaterial if task_type == "learning_material" else Quiz
    return client.chat.completions.create(
        model=model,
        messages=messages,
        response_model=response_model,
        max_completion_tokens=16000,
        store=True,
    )

    # for chunk in client.models.generate_content(
    #     model=model,
    #     contents=contents,
    #     config=generate_content_config,
    # ):
    #     print(chunk.text, end="")

In [41]:
path = "goal_setting_v2.json"
course = json.load(open(path, "r"))
# course = json.load(open("prompt_engineering.json", "r"))

In [42]:
basename(path)

'goal_setting_v2.json'

In [43]:
dirname = splitext(basename(path))[0]
os.makedirs(dirname, exist_ok=True)

In [44]:
async def generate_and_save_task(
    task_generation_prompt,
    task_type,
    reference_file,
    index,
):
    response = await generate_task(
        task_generation_prompt,
        task_type,
        reference_file,
    )
    with open(f"{dirname}/{index}.json", "w") as f:
        f.write(
            json.dumps(
                response.model_dump(exclude_none=True, exclude_unset=True), indent=4
            )
        )

In [45]:
coroutines = []
current_index = 0
# learning_material_indices = []

for module in course["modules"]:
    for concept in module["concepts"]:
        for task in concept["tasks"]:
            generation_prompt = f"""Concept details:

{concept}

Task to generate:

{task['name']}"""
            # if task["type"] == "learning_material":
            #     learning_material_indices.append(current_index)

            coroutines.append(
                async_index_wrapper(
                    generate_and_save_task,
                    current_index,
                    generation_prompt,
                    task["type"],
                    file,
                    current_index,
                )
            )
            current_index += 1

  coroutines = []


In [46]:
len(coroutines)

50

In [47]:
responses = []

In [48]:
# TODO: make a plan in the first step for each task - break it down into steps - get LLM to generate each step in the plan in parallel as each step should be independent of the others

In [49]:
responses = await async_batch_gather(coroutines, description="Generating tasks...")

Generating tasks... 0-10/50: 100%|██████████| 10/10 [05:15<00:00, 31.54s/it]
Generating tasks... 10-20/50: 100%|██████████| 10/10 [04:41<00:00, 28.12s/it]
Generating tasks... 20-30/50: 100%|██████████| 10/10 [12:37<00:00, 75.78s/it]  
Generating tasks... 30-40/50: 100%|██████████| 10/10 [05:19<00:00, 31.96s/it]
Generating tasks... 40-50/50: 100%|██████████| 10/10 [04:29<00:00, 26.95s/it]


In [35]:
responses[0].task.blocks

AttributeError: 'tuple' object has no attribute 'task'

In [120]:
# for i, coroutine in tqdm(enumerate(coroutines), total=len(coroutines)):
#     # TMP
#     # if i in learning_material_indices:
#     #     continue

#     response = await coroutine

#     # TMP
#     # responses[i] = response
#     responses.append(response)

#     if i == 1:
#         break

  2%|▏         | 1/50 [01:04<52:28, 64.25s/it]


In [80]:
len(coroutines)

50

In [121]:
len(responses)

2

In [127]:
responses[0].task.blocks

[Block(type='heading', props=BlockProps(level='1', checked=None, language=''), content=BlockContent(text='Exploring Self-Identity', styles=BlockContentStyle(bold=True, italic=False, underline=False))),
 Block(type='paragraph', props=None, content=BlockContent(text='Understanding your self-identity is essential for personal growth and transformation. Self-identity is the foundation upon which you set goals and take action. Reflecting on your strengths, values, and passions helps you gain the clarity necessary to define both your current self and who you aspire to become.', styles=BlockContentStyle(bold=False, italic=False, underline=False))),
 Block(type='heading', props=BlockProps(level='2', checked=None, language=''), content=BlockContent(text='Reflective Questions for Exploring Self-Identity', styles=BlockContentStyle(bold=True, italic=False, underline=False))),
 Block(type='bulletListItem', props=None, content=BlockContent(text='What activities make you lose track of time?', styles=

In [90]:
responses[19].candidates[0].content.parts[0].text

'{"task": {"blocks": [{"type": "heading", "props": {"level": "1", "checked": null, "language": "string"}, "content": {"text": "Reflecting on Personal Growth", "styles": {"bold": true, "italic": false, "underline": false}}}, {"type": "paragraph", "props": {"level": null, "checked": null, "language": "string"}, "content": {"text": "Personal growth is more than just achieving goals; it\'s about who you become in the process. It involves identifying the qualities you want to develop and understanding how pursuing your objectives can help you cultivate them.", "styles": {"bold": false, "italic": false, "underline": false}}}, {"type": "heading", "props": {"level": "2", "checked": null, "language": "string"}, "content": {"text": "Understanding Your Identity", "styles": {"bold": true, "italic": false, "underline": false}}}, {"type": "paragraph", "props": {"level": null, "checked": null, "language": "string"}, "content": {"text": "Understanding your identity is crucial. Reflect on your strength

In [88]:
output = Output.model_validate(
    json.loads(responses[19].candidates[0].content.parts[0].text)
)

JSONDecodeError: Unterminated string starting at: line 1 column 31276 (char 31275)

In [50]:
current_index = 0

for module in course["modules"]:
    for concept in module["concepts"]:
        for task in concept["tasks"]:
            save_path = f"{dirname}/{current_index}.json"
            if os.path.exists(save_path):
                task["details"] = json.load(open(save_path, "r"))
            else:
                break
            current_index += 1

In [51]:
with open("goal_setting_v2_details.json", "w") as f:
    f.write(json.dumps(course, indent=4))

In [55]:
print(json.dumps(response.parsed.model_dump(), indent=4))

{
    "task": {
        "content": "Enabling people to have success and happiness in all areas of life is the ultimate goal. To achieve this, it's essential to understand the interconnectedness of various aspects of our lives: Self, Family, Work, and Society. Each of these areas influences and is influenced by the others.\n\n1.  Self: At the foundation is the 'Self'. This includes personal well-being, mindset, health, and individual goals. When you prioritize self-improvement and personal development, it enhances your ability to handle challenges and contribute positively to other areas of your life.\n\n2.  Family: The next level is 'Family'. Strong family relationships provide a support system, emotional stability, and a sense of belonging. Positive family dynamics can boost confidence, reduce stress, and improve overall happiness, which in turn positively affects your work and societal interactions.\n\n3.  Work: 'Work' represents your professional life, career, and financial stabilit