In [2]:
# vector database
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_community.vectorstores import Qdrant

# ingestion
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

# from llama_index.core import SimpleDirectoryReader

# chat
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.messages.base import BaseMessage

from langchain.prompts import FewShotPromptTemplate, PromptTemplate

from langchain_core.pydantic_v1 import BaseModel, Field

from pydantic import BaseModel
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
import json
import re

# system
import os
import logging
import sys

# import nest_asyncio

# nest_asyncio.apply()

logging.basicConfig(level=logging.DEBUG,  # Define o nível de log
                    format='%(asctime)s - %(levelname)s - %(message)s',  # Define o formato da mensagem de log
                    stream=sys.stdout)  # Define a saída do log para stdout
                    # filename='app.log',  # Define o arquivo onde os logs serão gravados
                    # filemode='a')  # Define o modo de escrita do arquivo de log (append)

In [3]:
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

True

In [4]:
logging.info('Inicializando LLM e embedings')
llm_google = ChatGoogleGenerativeAI(model="gemini-1.5-flash", convert_system_message_to_human=True)
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key_google)

2024-08-12 14:02:37,559 - INFO - Inicializando LLM e embedings
2024-08-12 14:02:37,588 - DEBUG - Using AsyncIOEngine.POLLER as I/O engine


I0000 00:00:1723471357.583539     685 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


In [5]:
# from langchain_openai import ChatOpenAI
# from langchain_openai import ChatOpenAI

# llm_openai = ChatOpenAI(openai_api_key="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")

In [6]:
object_schema = """
{
  "properties": {
    "topic_description": {
      "type": "string",
      "description": "A sentence describing the sub-topic to which the question belongs. That means this sentence should specify in a granular level what specific sub-topic the question belongs to. It should be abstract in a way that other questions could be put in this description too. Use between 5 and 10 words."
    },
    "level": {
      "type": "string",
      "description": "The difficulty level of the question. It should be only one of the following options: 'easy', 'medium', 'hard'."
    },
    "question": {
      "type": "string",
      "description": "The actual question text. It should be a question of type TRUE or FALSE. It means that the questions should be an assertion that could be answered with TRUE or FALSE."
    },
    "answer": {
      "type": "string",
      "description": "The correct answer to the question. It should be only one of the following options: TRUE or FALSE"
    },
    "explanation": {
      "type": "string",
      "description": "An explanation or solution to the question."
    }
  },
  "required": ["topic_description", "level", "question", "answer", "explanation"]
}
"""

In [7]:
prompt_question_generator = PromptTemplate(
    template="""
                TASK CONTEXT:
                I am studying machine learning and I need to practice some questions on various topics.
                
                TASK DESCRIPTION:
                I will provide you with a list of topics, and I would like you to generate a list of TRUE or FALSE questions.
                These questions should be interesting, creative, challenging and thought-provoking. 
                Each question should be in the form of a statement that could be either TRUE or FALSE.
                Feel free to be imaginative and attempt to confuse the student by blending related concepts or similar words.
                I will provide the topics in the DOMAIN KNOWLEDGE section.
                The questions should pertain to these topics, and you can use this knowledge as a foundation to create questions that delve deeper into the subject matter.
                
                ADDITIONAL TASK DESCRIPTION:
                {additional_task_description}
                
                TASK REQUIREMENTS:
                Please refrain from creating questions that require mathematical calculations, but you may create questions with mathematical formulas.
                You SHOULD use LATEX to write mathematical formulas and code, but you should use the Katex flavor.
                Also you should put $$ in the beggining of the katex code and $$ at the end of the code. This is necessary because the interpreter needs it.
                
                TASK DETAILS:
                You should create {quantity} questions of level {level}.
                
                DOMAIN KNOWLEDGE:
                {domain_knowledge}
                
                FORMAT OUTPUT INSTRUCTIONS:
                The output should be formatted as a JSON list of objects that conforms class object schema below.
                You should output just the Json list. 
                You should not output any other word like "json" in the beginning because it will ruin the parser.

                ```
                {object_schema}
                ```
            """,
    input_variables=["quantity", "level", "additional_task_description"],
    partial_variables={"object_schema": object_schema},
)

In [8]:
from supabase import create_client, Client

In [9]:
import os

url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

2024-08-12 14:02:56,509 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-08-12 14:02:56,512 - DEBUG - load_verify_locations cafile='/workspaces/llm-quiz-creator-streamlitapp-trainer/.venv/lib/python3.10/site-packages/certifi/cacert.pem'


In [10]:
from langchain_core.messages import AIMessage, HumanMessage
from typing import List, Optional
import  json_repair

def json_parser(message: AIMessage) -> List[dict]:
    return json_repair.loads(message.content)

In [13]:
def generate_questions(text, llm, parameters, subject_matter_1, subject_matter_2):
    print( "------------------- generate_questions FUNCTION -------------------" )
    
    if text is None:
        raise Exception("text is None")
        
    try:
        chain = prompt_question_generator | llm
        
        parameters["domain_knowledge"] = text
        
        response = chain.invoke(parameters)
        
        questions = json_parser(response)
        
        for q in questions:
            q["subject_matter_1"] = subject_matter_1
            q["subject_matter_2"] = subject_matter_2    

        print("---------------------------questions---------------------------")
        print(questions)
        print("---------------------------questions---------------------------")
        
        # data, count = supabase.table('questions').insert(questions).execute()
        
    except Exception as e:
        print("An error occurred:", e)

In [15]:
text = """
Questions about the correct shape of the layers in Neural network implementation in Pytorch. 
We know that when we are creating neural networks we need to know the if the shapes are correct aligned. SO I want to test this.
"""

In [26]:
text = """
Create Pytorch code of a neural network implementation and ask if the shape of the layer is correct.
"""

In [24]:
text = """
Create Pytorch code of a neural network implementation and ask if the shape of the layer is correct.
Generate just the pytorch code. Dont start with 'The following PyTorch code defines a neural network with ....'
"""

In [27]:
for level in [
    "beginner", 
    "intermediate", 
    "hard"
    ]:
    print( "level:", level )
    
    parameters = {
        "quantity": 1,
        "level": level,
        "additional_task_description": "Create questions only about the definitions of the concepts, like mixing the definition of one with another distribution, or mixing the use of one with the use of another. I need this to memorize this concepts."
    }
    
    subject_matter_1 = "Probability - LLMs"
    subject_matter_2 = subject_matter_1
    
    generate_questions(text, llm_google, parameters, subject_matter_1, subject_matter_2)

level: beginner
------------------- generate_questions FUNCTION -------------------




---------------------------questions---------------------------
[{'topic_description': 'PyTorch neural network layer shape', 'level': 'beginner', 'question': 'The following PyTorch code defines a neural network with a hidden layer of shape (10, 5) and an output layer of shape (5, 1). The shape of the hidden layer is correctly defined according to the input and output dimensions.\n\n```python\nimport torch.nn as nn\n\nclass Net(nn.Module):\n    def __init__(self):\n        super(Net, self).__init__()\n        self.fc1 = nn.Linear(10, 5)  # Hidden layer\n        self.fc2 = nn.Linear(5, 1)  # Output layer\n\n    def forward(self, x):\n        x = self.fc1(x)\n        x = self.fc2(x)\n        return x\n```', 'answer': 'TRUE', 'explanation': 'The code defines a neural network with a hidden layer of shape (10, 5), which means it has 10 input features and 5 output neurons. This is consistent with the input and output dimensions of the layers. The output layer has a shape of (5, 1), meaning it



---------------------------questions---------------------------
[{'topic_description': 'PyTorch neural network layer shape', 'level': 'intermediate', 'question': 'The following PyTorch code defines a neural network with a hidden layer of shape (10, 5) and an output layer of shape (5, 1). The shape of the hidden layer is correct, considering the input is a batch of 10 samples with 5 features each.', 'answer': 'FALSE', 'explanation': 'The shape of the hidden layer is incorrect. The first dimension of the hidden layer should match the number of features in the input, which is 5. The second dimension should match the number of neurons in the hidden layer, which is 10. Therefore, the correct shape for the hidden layer is (5, 10). The provided code defines a hidden layer with the shape (10, 5), which is incorrect. The output layer shape is correct, as it needs to match the number of neurons in the hidden layer (10) and the number of output classes (1). \n\n```python\nimport torch.nn as nn\n\



---------------------------questions---------------------------
[{'topic_description': 'Neural network implementation in PyTorch', 'level': 'hard', 'question': 'The following PyTorch code for a neural network with two hidden layers, each with 100 neurons, and an output layer with 10 neurons, will result in an output layer with a shape of (batch\\_size, 10) after processing an input of shape (batch\\_size, 50):\\n\\n```python\\nimport torch.nn as nn\\n\\nclass Net(nn.Module):\\n    def __init__(self):\\n        super(Net, self).__init__()\\n        self.fc1 = nn.Linear(50, 100)\\n        self.fc2 = nn.Linear(100, 100)\\n        self.fc3 = nn.Linear(100, 10)\\n\\n    def forward(self, x):\\n        x = torch.relu(self.fc1(x))\\n        x = torch.relu(self.fc2(x))\\n        x = self.fc3(x)\\n        return x\\n```', 'answer': 'TRUE', 'explanation': 'The code defines a neural network with two hidden layers of 100 neurons each and an output layer of 10 neurons. The input shape is (batch\\_s