In [2]:
# vector database
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Qdrant

# ingestion
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

# from llama_index.core import SimpleDirectoryReader

# chat
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.messages.base import BaseMessage

from langchain.prompts import FewShotPromptTemplate, PromptTemplate

from langchain_core.pydantic_v1 import BaseModel, Field

from pydantic import BaseModel
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
import json
import re

# system
import os
import logging
import sys

import nest_asyncio

# Patch asyncio (via the nest_asyncio package) so event loops can be nested.
nest_asyncio.apply()

# Root logger: DEBUG and above, timestamped, written to stdout.
# To log to a file instead, replace `stream` with filename='app.log' and
# filemode='a' (append).
logging.basicConfig(
    stream=sys.stdout,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# LangSmith tracing configuration.
# The API key is a secret: it must be supplied through the LANGCHAIN_API_KEY
# environment variable (shell export, .env loader, secrets manager) and never
# written as a literal in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked and re-issued.
os.environ["LANGCHAIN_PROJECT"] = "quiz"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# Only switch tracing on when a key is actually available.
if os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    logging.warning("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled")

In [None]:
logging.info('Inicializando LLM e embedings')

# The Google Generative AI key is a secret: read it from the GOOGLE_API_KEY
# environment variable instead of embedding it in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked and re-issued.
api_key_google = os.environ.get("GOOGLE_API_KEY")
if not api_key_google:
    raise RuntimeError("GOOGLE_API_KEY is not set; export it before running this cell")

# Chat model and embedding model share the same credential.
llm_google = ChatGoogleGenerativeAI(
    model="gemini-pro",
    convert_system_message_to_human=True,
    google_api_key=api_key_google,
)
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key_google,
)

In [None]:
from langchain_openai import ChatOpenAI

# The OpenAI key is a secret: read it from the OPENAI_API_KEY environment
# variable instead of embedding it in the notebook.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked and re-issued.
api_key_openai = os.environ.get("OPENAI_API_KEY")
if not api_key_openai:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell")

llm_openai = ChatOpenAI(openai_api_key=api_key_openai)

In [None]:
# JSON-schema-style description of ONE generated question object.
# It is injected verbatim into the question-generation prompt (as the
# `object_schema` partial variable) so the model knows which fields to emit.
# Note that `level` is restricted to 'beginner' / 'intermediate' / 'advanced'
# and `answer` to TRUE / FALSE.
object_schema = """
{
  "properties": {
    "topic_description": {
      "type": "string",
      "description": "A sentence describing the sub-topic to which the question belongs. That means this sentence should specify in a granular level what specific sub-topic the question belongs to. It should be abstract in a way that other questions could be put in this description too. Use between 5 and 10 words."
    },
    "level": {
      "type": "string",
      "description": "The difficulty level of the question. It should be only one of the following options: 'beginner', 'intermediate', 'advanced'."
    },
    "question": {
      "type": "string",
      "description": "The actual question text. It should be a question of type TRUE or FALSE. It means that the questions should be an assertion that could be answered with TRUE or FALSE."
    },
    "answer": {
      "type": "string",
      "description": "The correct answer to the question. It should be only one of the following options: TRUE or FALSE"
    },
    "explanation": {
      "type": "string",
      "description": "An explanation or solution to the question."
    }
  },
  "required": ["topic_description", "level", "question", "answer", "explanation"]
}
"""

In [None]:
# Prompt used to generate TRUE/FALSE quiz questions from a chunk of text.
#
# Runtime inputs (supplied on every invoke):
#   quantity         - how many questions to generate
#   level            - difficulty level requested from the model
#   domain_knowledge - the source text the questions must be based on
# Pre-filled input:
#   object_schema    - the JSON schema each returned object must follow
#
# `domain_knowledge` appears in the template, so it is declared in
# `input_variables` together with the other two runtime inputs.
prompt_question_generator = PromptTemplate(
    template="""
                TASK CONTEXT:
                I am studying machine learning and I need to practice some questions on various topics.
                
                TASK DESCRIPTION:
                I will provide you with a list of topics, and I would like you to generate a list of TRUE or FALSE questions.
                These questions should be interesting, creative, challenging and thought-provoking. 
                Each question should be in the form of a statement that could be either TRUE or FALSE.
                Feel free to be imaginative and attempt to confuse the student by blending related concepts or similar words.
                I will provide the topics in the DOMAIN KNOWLEDGE section.
                The questions should pertain to these topics, and you can use this knowledge as a foundation to create questions that delve deeper into the subject matter.
                
                TASK REQUIREMENTS:
                Please refrain from creating questions that require mathematical calculations, but you may create questions with mathematical formulas.
                You SHOULD use LATEX to write mathematical formulas and code, but you should use the Katex flavor.
                Also you should put $$ in the beggining of the katex code and $$ at the end of the code. This is necessary because the interpreter needs it.
                
                TASK DETAILS:
                You should create {quantity} questions of level {level}.
                
                DOMAIN KNOWLEDGE:
                {domain_knowledge}
                
                FORMAT OUTPUT INSTRUCTIONS:
                The output should be formatted as a JSON list of objects that conforms class object schema below.
                You should output just the Json list. 
                You should not output any other word like "json" in the beginning because it will ruin the parser.

                ```
                {object_schema}
                ```
            """,
    input_variables=["quantity", "level", "domain_knowledge"],
    partial_variables={"object_schema": object_schema},
)

In [None]:
from supabase import create_client, Client

In [None]:
# Supabase connection settings.
# The project URL is not secret and keeps its previous value as the default;
# it can be overridden with SUPABASE_URL.
url = os.environ.get("SUPABASE_URL", "https://xoxlgvakygiyfijfeixu.supabase.co")

# The API key IS secret and must come from the SUPABASE_KEY environment
# variable.
# NOTE(review): the token that used to be hard-coded here was a service-role
# JWT; it has been exposed and should be rotated in the Supabase dashboard.
key = os.environ.get("SUPABASE_KEY")
if not key:
    raise RuntimeError("SUPABASE_KEY is not set; export it before running this cell")

supabase: Client = create_client(url, key)

In [None]:
import  json_repair

def json_parser(message: AIMessage) -> List[dict]:
    """Parse the model's reply into a list of question dictionaries.

    The reply text is decoded with ``json_repair.loads`` and then normalised
    so the declared return type always holds:

    * a single JSON object is wrapped in a one-element list;
    * a JSON list is returned with any non-object entries dropped;
    * anything else (for example a bare string) raises ``ValueError``.

    Args:
        message: Chat model response whose ``content`` holds the JSON text.

    Returns:
        A list of dictionaries, one per generated question.

    Raises:
        ValueError: If the decoded content is neither a JSON object nor a
            JSON list.
    """
    parsed = json_repair.loads(message.content)

    if isinstance(parsed, dict):
        # The model returned one object instead of a list of objects.
        return [parsed]

    if not isinstance(parsed, list):
        raise ValueError(
            f"Expected a JSON list of question objects, got {type(parsed).__name__}"
        )

    # Callers assign keys on every item, so keep only real objects.
    return [item for item in parsed if isinstance(item, dict)]

In [None]:
def get_h1(html_header_splits):
    """Return the "Header 1" metadata value of the first split that has one.

    Splits are scanned in order; ``None`` is returned when no split carries a
    "Header 1" entry in its ``metadata``.
    """
    h1_values = (
        doc.metadata["Header 1"]
        for doc in html_header_splits
        if "Header 1" in doc.metadata.keys()
    )
    return next(h1_values, None)
            
# get_h1(html_header_splits)

In [None]:
def get_sub_header(split):
    """Return the most specific header recorded in ``split.metadata``.

    "Header 3" is preferred over "Header 2", which is preferred over
    "Header 1". ``None`` is returned when none of the three is present.
    """
    metadata = split.metadata
    for header_key in ('Header 3', 'Header 2', 'Header 1'):
        if header_key in metadata.keys():
            return metadata[header_key]
    return None
    
# get_sub_header(char_splits[12])

In [4]:

from langchain_text_splitters import HTMLHeaderTextSplitter

def get_text_splits_from_text(texts):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    The splitter is configured with ``chunk_size=4000`` and
    ``chunk_overlap=100``; the result of ``split_documents(texts)`` is
    returned unchanged.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=4000)
    chunks = splitter.split_documents(texts)
    return chunks

def get_text_from_html(url = None, text = None):
    """Split an HTML page into sections keyed by its h1-h5 headers.

    Exactly one source is used: ``url`` takes precedence when both are given.

    Args:
        url: Address of the page to download and split.
        text: Raw HTML to split, used only when ``url`` is not provided.

    Returns:
        The header-based splits produced by ``HTMLHeaderTextSplitter``.

    Raises:
        ValueError: If neither ``url`` nor ``text`` is provided.
    """
    # Fail early with a clear message instead of hitting an unbound local
    # after the splitter has already been built.
    if not url and not text:
        raise ValueError("get_text_from_html requires either `url` or `text`")

    headers_to_split_on = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
        ("h4", "Header 4"),
        ("h5", "Header 5"),
    ]

    html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    if url:
        return html_splitter.split_text_from_url(url)
    return html_splitter.split_text(text)

In [None]:
# Module-level accumulators filled by generate_questions():
#   response_list        - every raw model response
#   question_list        - every parsed question
#   try_later_split_list - splits whose processing raised an error
response_list, question_list, try_later_split_list = [], [], []

In [None]:
def generate_questions(splits, llm, parameters, html_header_splits):
    """Generate questions for every split and insert them into Supabase.

    For each split the question prompt is run through ``llm``, the reply is
    parsed with ``json_parser``, each question is tagged with the page title
    and the split's most specific header, and the batch is inserted into the
    ``questions`` table.

    Side effects on module-level lists:
        * ``response_list`` receives every raw model response.
        * ``question_list`` receives every parsed question (appended before
          the database insert is attempted).
        * ``try_later_split_list`` receives every split whose processing
          raised an exception; the error is printed and the loop continues.

    Args:
        splits: Text chunks; each must expose ``page_content`` and ``metadata``.
        llm: Chat model piped after ``prompt_question_generator``.
        parameters: Prompt inputs other than ``domain_knowledge`` (that value
            is filled in per split). The dict is NOT modified.
        html_header_splits: Header-level splits of the same page, used to
            look up the page's "Header 1".
    """
    print( "------------------- generate_questions FUNCTION -------------------" )

    # Loop-invariant work: the chain and the page title are the same for
    # every split, so build/look them up once.
    chain = prompt_question_generator | llm
    h1 = get_h1(html_header_splits)

    for split in splits:
        print( "-------------------- split --------------------" )
        print( split )
        print( "-------------------- split --------------------" )

        try:
            # Build a fresh input dict so the caller's `parameters` is left
            # untouched between calls.
            chain_input = {**parameters, "domain_knowledge": split.page_content}

            response = chain.invoke(chain_input)
            response_list.append(response)

            questions = json_parser(response)

            # Fall back to the page title when the split has no header.
            sub_header = get_sub_header(split)
            if sub_header is None:
                sub_header = h1

            for q in questions:
                q["subject_matter_1"] = h1
                q["subject_matter_2"] = sub_header

            question_list.extend(questions)

            supabase.table('questions').insert(questions).execute()
        except Exception as e:
            # Best effort: report, remember the split for a later retry, and
            # move on to the next one.
            print("An error occurred:", e)
            try_later_split_list.append(split)

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def extract_links_from_url(url, timeout=30):
    """Download a page and return the targets of all its ``<a href>`` links.

    Hrefs that start with ``http`` are returned as-is; every other href is
    resolved against ``url`` with ``urljoin``.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A list of link strings in document order, or an empty list when the
        response status is not 200 or any error occurs (the problem is
        printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve page: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href')
            # Relative hrefs are resolved against the page's own URL.
            links.append(href if href.startswith('http') else urljoin(url, href))
        return links
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [None]:
def clean_links(links):
    """Filter ``links`` in place and return the same list object.

    A link is removed when it is one of four specific aman.ai URLs listed
    below or when it does not start with "https://aman.ai". The relative
    order of the remaining links is preserved.
    """
    excluded = (
        'https://aman.ai',
        'https://aman.ai/',
        'https://aman.ai/primers/ai/ml-comp/',
        'https://aman.ai/cs229/linear-regression/',
    )
    # Slice assignment keeps the caller's list object and rewrites its contents.
    links[:] = [
        link
        for link in links
        if link not in excluded and link.startswith("https://aman.ai")
    ]
    return links

In [None]:
# main_urls = [
#     "https://aman.ai/cs229/",
#     "https://aman.ai/cs230/",
#     "https://aman.ai/cs231n/",
#     "https://aman.ai/cs224n/",
#     "https://aman.ai/coursera-ml/",
#     "https://aman.ai/recsys/index.html",
#     "https://aman.ai/coursera-dl/",
#     "https://aman.ai/coursera-nlp/",
#     "https://aman.ai/multimodal/",
#     "https://aman.ai/primers/ai/",
#     "https://aman.ai/primers/graph/",
#     "https://aman.ai/primers/numpy/",
#     "https://aman.ai/primers/matplotlib/",
#     "https://aman.ai/primers/pandas/",
#     "https://aman.ai/primers/python/",
#     "https://aman.ai/primers/tensorflow/",
#     "https://aman.ai/primers/backprop/",
#     "https://aman.ai/primers/math/",
#     "https://aman.ai/code/",
#     "https://aman.ai/code/data-structures/",
#     "https://aman.ai/code/asymptotic-notations/",
# ]

# Index pages to crawl. Every link found on each of these pages is processed
# by the main loop.
# The first entry is commented out and marked "ok" by the author — presumably
# it has already been processed (TODO confirm).
main_urls = [
    # "https://aman.ai/primers/math/", ok
    "https://aman.ai/primers/ai/",
    "https://aman.ai/primers/numpy/",
    "https://aman.ai/primers/pandas/",
    "https://aman.ai/primers/python/",
    "https://aman.ai/primers/matplotlib/",
    "https://aman.ai/primers/tensorflow/",
    "https://aman.ai/code/",
    "https://aman.ai/code/data-structures/",
    "https://aman.ai/code/asymptotic-notations/",
    "https://aman.ai/coursera-ml/",
    "https://aman.ai/coursera-dl/",
    "https://aman.ai/coursera-nlp/",
    "https://aman.ai/cs229/",
    "https://aman.ai/cs230/",
    "https://aman.ai/cs231n/",
    "https://aman.ai/cs224n/",
    "https://aman.ai/recsys/index.html",
    "https://aman.ai/multimodal/",
    "https://aman.ai/primers/graph/",
    "https://aman.ai/primers/backprop/",
]





In [None]:
# Suppress all log records of severity DEBUG and below, for every logger;
# INFO and above are still emitted.
logging.disable(logging.DEBUG)

# Re-enable debug logs
# logging.disable(logging.NOTSET)

In [None]:
# Main crawl: for every index page in `main_urls`, collect its aman.ai links,
# split each linked page into chunks, and generate 5 questions per chunk for
# each difficulty level.
#
# The difficulty levels must be the exact values allowed by the `level` field
# of `object_schema` ('beginner', 'intermediate', 'advanced').
#
# The outer loop variable is named `main_url` (not `url`) so it does not
# overwrite the module-level `url` used for the Supabase client.
print( "Quantity of main_urls", len( main_urls ) )

for i_main_urls, main_url in enumerate(main_urls):
    print("")
    print( "main_urls #", i_main_urls )
    print( "main url:", main_url )

    internal_links = extract_links_from_url(main_url)
    internal_links_cleaned = clean_links(internal_links)

    print("")
    print( "Quantity of internal_links_cleaned", len( internal_links_cleaned ) )

    for i_internal_links_cleaned, link in enumerate(internal_links_cleaned):
        print("")
        print( "internal link #", i_internal_links_cleaned )
        print( "internal link", link )

        html_header_splits = get_text_from_html(link)
        splits  = get_text_splits_from_text(html_header_splits)

        for level in ["beginner", "intermediate", "advanced"]:
            print( "level:", level )

            parameters = {
                "quantity": 5,
                "level": level,
            }

            print("")
            print( "Quantity of splits", len( splits ) )

            generate_questions(splits, llm_google, parameters, html_header_splits)