In [110]:
import os
from dotenv import load_dotenv

import pandas as pd

In [59]:
from langchain_core.prompts import PromptTemplate

# Gemini

In [85]:
import google.generativeai as genai

key_gemini = os.environ["GOOGLE_API_KEY"]

# genai.configure(api_key=key_gemini)

# # Create the model
# generation_config = {
#     "temperature": 1,
#     "top_p": 0.95,
#     "top_k": 64,
#     "max_output_tokens": 8192,
#     "response_mime_type": "application/json",
# }

# llm_gemini = genai.GenerativeModel(
#     model_name="gemini-1.5-flash",
#     generation_config=generation_config,
#     # safety_settings = Adjust safety settings
#     # See https://ai.google.dev/gemini-api/docs/safety-settings
# )


from langchain_google_genai import ChatGoogleGenerativeAI

llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=500,
    # other params...
)

# OpenAI

In [3]:
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [4]:
key_openai = os.environ["OPENAI_API_KEY"]

llm_openai = ChatOpenAI(model="gpt-4o-mini", api_key=key_openai)

In [55]:
topics = []

def save(topic):
    if topic not in topics:
        topics.append(topic.copy())

In [56]:
with open("../content_requirements/probability_extracted.txt", 'r') as reader:
    last_space_count = 0
    curr_topic = []

    for line in reader:
        curr_space_count = line.find('-')
        print(curr_space_count, line, end="")

        if len(curr_topic) == 0: #fix edge case 
            curr_topic.append(line)
        else:
            # case when is starting a group
            if last_space_count < curr_space_count:
                curr_topic.append(line.strip())
            
            # case when is ending a group
            elif last_space_count > curr_space_count:
                curr_topic.pop() # remove the leaf
                
                # remove the groups if necessary
                qty_remove = int((last_space_count - curr_space_count) / 4)
                for _ in range(qty_remove):
                    curr_topic.pop() # remove the last from the topic list twice
                
                curr_topic.append(line.strip()) # add the new one
            
            # case when it got to the leaf
            else: # last_space_count == curr_space_count:
                # topics.append(curr_topic.copy()) # save
                save(curr_topic)
                curr_topic.pop() # remove the last from the topic
                curr_topic.append(line.strip()) # add the new one
                # topics.append(curr_topic.copy()) # save the new one before go to the next line
                save(curr_topic)
        
        last_space_count = curr_space_count

0 - Probability & Statistics
4     - Metrics and Model Diagnostics
8         - Confusion Matrix
12             - Calculate different metrics to measure the validity of the model
12             - Examine the types of errors in confusion matrices
12             - Describe the cells of a confusion matrix
8         - Residuals
12             - Examine the residuals of the regression line
12             - Analyze a residuals plot
12             - Measure the direction of the vector of residuals
8         - Effect Sizes
12             - Calculate Cohen's d
12             - Calculate R-squared
12             - Calculate area under the ROC curve
4     - Probabilistic Theory
8         - Probabilistic Independence
12             - Identify independent events in probabilities
12             - Compute the probability of a series of n independent events
8         - Generative and Discriminative Models
12             - Describe the features and uses of generative models
12             - Describe the

In [57]:
topics

[['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Confusion Matrix',
  '- Calculate different metrics to measure the validity of the model'],
 ['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Confusion Matrix',
  '- Examine the types of errors in confusion matrices'],
 ['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Confusion Matrix',
  '- Describe the cells of a confusion matrix'],
 ['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Residuals',
  '- Examine the residuals of the regression line'],
 ['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Residuals',
  '- Analyze a residuals plot'],
 ['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Residuals',
  '- Measure the direction of the vector of residuals'],
 ['- Probability & Statistics\n',
  '- Metrics and Model Diagnostics',
  '- Effect Sizes',
  "- Calculate Cohen's d"],
 ['- Prob

In [60]:
# prompt = (
#     "I want you to work in steps. I will give you one task, after you complete it, you keep generating tokens and start the second task."
#     "#FIRST TASK: Create a list of topics of the mais concepts about a topic below. \n\n"
#     "{topic} \n\n"
#     "# SECOND TASK: I want you to expand the list that you just created to put the actual concepts in sub-itens. \n"
#     "Output format"
# )

# prompt_template = PromptTemplate.from_template(prompt)

In [68]:
prompt = (
    "Create a list of topics of the mais concepts about a topic below. \n\n"
    "You should put the actual concepts in sub-itens. \n"
    # "I will use"
    "{topic} \n\n"
    "Output format: markdown code"
)

prompt_template = PromptTemplate.from_template(prompt)

In [86]:
chain = prompt_template | llm_gemini
# chain = prompt_template | llm_openai

In [87]:
topics[0][-2:]

['- Confusion Matrix',
 '- Calculate different metrics to measure the validity of the model']

In [88]:
topic = " ".join(topics[0][-2:])
topic

'- Confusion Matrix - Calculate different metrics to measure the validity of the model'

In [89]:
response = chain.invoke(
    {
        "topic": topic
    }
)

response

AIMessage(content="## Confusion Matrix: Measuring Model Validity\n\n**1. Understanding the Confusion Matrix**\n    - **True Positives (TP):** Correctly predicted positive cases.\n    - **True Negatives (TN):** Correctly predicted negative cases.\n    - **False Positives (FP):** Incorrectly predicted positive cases (Type I Error).\n    - **False Negatives (FN):** Incorrectly predicted negative cases (Type II Error).\n\n**2. Key Metrics Derived from the Confusion Matrix**\n    - **Accuracy:** Overall correct predictions (TP + TN) / Total cases.\n    - **Precision:** Proportion of correctly predicted positive cases out of all predicted positive cases (TP / (TP + FP)).\n    - **Recall (Sensitivity):** Proportion of correctly predicted positive cases out of all actual positive cases (TP / (TP + FN)).\n    - **Specificity:** Proportion of correctly predicted negative cases out of all actual negative cases (TN / (TN + FP)).\n    - **F1-Score:** Harmonic mean of precision and recall, balancing

In [90]:
response.content

"## Confusion Matrix: Measuring Model Validity\n\n**1. Understanding the Confusion Matrix**\n    - **True Positives (TP):** Correctly predicted positive cases.\n    - **True Negatives (TN):** Correctly predicted negative cases.\n    - **False Positives (FP):** Incorrectly predicted positive cases (Type I Error).\n    - **False Negatives (FN):** Incorrectly predicted negative cases (Type II Error).\n\n**2. Key Metrics Derived from the Confusion Matrix**\n    - **Accuracy:** Overall correct predictions (TP + TN) / Total cases.\n    - **Precision:** Proportion of correctly predicted positive cases out of all predicted positive cases (TP / (TP + FP)).\n    - **Recall (Sensitivity):** Proportion of correctly predicted positive cases out of all actual positive cases (TP / (TP + FN)).\n    - **Specificity:** Proportion of correctly predicted negative cases out of all actual negative cases (TN / (TN + FP)).\n    - **F1-Score:** Harmonic mean of precision and recall, balancing both metrics (2 *

In [124]:
df = pd.DataFrame(columns=["topic_level_1", "topic_level_2", "topic_level_3", "topic_level_4", "content"])
df

Unnamed: 0,topic_level_1,topic_level_2,topic_level_3,topic_level_4,content


In [129]:
import time

for topic in topics[15:]:
    topic_joined = " ".join(topic[-2:])
    print(topic_joined)

    try:
        response = chain.invoke(
            {
                "topic": topic_joined
            }
        )
    except:
        time.sleep(70)
        response = chain.invoke(
            {
                "topic": topic_joined
            }
        )

    
    row = topic.copy()
    row.append(response.content)
    
    df.loc[len(df)] = row
    df.to_csv("probability_content.csv")

- Probability Distributions - Find a probability using a Bernoulli distribution
- Probability Distributions - Differentiate between different types of distributions
- Probability Distributions - Find a probability using a uniform distribution
- Probability Distributions - Find a probability using a Poisson distribution
- Probability Distributions - Find a probability using a binomial distribution
- Probability Distributions - Find a probability using a normal distribution
- Operations on Probabilities - Find the probability of difference and complement
- Operations on Probabilities - Find the probability of intersection and union
- Probability Density Function (PDF) - Calculate mean using a probability density function (PDF)
- Probability Density Function (PDF) - Calculate a variance using a probability density function (PDF)
- Probability Density Function (PDF) - Calculate a probability using a density function
