In [1]:
from datetime import datetime
import json

In [2]:
import anvil.server

anvil.server.connect("client_KYOM4YFDIE4TMEO3UAOTEPRC-55C7JZ62MGB2UXA6")

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Published" as CLIENT


In [3]:
from anvil.tables import app_tables

questions = app_tables.questions.search()

In [4]:
from dotenv import load_dotenv
import json_repair
import os

import google.generativeai as genai


key_gemini = os.environ["GOOGLE_API_KEY"]

genai.configure(api_key=key_gemini)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Create the model
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "application/json",
}

llm_gemini = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    # safety_settings = Adjust safety settings
    # See https://ai.google.dev/gemini-api/docs/safety-settings
)

In [6]:
json_schema = """
  {
    "topic_description": {
      "type": "string",
      "description": "A sentence describing the sub-topic to which the question belongs. That means this sentence should specify in a granular level what specific sub-topic the question belongs to. It should be abstract in a way that other questions could be put in this description too. Use between 5 and 10 words."
    },
    "level": {
      "type": "string",
      "description": "The difficulty level of the question. It should be only one of the following options: 'beginner', 'intermediate', 'advanced'."
    },
    "question": {
      "type": "string",
      "description": "The actual question text. It should be a question of type TRUE or FALSE. It means that the questions should be an assertion that could be answered with TRUE or FALSE."
    },
    "answer_correct": {
      "type": "string",
      "description": "The correct answer to the question. It should be only one of the following options: TRUE or FALSE"
    },
    "explanation": {
      "type": "string",
      "description": "An explanation or solution to the question."
    }
  }
"""

In [7]:
prompt_question_generator = """
    TASK CONTEXT:
    I am studying machine learning and I need to practice some questions on various topics.
    
    TASK DESCRIPTION:
    I will provide you with a list of topics, and I would like you to generate a list of TRUE or FALSE questions.
    These questions should be interesting, creative, challenging and thought-provoking. 
    Each question should be in the form of a statement that could be either TRUE or FALSE.
    Feel free to be imaginative and attempt to confuse the student by blending related concepts or similar words.
    I will provide the topics in the DOMAIN KNOWLEDGE section.
    The questions should pertain to these topics, and you can use this knowledge as a foundation to create questions that delve deeper into the subject matter.
    
    ADDITIONAL TASK DESCRIPTION:
    {additional_task_description}
    
    TASK REQUIREMENTS:
    Please refrain from creating questions that require mathematical calculations, but you may create questions with mathematical formulas.
    You SHOULD use LATEX to write mathematical formulas and code, but you should use the Katex flavor.
    Also you should put $$ in the beggining of the katex code and $$ at the end of the code. This is necessary because the interpreter needs it.
    
    TASK DETAILS:
    You should create {quantity} questions of level {level}.
    
    DOMAIN KNOWLEDGE:
    {domain_knowledge}
    
    FORMAT OUTPUT INSTRUCTIONS:
    It should be formatted in list of JSON objects as described below.
    {json_schema}
"""

In [8]:
topics = """
N-gram Language Models
Definition and basics of n-grams
Unigram, bigram, and trigram models
Smoothing techniques (e.g., Laplace, Kneser-Ney)
Perplexity and evaluation of n-gram models
Limitations of n-gram models
Use cases and applications
Neural Language Models

Self-Attention
Concept and Mechanics of Self-Attention
Understanding Self-Attention
Definition: Self-attention as a mechanism for relating different positions in a sequence to capture dependencies.
Contrast with traditional attention: Comparison with encoder-decoder attention in seq2seq models.
Intuition: Why self-attention works well for capturing long-range dependencies.
Mathematical Formulation
Query, Key, and Value vectors: How each input token is represented.
Computation of attention scores: Dot-product between Query and Key vectors.
Softmax normalization: Converting raw scores into probabilities.
Weighted sum of Value vectors: Producing the final output for each token.
Benefits of Self-Attention
Parallelizability: Self-attention allows parallel computation, unlike RNNs.
Capturing global dependencies: Ability to model relationships between distant tokens.
Flexibility in sequence length: Handling variable-length sequences more efficiently.

Scaled Dot-Product Attention
Definition and Motivation
The problem with dot-product attention: Issues with large values in attention scores.
Scaling factor (1/√d): Mitigating the issue of large dot-products.
Mathematical Derivation
Calculating the dot-product between Query and Key vectors.
Applying the scaling factor: Reducing the impact of large values.
Softmax application: Transforming the scaled scores into a probability distribution.
Computational Efficiency
Matrix multiplication: Efficient implementation using matrix operations.
Importance in practice: How scaling stabilizes gradients and improves convergence.

Multi-Head Attention
Concept and Purpose
The need for multiple attention heads: Capturing different aspects of relationships in the input sequence.
Combining information: Concatenating outputs from multiple heads.
Why it works: Diversification of attention focus across heads.
Detailed Operation
Splitting input embeddings: Dividing the input into multiple parts for different attention heads.
Independent attention computations: Each head computes attention separately.
Concatenation and projection: Combining the outputs from all heads and projecting them into the desired dimensionality.
Practical Implications
Increased model capacity: How multi-head attention enhances the model’s ability to learn complex patterns.
Visualization of attention heads: Analyzing what different heads focus on in various layers.

Positional Encoding
Why Positional Information is Needed
Lack of sequence order in self-attention: The need to incorporate positional information.
Alternatives: Comparison with other methods like RNNs that inherently capture order.
Mathematical Construction
Sine and cosine functions: How positional encodings are generated using periodic functions.
Encoding formula: Explanation of how positions are encoded differently for each dimension.
Adding positional encodings: How these encodings are combined with the input embeddings.
Impacts on Model Performance
Interpretation of positional encodings: How they help the model understand sequence order.
Visualization and analysis: Understanding the role of positional encodings in learned representations.

Applications in NLP
Machine Translation
Sequence-to-sequence learning: How self-attention is used to translate one language into another.
Replacing RNNs with Transformers: Advantages of using self-attention in machine translation tasks.
Examples of state-of-the-art models (e.g., Transformer, BERT) in translation tasks.
Text Classification
Using self-attention for document-level tasks: Capturing relationships across long documents.
Benefits over traditional methods: Improved accuracy in tasks like sentiment analysis and topic classification.
Language Modeling
Enhancing predictive power: How self-attention improves next-word prediction in language models.
Pretraining and fine-tuning: Role of self-attention in modern language models like GPT.
Handling large-scale data: Scalability of self-attention-based models in massive datasets.

Transformer Language Models
Introduction to the Transformer Architecture
Background and Motivation
The limitations of RNNs and CNNs: Issues with long-range dependencies and sequential processing.
The advent of Transformers: Introduction of the Transformer model by Vaswani et al. (2017).
Core Components of Transformers
Encoder and decoder stacks: Detailed explanation of the encoder-decoder architecture.
Attention mechanisms: How self-attention operates within each encoder and decoder layer.
Feedforward neural networks: The role of position-wise feedforward layers in the architecture.
Layer normalization and residual connections: Techniques for stabilizing training and improving convergence.
Encoder-Decoder Structure

Encoder Block
Self-attention in the encoder: How the encoder processes input sequences.
Feedforward layers: The role of dense layers in transforming attention outputs.
Stacking of multiple layers: Deepening the model to capture complex patterns.
Decoder Block
Self-attention in the decoder: How the decoder processes its own outputs.
Cross-attention: Interaction between encoder outputs and decoder inputs.
Autoregressive generation: The process of generating sequences step by step.
Masking in self-attention: Preventing access to future tokens during training.
Training Process
Input-output pairs: How Transformers are trained on parallel data (e.g., source-target pairs in translation).
Loss function: Use of cross-entropy loss in training.
Optimization techniques: Application of Adam optimizer and learning rate scheduling.
Attention Mechanisms in Transformers

Self-Attention in Encoder
Role of self-attention: How each token attends to all others in the input sequence.
Capturing contextual relationships: Enhancing understanding of dependencies within the sequence.
Self-Attention in Decoder
Role in sequence generation: How the decoder uses self-attention to generate text autoregressively.
Handling partial sequences: The importance of masking future tokens in the decoder’s self-attention.
Cross-Attention
Connecting encoder and decoder: How the decoder attends to encoder outputs.
Aligning source and target sequences: Enhancing translation accuracy through cross-attention.
Transformer vs. RNN/CNN Models

Comparative Advantages
Parallelism: Transformers vs. sequential processing in RNNs.
Memory efficiency: Handling long sequences without losing information.
Training speed: Faster convergence and scalability with Transformers.
Weaknesses of RNNs/CNNs
Difficulty in capturing long-range dependencies: The vanishing gradient problem in RNNs.
Fixed-size contexts: Limitations of CNNs in handling variable-length sequences.
Lack of parallelism: Slow training times due to sequential nature.
Case Studies
Application of Transformers in tasks where RNNs/CNNs struggled (e.g., translation, summarization).
Examples of model improvements (e.g., BERT outperforming RNN-based models).
Popular Transformer Models

BERT (Bidirectional Encoder Representations from Transformers)
Pretraining objectives: Masked language modeling (MLM) and next sentence prediction (NSP).
Fine-tuning for downstream tasks: How BERT is adapted for tasks like classification and QA.
Bidirectional context: The advantage of using both left and right contexts.
GPT (Generative Pretrained Transformer)
Autoregressive generation: Unidirectional language modeling in GPT.
Pretraining on large text corpora: The significance of scale in GPT models.
Few-shot learning: GPT’s ability to perform tasks with minimal examples.
T5 (Text-To-Text Transfer Transformer)
Unified framework: Treating all NLP tasks as text-to-text problems.
Pretraining and task-specific fine-tuning: How T5 handles diverse NLP tasks.
Model architecture: Key differences between T5 and other Transformer models.
Other Notable Models
RoBERTa, XLNet, ALBERT: Variants and improvements over original Transformer models.
Differences in training objectives, architectures, and performance.
Training and Fine-Tuning Transformers

Pretraining Techniques
Datasets for pretraining: Common corpora used for large-scale pretraining (e.g., Wikipedia, BookCorpus).
Pretraining objectives: Differences between masked language modeling, autoregressive modeling, etc.
Computational resources: Hardware and software requirements for training large models.

Fine-Tuning Strategies
Task-specific fine-tuning: Adapting pre-trained models for specific NLP tasks.
Hyperparameter tuning: Importance of learning rates, batch sizes, and other parameters in fine-tuning.
Challenges in fine-tuning: Overfitting, catastrophic forgetting, and other issues.
Transfer Learning in Practice
Adapting to new domains: How pretrained models can be transferred to domain-specific tasks.
Case studies: Examples of successful transfer learning with Transformer models.

Transfer Learning in NLP
Pretraining and fine-tuning paradigms
Domain adaptation and transferability
Zero-shot and few-shot learning
Fine-tuning techniques for NLP models
Case studies (e.g., BERT, GPT, RoBERTa)

Instruction Tuning
Definition and purpose
Differences between instruction tuning and fine-tuning
Techniques for instruction tuning
Applications in conversational AI

Tokenization in Language Models
Types of tokenization (word, subword, character)
Byte Pair Encoding (BPE)
WordPiece and SentencePiece algorithms
Tokenization challenges in multilingual models
Impact of tokenization on model performance
Parameter-Efficient Adaptation of LLMs

Low-Rank Adaptation (LoRa)
Concept and motivation
Implementation details
Advantages and limitations

Prompt Tuning
Principles of prompt tuning
Soft prompts vs. discrete prompts
Applications in few-shot learning

LLM Alignment & Reinforcement Learning from Human Feedback (RLHF)
Concept of LLM alignment with human values
RLHF process and techniques
Training LLMs with human feedback
Ethical considerations and challenges
Applications in safety and fairness

Direct Preference Optimization (DPO)
Definition and key concepts
Differences from RLHF
Use cases and advantages
Implementation strategies

Decoding from Language Models (Inference)
Greedy decoding, beam search, and sampling methods
Top-k, top-p (nucleus) sampling
Temperature scaling
Trade-offs between accuracy and diversity
Challenges in decoding (e.g., repetition, coherence)

Prompt Engineering and Retrieval-Augmented Generation (RAG)
Basics of prompt engineering
Designing effective prompts for NLP tasks
Retrieval-augmented generation (RAG) models
Integration of retrieval mechanisms with LLMs
Applications in QA and knowledge retrieval

Evaluating LLM-Generated Text
Evaluation metrics (e.g., BLEU, ROUGE, METEOR)
Human vs. automated evaluation
Measuring coherence, fluency, and relevance
Bias and fairness evaluation
Robustness testing

Position Embeddings
Role of position embeddings in Transformers
Absolute vs. relative position embeddings
Alternatives to position embeddings (e.g., rotary embeddings)
Impact on model performance

Efficient Attention Models
Linear attention mechanisms
Long-range attention models (e.g., Longformer, Reformer)
Sparse attention techniques
Memory-efficient Transformers
Trade-offs in model efficiency and accuracy

Scaling Laws for Large Language Models
Scaling laws in NLP
Relationship between model size, data, and performance
Practical implications of scaling laws
Challenges in training large-scale models

Vision-Language Models
Introduction to vision-language models (e.g., CLIP, DALL-E)
Multimodal embeddings
Cross-modal attention
Applications in image captioning, VQA, and generative tasks
Training strategies for vision-language models

Multimodal Models
Definition and types of multimodal models
Challenges in multimodal learning
Architectures combining text, image, and audio modalities
Applications in real-world scenarios
Evaluation of multimodal models

In-Context Learning
Definition and use cases
Techniques for in-context learning
Advantages over traditional fine-tuning
Applications in few-shot and zero-shot learning

Detecting LLM-Generated Text
Methods for detecting machine-generated text
Watermarking and fingerprinting techniques
Ethical implications of detection
Applications in content moderation and security

LLM Security
Security challenges in LLMs
Adversarial attacks on language models
Mitigation strategies
Secure deployment of LLMs
Case studies and real-world examples

LLM Interpretability
Probing
Techniques for probing LLMs
Understanding internal representations
Editing
Methods for editing LLM parameters
Practical applications of model editing
Induction Heads
Concept of induction heads in Transformers
Analysis and implications

Mixture of Experts
Concept of mixture of experts in NLP
Architectures (e.g., Switch Transformers, GShard)
Dynamic routing and expert selection
Scaling benefits and challenges
Use cases and applications

Mamba
Definition and overview of Mamba
Role in NLP research
Key features and capabilities
Applications and use cases

Griffin
Overview of Griffin in NLP
Comparison with other NLP models/tools
Key applications and advantages
"""

In [9]:
lines = topics.splitlines()
blocks = []
block = []
start_copy = False
for line in lines:
    # print(line)
    
    if line == "" and start_copy == False:
        start_copy = True
        continue

    if line == "" and start_copy == True:
        start_copy = False
        blocks.append( "\n".join(block))
        block = []
        continue
    
    if start_copy == True:
        block.append(line)
    

In [10]:
for block in blocks:
    for level in ["beginner", "intermediate", "hard"]:
        print("Level: ", level)

        parameters = {
            "title": "Advanced Natural Language Processing",
            "additional_task_description": "Create questions only about the definitions of the concepts, like mixing the definition of one with another, or mixing the use of one with the use of another. I need this to memorize this concepts.",
            "quantity": "4",
            "level": level,
            "domain_knowledge": block,
        }

        prompt_question_generator_formatted = prompt_question_generator.format(
            additional_task_description=parameters["additional_task_description"],
            quantity=parameters["quantity"],
            level=parameters["level"],
            domain_knowledge=parameters["domain_knowledge"],
            json_schema=json_schema,
        )

        response = llm_gemini.generate_content(prompt_question_generator_formatted)
        questions = json_repair.loads(response.text)

        if not isinstance(questions, list):
            questions = [questions]

        for question in questions:
            print(json.dumps(question, indent=4))
            # add to the database
            app_tables.questions.add_row(
                created_at=datetime.now(),
                title=parameters["title"],
                topic_description=question["topic_description"],
                level=question["level"],
                question=question["question"],
                type="true_or_false",
                answer_correct=question["answer_correct"],
                answers=None,
                explanation=question["explanation"],
                # user=anvil.users.get_user(),
            )

Level:  beginner
{
    "topic_description": "N-gram model definition",
    "level": "beginner",
    "question": "A bigram model considers the previous word when predicting the next word.",
    "answer_correct": "TRUE",
    "explanation": "A bigram model, as the name suggests, considers a sequence of two words (a bigram) to predict the next word in a sequence. It uses the information about the preceding word to make its predictions."
}
{
    "topic_description": "N-gram model definition",
    "level": "beginner",
    "question": "A unigram model is based on the frequency of individual words in the corpus, regardless of their context.",
    "answer_correct": "TRUE",
    "explanation": "A unigram model only considers the frequency of individual words in the corpus. It doesn't take into account the order or context of the words, only their individual probabilities."
}
{
    "topic_description": "N-gram smoothing techniques",
    "level": "beginner",
    "question": "Laplace smoothing adds 

In [11]:
if not isinstance(questions, list):
        questions = [questions]

for question in questions:
    print(json.dumps(question, indent=4))
    # add to the database
    # app_tables.questions.add_row(
    #     created_at=datetime.now(),
    #     title=parameters["title"],
    #     topic_description=question["topic_description"],
    #     level=question["level"],
    #     question=question["question"],
    #     type="true_or_false",
    #     answer_correct=question["answer_correct"],
    #     answers=None,
    #     explanation=question["explanation"],
    #     user=anvil.users.get_user(),
    # )

{
    "topic_description": "Dynamic routing in Mixture of Experts",
    "level": "hard",
    "question": "Dynamic routing in Mixture of Experts is solely based on the input data, with no consideration for the current state of the experts.",
    "answer_correct": "FALSE",
    "explanation": "Dynamic routing in Mixture of Experts can consider both the input data and the current state of the experts. This allows for more adaptive and efficient routing, taking into account the strengths and weaknesses of each expert based on past performance and the current input."
}
{
    "topic_description": "Architectures of Mixture of Experts",
    "level": "hard",
    "question": "GShard and Switch Transformers both utilize Mixture of Experts, but their primary focus differs. GShard aims to enhance training efficiency through parallel processing, while Switch Transformers prioritize efficient inference by dynamically selecting experts based on the input.",
    "answer_correct": "TRUE",
    "explanatio