<a href="https://colab.research.google.com/github/satya-ip/ai/blob/main/Understanding_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modern AI Pro
Let's understand the foundation of LLMs.

## 0. Token prediction by humans

In [None]:
# Simple Next Token Prediction Game for Beginners
import requests
import random
import re
def get_book_text():
    """Download Alice in Wonderland from Project Gutenberg.

    Returns:
        The book text. When both the ``*** START OF`` and ``*** END OF``
        markers are present, only the content between them is returned
        (without the rest of the START marker line); otherwise the whole
        response body is returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.exceptions.RequestException: On connection problems or
            when the request times out.
    """
    url = "https://www.gutenberg.org/files/11/11-0.txt"
    # A timeout keeps the notebook cell from hanging forever on a stalled
    # connection; raise_for_status() stops an HTML error page from being
    # mistaken for the book.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    text = response.text

    # Extract main content between markers
    if "*** START OF" in text and "*** END OF" in text:
        body = text.split("*** START OF")[1].split("*** END OF")[0]
        # The split leaves the tail of the START marker line (the title and
        # closing asterisks) at the front; drop it when a line break follows.
        _, line_break, remainder = body.partition("\n")
        text = remainder if line_break else body

    return text

def get_good_sentences(text, min_length=15, max_length=30):
    """Return the sentences of *text* whose word count is within bounds.

    A sentence is a run that starts at an uppercase ASCII letter and ends at
    the first ``.``, ``!`` or ``?``. Words are counted by splitting on
    whitespace; both bounds are inclusive.
    """
    # Candidate sentences, trimmed of surrounding whitespace
    candidates = (
        match.strip() for match in re.findall(r'[A-Z][^.!?]*[.!?]', text)
    )

    # Keep only those with an acceptable number of words
    return [
        candidate
        for candidate in candidates
        if min_length <= len(candidate.split()) <= max_length
    ]

In [None]:
def _normalize_word(word):
    """Return *word* lower-cased and stripped of punctuation for comparison."""
    # Typographic apostrophes become plain ones so that a typed "don't" can
    # match the same word written with a curly apostrophe in the book text.
    word = word.lower().replace("\u2019", "'")
    # Keep only letters, digits and apostrophes. Underscores are removed too:
    # the text marks emphasis as _word_, and `\w` on its own would keep them,
    # making such words impossible to guess.
    word = re.sub(r"[^\w']|_", "", word)
    # Apostrophes left at the edges are quotation marks, not part of the word.
    return word.strip("'")


def play_continuous_game(sentences, num_sentences=3):
    """Play an interactive next-word guessing game on the console.

    For each round one sentence is chosen, its first five words are shown,
    and the player is asked (via ``input``) to guess every following word.
    After each guess the real word is revealed. A final score and accuracy
    are printed at the end.

    Args:
        sentences: List of sentence strings to draw from. If it is empty a
            short notice is printed and the game does not start.
        num_sentences: Number of rounds (one sentence per round).

    Returns:
        None. All results are printed.
    """
    if not sentences:
        # Without this guard the modulo below would divide by zero.
        print("No sentences available - nothing to play.")
        return

    print("Welcome to Continuous Next Token Prediction!")
    print("After each prediction, I'll show the correct word")
    print("and ask you to predict the next one.\n")

    score = 0
    total_predictions = 0

    for round_num in range(1, num_sentences+1):
        # Select a sentence (wraps around when there are fewer sentences
        # than rounds)
        sentence = sentences[round_num % len(sentences)]
        words = sentence.split()

        # Start with first 5 words
        revealed_count = 5
        revealed = words[:revealed_count]
        # Counted separately from the running total so the end-of-round
        # message reports this round only.
        round_predictions = 0

        print(f"\n--- Sentence {round_num}/{num_sentences} ---")
        print(f"Starting context: \"{' '.join(revealed)}...\"")

        # Continue predicting until end of sentence
        while revealed_count < len(words):
            # Get prediction for next word
            print("\nPredict the next word:")
            guess_parts = input("> ").split()

            # Only the first typed word counts as the guess
            prediction = _normalize_word(guess_parts[0]) if guess_parts else ""

            # Get actual next word
            next_word = words[revealed_count]
            clean_next = _normalize_word(next_word)

            # Score
            if prediction == clean_next:
                result = "CORRECT!"
                score += 1
            else:
                result = "INCORRECT"

            total_predictions += 1
            round_predictions += 1

            # Show result
            print(f"{result} The next word is: \"{next_word}\"")

            # Reveal next word
            revealed_count += 1
            revealed = words[:revealed_count]
            print(f"Context now: \"{' '.join(revealed)}...\"")

            # If near end, show how many words remain
            remaining = len(words) - revealed_count
            if remaining <= 3:
                print(f"({remaining} words remaining in this sentence)")

        # Show complete sentence
        print(f"\nComplete sentence: \"{sentence}\"")
        print(f"You made {round_predictions} predictions in this round.")

    # Final score
    accuracy = (score / total_predictions) * 100 if total_predictions > 0 else 0
    print(f"\nGame Over! You got {score} out of {total_predictions} correct.")
    print(f"Accuracy: {accuracy:.1f}%")

    if accuracy > 20:
        print("Excellent! You're showing strong language prediction skills.")
    elif accuracy > 10:
        print("Good job! You're better than random guessing.")
    else:
        print("Keep practicing! Next-token prediction is challenging.")



In [None]:
# Run the game: download the book, pick suitable sentences, then play
print("Loading text...")
text = get_book_text()
sentences = get_good_sentences(text)
print(f"Found {len(sentences)} good sentences!")

# Start the game with 3 rounds (one sentence per round)
play_continuous_game(sentences, 3)


Loading text...
Found 372 good sentences!
Welcome to Continuous Next Token Prediction!
After each prediction, I'll show the correct word
and ask you to predict the next one.


--- Sentence 1/3 ---
Starting context: "There was nothing so _very_..."

Predict the next word:
> shocking
INCORRECT The next word is: "remarkable"
Context now: "There was nothing so _very_ remarkable..."

Predict the next word:
> about
INCORRECT The next word is: "in"
Context now: "There was nothing so _very_ remarkable in..."

Predict the next word:
> the
INCORRECT The next word is: "that;"
Context now: "There was nothing so _very_ remarkable in that;..."

Predict the next word:
> story
INCORRECT The next word is: "nor"
Context now: "There was nothing so _very_ remarkable in that; nor..."

Predict the next word:
> there
INCORRECT The next word is: "did"
Context now: "There was nothing so _very_ remarkable in that; nor did..."

Predict the next word:


KeyboardInterrupt: Interrupted by user

## 1. Testing tokenizers

In [None]:
from transformers import AutoTokenizer

In [None]:
# Candidate checkpoints whose tokenizers can be compared; change the name
# passed to from_pretrained below to try another one.
model1 = "deepseek-ai/DeepSeek-R1"
model2 = "microsoft/phi-4"
model3 = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model2)
# Number of tokens in this tokenizer's vocabulary
print(len(tokenizer))

100352


In [None]:
# Encode one long, unusual word to see how it is broken into several token IDs
tokenizer.encode("Supercallifragilisticexpialidocious")

[10254, 3035, 543, 333, 4193, 321, 4633, 4683, 532, 307, 78287]

[0, 28671, 26606, 394, 3174, 321, 435, 722, 66562, 536, 329, 95790]

In [None]:
# Decode a single token ID back into the piece of text it stands for
tokenizer.decode(10254)

'Sup'

In [None]:
sentence = "Modern AI Pro rock!!"
token_ids = tokenizer(sentence).input_ids
print("Encoded IDs:", token_ids)
print("Begin decoding")
# Decode one ID at a time to see which piece of text each token covers.
# The loop variable is deliberately not named `id`: at notebook top level
# that would shadow the builtin id() for every later cell.
for token_id in token_ids:
    print(token_id, tokenizer.decode(token_id))
# Note: some tokenizers (e.g. BERT-style ones) add special tokens such as
# [CLS], which stands for "classification"; if the loaded tokenizer adds any,
# they appear in the ID list printed by this cell.

Encoded IDs: [49552, 15592, 1322, 7091, 3001]
Begin decoding
49552 Modern
15592  AI
1322  Pro
7091  rock
3001 !!


## 2. Testing token prediction
Autoregressive generation

In [None]:
from transformers import AutoModelForCausalLM
# Checkpoint used for the next-token prediction demos in this section
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Rebinds `tokenizer` so that it matches the model loaded on the next line
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the causal language model and keep it on the CPU
model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
import torch
import torch.nn.functional as F
def predict_next_token(text, num_tokens=5, temperature=0):
    """Print the model's most likely next tokens for *text* and choose one.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Prompt whose continuation is predicted.
        num_tokens: How many of the most likely candidates to print. This
            only affects what is displayed, not which token can be chosen.
        temperature: Values above 0 divide the logits before the softmax
            (higher = flatter distribution = more random) and the next token
            is sampled from the whole vocabulary. With 0 (the default) or a
            negative value the single most likely token is chosen, so the
            result is deterministic.

    Returns:
        The decoded text of the chosen token.
    """
    # Convert text to model format
    tokens = tokenizer.encode(text, return_tensors="pt")

    # Get model prediction; this is inference only, so autograd bookkeeping
    # is switched off.
    with torch.no_grad():
        output = model(tokens)

    # Focus on the last position (next token)
    next_token_scores = output.logits[0, -1, :]

    # Apply temperature (higher = more random). Dividing by zero is not
    # possible, so a non-positive temperature means greedy selection instead.
    greedy = temperature <= 0
    if not greedy:
        next_token_scores = next_token_scores / temperature

    # Convert scores to probabilities (0-1)
    probabilities = F.softmax(next_token_scores, dim=-1)

    # Get top predictions
    top_probs, top_ids = torch.topk(probabilities, num_tokens)

    # Show results
    print(f"After '{text}', the model predicts:")
    print("-" * 80)
    print("Top tokens possible: ",top_ids)
    for i, (prob, token_id) in enumerate(zip(top_probs, top_ids)):
        token_text = tokenizer.decode(token_id)
        percentage = prob * 100
        print(f"{i+1}. '{token_text}',  - {percentage:.1f}%, id: {token_id}")

    if greedy:
        # Deterministic choice: the highest-probability token
        next_token_id = torch.argmax(probabilities).item()
    else:
        # Sample the next token based on the probability distribution
        next_token_id = torch.multinomial(probabilities, num_samples=1).item()
    next_token_text = tokenizer.decode(next_token_id)
    next_token_prob = probabilities[next_token_id].item() * 100

    print(f"FINAL PREDICTION: '{next_token_text}' ({next_token_prob:.1f}%)")
    return next_token_text

In [None]:
# Temperature 1.0 leaves the scores unchanged before the softmax, so the next
# token is sampled from the model's unmodified distribution
predict_next_token("Sky is", temperature=1.0)

After 'Sky is', the model predicts:
--------------------------------------------------------------------------------
Top tokens possible:  tensor([ 253,  260, 1129,  441,  702])
1. ' a',  - 13.3%, id: 253
2. ' the',  - 7.8%, id: 260
3. ' often',  - 2.5%, id: 1129
4. ' not',  - 2.4%, id: 441
5. ' like',  - 1.7%, id: 702
FINAL PREDICTION: ' crossing' (0.0%)


' crossing'

In [None]:
# Look up the text that a given token ID stands for in the current tokenizer
tokenizer.decode(971)

In [None]:
def _is_end_marker(token_text):
    """Return True if *token_text* is an end-of-sequence marker."""
    # The original hard-coded marker is still recognised. The loaded
    # tokenizer's own EOS string is accepted as well, because it can differ
    # from model to model.
    return token_text in ("<|endoftext|>", tokenizer.eos_token)


def generate_text(text, max_length=100, top_k=5,temperature=1):
    """Extend *text* one token at a time and print every step.

    Each step calls ``predict_next_token`` on the text produced so far and
    appends the returned token text (autoregressive generation).

    Args:
        text: Prompt to continue.
        max_length: Maximum number of tokens to generate.
        top_k: Number of candidate tokens ``predict_next_token`` prints at
            each step. It is passed on as that function's ``num_tokens``
            and does not restrict which token gets chosen.
        temperature: Passed on to ``predict_next_token``.

    Returns:
        The prompt followed by all generated token texts. If generation
        stopped on an end marker, the marker is part of the returned text.
    """
    # Keep track of original input
    original_text = text
    generated_tokens = 0

    # Generate tokens one by one (autoregressive)
    for step in range(1, max_length + 1):
        # Predict next token using our earlier function
        new_token = predict_next_token(text, top_k, temperature)

        text = text + new_token
        generated_tokens = step

        # Show progress
        print(f"Token {step}: '{new_token}'")
        print(f"Text so far: '{text}'")
        print("-" * 40)

        # Stop if we get an end marker
        if _is_end_marker(new_token):
            break

    print("\nFINAL RESULT:")
    print(f"Original prompt: '{original_text}'")
    print(f"Generated {generated_tokens} new tokens")
    print(f"Complete text: '{text}'")

    return text

In [None]:
# Generate up to 10 tokens, printing the 5 most likely candidates at each step
generate_text("The capital of Russia was",max_length=10, top_k=5, temperature = 1)

After 'The capital of Russia was', the model predicts:
--------------------------------------------------------------------------------
Top tokens possible:  tensor([7498, 3251,  260, 2268, 2837])
1. ' founded',  - 17.6%, id: 7498
2. ' established',  - 6.5%, id: 3251
3. ' the',  - 6.5%, id: 260
4. ' once',  - 4.8%, id: 2268
5. ' built',  - 4.1%, id: 2837
FINAL PREDICTION: ' formally' (0.5%)
Token 1: ' formally'
Text so far: 'The capital of Russia was formally'
----------------------------------------
After 'The capital of Russia was formally', the model predicts:
--------------------------------------------------------------------------------
Top tokens possible:  tensor([ 3251,  7498,  6572, 22184, 10877])
1. ' established',  - 28.0%, id: 3251
2. ' founded',  - 6.2%, id: 7498
3. ' opened',  - 6.1%, id: 6572
4. ' renamed',  - 5.6%, id: 22184
5. ' transferred',  - 5.6%, id: 10877
FINAL PREDICTION: ' constructed' (0.3%)
Token 2: ' constructed'
Text so far: 'The capital of Russia was form

'The capital of Russia was formally constructed by the Roman Emperor Trajan in '