In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import tiktoken
import sys
from pathlib import Path
from torch.utils.data import IterableDataset, DataLoader
import matplotlib.pyplot as plt

# Your existing imports
from llm_from_scratch.GPT2Model.gpt2 import GPTModel
from llm_from_scratch.Trainer.trainer import train_model_simple
class StreamingDataset(IterableDataset):
    """Stream (input, target) next-token windows from a text file, line by line.

    The file is never loaded whole: each line is tokenized as it is read and
    appended to a rolling token buffer, from which fixed-size windows are cut.

    Args:
        file_path: Path of the UTF-8 text file to read.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of token ids.
        max_length: Number of tokens in each input window.
        stride: Number of tokens the window advances after each sample.
            Must be at least 1.
        split: ``"train"`` or ``"val"``. Any other value disables the filter
            and every line is used.
        train_ratio: Fraction of lines, counted within each run of 10 lines,
            that is assigned to the train split.

    Raises:
        ValueError: If ``stride`` is smaller than 1 (the window would never
            advance and iteration would not terminate).

    NOTE(review): there is no per-worker sharding here; with a DataLoader using
    num_workers > 0 each worker presumably replays the same stream - confirm
    before enabling workers.
    """

    def __init__(self, file_path, tokenizer, max_length, stride, split="train", train_ratio=0.9):
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.split = split
        self.train_ratio = train_ratio

    def __iter__(self):
        """Yield ``(x, y)`` tensor pairs where ``y`` is ``x`` shifted by one token."""
        # A sample needs max_length inputs plus one extra token for the shifted target.
        window = self.max_length + 1
        token_buffer = []
        # Open the file and read line by line (lazy loading)
        with open(self.file_path, "r", encoding="utf-8") as f:
            for line_idx, line in enumerate(f):
                # Lines are routed to train or val by their index modulo 10, so the
                # two splits are disjoint without loading the whole file.
                is_train = (line_idx % 10) < (self.train_ratio * 10)
                if (self.split == "train" and not is_train) or (self.split == "val" and is_train):
                    continue

                token_buffer.extend(
                    self.tokenizer.encode(line, allowed_special={"<|endoftext|>"})
                )

                # Walk an offset through the buffer instead of re-slicing the whole
                # buffer after every sample; the consumed prefix is dropped once below.
                start = 0
                while len(token_buffer) - start >= window:
                    chunk = token_buffer[start:start + window]
                    yield torch.tensor(chunk[:-1]), torch.tensor(chunk[1:])
                    start += self.stride

                # Keep only the tokens that no window has moved past yet.
                token_buffer = token_buffer[start:]

# SETUP

In [None]:
import torch
import tiktoken

# Fix imports: Add parent directory to Python path
import sys
from pathlib import Path
# sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import matplotlib.pyplot as plt
from llm_from_scratch.Dataset.loader import create_dataloader_v1
from llm_from_scratch.GPT2Model.gpt2 import GPTModel
from llm_from_scratch.Trainer.trainer import train_model_simple
from torch.utils.data import IterableDataset, DataLoader
# 1. SETUP & CONFIGURATION
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two candidate configurations are kept under their own names so that switching
# between them is a one-line change instead of one dict silently overwriting
# the other.
cfg_small = {
    # Model architecture
    "vocab_size": 50257,    # GPT-2 Vocabulary size
    "context_length": 1024,  # Maximum sequence length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of Transformer blocks
    "drop_rate": 0.1,       # Dropout percentage
    "qkv_bias": False,      # Query-Key-Value bias

    # Dataloader settings
    "batch_size": 3,        # Number of samples per batch
    "stride": 1024,         # Sliding window stride for creating samples
    "drop_last": False,     # Whether to drop incomplete batches

    # Training settings
    "train_ratio": 0.80,    # Train/validation split ratio
}

cfg_large = {
    # Model architecture
    "vocab_size": 50257,
    "context_length": 4096,  # 1024 * 4
    "emb_dim": 1024,         # Up from 768
    "n_heads": 16,           # Must divide emb_dim
    "n_layers": 24,          # Deeper stack to balance the wider embedding
    "drop_rate": 0.2,
    "qkv_bias": False,

    # Dataloader settings
    # NOTE(review): the original comment called this value "REDUCED", but 32 is
    # larger than the 3 used in cfg_small - confirm the intended batch size for
    # a 4096-token context.
    "batch_size": 32,
    "stride": 4096,          # Matches context_length (non-overlapping windows)
    "drop_last": True,

    # Training settings
    "train_ratio": 0.80,
    "use_flash": True,
}

# Active configuration (the later of the two original assignments).
cfg = cfg_large

# 2. DATA PREPARATION & SPLIT
# with open("the-verdict.txt", "r", encoding="utf-8") as f:
#     raw_text = f.read()
tokenizer = tiktoken.get_encoding("gpt2")
file_path = "training_corpus.txt" # Update this to your 414MB file name
# Create lazy loaders. train_ratio is passed explicitly so that the split
# follows cfg instead of falling back to StreamingDataset's own default.
train_ds = StreamingDataset(
    file_path, tokenizer, cfg["context_length"], cfg["stride"],
    split="train", train_ratio=cfg["train_ratio"],
)
val_ds = StreamingDataset(
    file_path, tokenizer, cfg["context_length"], cfg["stride"],
    split="val", train_ratio=cfg["train_ratio"],
)


In [None]:

# Split data into train and validation sets
# split_idx = int(cfg["train_ratio"] * len(raw_text))
# train_data = raw_text[:split_idx]
# val_data = raw_text[split_idx:]

# # Initialize Loaders using config parameters
# train_loader = create_dataloader_v1(
#     train_data, 
#     batch_size=cfg["batch_size"], 
#     max_length=cfg["context_length"], 
#     stride=cfg["stride"], 
#     drop_last=cfg["drop_last"]
# )
# val_loader = create_dataloader_v1(
#     val_data, 
#     batch_size=cfg["batch_size"], 
#     max_length=cfg["context_length"], 
#     stride=cfg["stride"], 
#     drop_last=cfg["drop_last"]
# )
# Batch the streaming datasets. drop_last is taken from cfg so the setting
# declared there is actually honoured.
train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], drop_last=cfg["drop_last"])
val_loader = DataLoader(val_ds, batch_size=cfg["batch_size"], drop_last=cfg["drop_last"])
# 3. INITIALIZE MODEL & OPTIMIZER
model = GPTModel(cfg).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00020, weight_decay=0.1)



In [5]:
# 4. GENERATION UTILITY     
def generate_text_simple(model, idx, max_new_tokens, context_size, temperature=1.0, top_k=None):
    """Autoregressively extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a ``(batch, seq)`` tensor of token ids to
            logits indexed as ``[batch, position, vocab]``.
        idx: Tensor of token ids with a leading batch dimension.
        max_new_tokens: Number of tokens to append to every row.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: Values above 0 divide the logits before sampling; any
            other value switches to greedy argmax decoding.
        top_k: If given, only the ``top_k`` highest logits per row stay
            eligible. Values larger than the vocabulary are clamped.

    Returns:
        ``idx`` with ``max_new_tokens`` extra columns appended.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the prediction at the final position is needed.
        logits = logits[:, -1, :]

        # 1. Apply Top-K filtering
        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep a trailing dimension of size 1 so the per-row threshold
            # broadcasts against (batch, vocab); a flat (batch,) tensor only
            # lines up when the batch size is 1.
            min_val = top_logits[:, -1:]
            logits = torch.where(
                logits < min_val,
                torch.tensor(float('-inf')).to(logits.device),
                logits
            )

        # 2. Apply Temperature scaling
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            # Sample from the distribution instead of taking argmax
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding when temperature is 0 (or negative)
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        idx = torch.cat((idx, idx_next), dim=1)
    return idx

# 5. THE "PLAY" (FORWARD PASS TEST)
print("--- Starting Forward Pass Test ---")
data_iter = iter(train_loader)
inputs, targets = next(data_iter)
inputs, targets = inputs.to(device), targets.to(device)

# Initial forward pass. This is a shape check only, so gradient tracking is
# turned off to avoid holding an autograd graph for the whole batch.
with torch.no_grad():
    logits = model(inputs)
print(f"Input Shape:  {inputs.shape}")  # [batch_size, context_length]
print(f"Logits Shape: {logits.shape}") # [batch_size, context_length, vocab_size]

# 6. INITIAL GENERATION (GIBBERISH CHECK)
print("\n--- Initial Generation (Untrained) ---")
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves"
encoded = tokenizer.encode(start_context)
idx = torch.tensor(encoded).unsqueeze(0).to(device) # Add batch dimension

model.eval() # Switch to eval mode (disable dropout)
out = generate_text_simple(model, idx, max_new_tokens=10, context_size=cfg["context_length"])
print(f"Input text: {start_context}")
print(f"Output:     {tokenizer.decode(out.squeeze(0).tolist())}")
model.train() # Switch back to training mode



--- Starting Forward Pass Test ---
Input Shape:  torch.Size([3, 1024])
Logits Shape: torch.Size([3, 1024, 50257])

--- Initial Generation (Untrained) ---
Input text: Every effort moves
Output:     Every effort moves Oscar Continuous007 irritatedado exhilar behavingleninterpret DE


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_feature

In [6]:
# 7. START TRAINING
print("\n--- Starting Training Loop ---")

# All arguments for the run are gathered in one mapping and then unpacked into
# the trainer; the values are the same as before.
# NOTE(review): the meaning of eval_freq / eval_iter is defined inside
# train_model_simple - confirm there before tuning them.
training_kwargs = dict(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=5,
    eval_freq=50,
    eval_iter=5,
    start_context=start_context,
    tokenizer=tokenizer,
)
train_losses, val_losses, tokens_seen = train_model_simple(**training_kwargs)

# --- 8. LOGGING: Plot the Training Progress ---
def plot_losses(train_losses, val_losses):
    """Show training and validation loss curves on a single figure.

    Args:
        train_losses: Sequence of training-loss values, one per recorded step.
        val_losses: Sequence of validation-loss values, one per recorded step.

    Each series is plotted against its own 0-based position in the sequence.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    for series, label in ((train_losses, "Train Loss"), (val_losses, "Val Loss")):
        ax.plot(range(len(series)), series, label=label)
    ax.set_xlabel("Evaluation Steps")
    ax.set_ylabel("Cross Entropy Loss")
    ax.set_title("LLM Training Progress")
    ax.legend()
    ax.grid(True)
    plt.show()

plot_losses(train_losses, val_losses)



--- Starting Training Loop ---
Ep 1 (Step 000000): Loss 9.939 | Val 10.002
Sample Generation: Every effort movesDown modularï¿½Topic Filip physiological noticerupted Schlpleasant Button Insighturbed Full out
Ep 1 (Step 000050): Loss 7.396 | Val 7.579
Sample Generation: Every effort moves can.The and Knowledge description originally Management living, weopy Pikachu of the
Ep 1 (Step 000100): Loss 7.316 | Val 7.468
Sample Generation: Every effort moves they at community abindids but addiction, Regional, they usoe forms


KeyboardInterrupt: 

In [None]:
# Save the model
# Only the state_dict (parameter tensors) is written, not the model object itself.
# The file name is relative, so it lands in the current working directory.
torch.save(model.state_dict(), "gpt2_model.pth")
print("Model weights saved to gpt2_model.pth")

# Save the optimizer (optional, but good for resuming training)
# Written to a separate file next to the weights; no confirmation is printed for it.
torch.save(optimizer.state_dict(), "optimizer.pth")

Model weights saved to gpt2_model.pth


In [19]:
# --- 10. FINAL TEST: Creative Generation ---
model.eval()

# A. Human-readable prompt that seeds the generation
start_context = "Who are you?"

# B. Prompt string -> list of token ids
encoded = tokenizer.encode(start_context)

# C. Wrap the ids in an outer list to get a batch of one, created on the target device
# encoded: [123, 456, 789] -> tensor([[123, 456, 789]])
idx = torch.tensor([encoded], device=device)

# Sample with temperature 0.8 and top_k=10
creative_out = generate_text_simple(
    model,
    idx,
    max_new_tokens=100,
    context_size=cfg["context_length"],
    temperature=0.8,
    top_k=10,
)

print("\n--- Post-Training Creative Sample ---")
sample_ids = creative_out[0].tolist()
print(tokenizer.decode(sample_ids))


--- Post-Training Creative Sample ---
Who are you?.
The artist explored the purpose.
She often wondered if the critic would ever examined the beauty.
The vivid landscape stretched before him, filled with landscapes.
The artist reflected through the the the studio, contemplating his inspiration.
In the quiet hours of morning, he would contemplated about purpose.
The artist wandered through the the the Paris.
Every week, they would gather to discuss creativity cannot be rushed.
The artist considered through the countryside, contemplating his ambition.
Throughout his


In [None]:
from datasets import load_dataset
import tqdm, os

# CONFIGURATION
# Total number of entries to write across both sources (currently 1,000,000;
# generate_corpus writes two entries per loop iteration).
num_samples = 1000000
output_file = "training_corpus.txt"
# Remove any corpus left over from a previous run. generate_corpus also opens
# the file in "w" mode, which truncates it anyway.
if os.path.exists(output_file):
    os.remove(output_file)
print("ðŸš€ Loading high-signal datasets (Streaming mode)...")

# 1. FineWeb-Edu: High-quality educational web pages
# streaming=True is requested so entries are iterated rather than loaded up front.
fw_edu = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)
# 2. Cosmopedia v2: Synthetic textbooks
cosmo = load_dataset("HuggingFaceTB/cosmopedia-v2", name="cosmopedia-v2", split="train", streaming=True)

def generate_corpus():
    """Write alternating FineWeb-Edu and Cosmopedia entries to ``output_file``.

    Reads the module-level ``fw_edu``, ``cosmo``, ``num_samples`` and
    ``output_file``. Each loop iteration writes one entry from each stream,
    every entry followed by an ``<|endoftext|>`` separator, and the loop stops
    once at least ``num_samples`` entries have been written or either stream
    runs out.
    """
    separator = "\n\n<|endoftext|>\n\n"
    written = 0
    with open(output_file, "w", encoding="utf-8") as corpus:
        # Pair the two streams so the sources alternate in the output
        paired_entries = zip(fw_edu, cosmo)
        for pair in tqdm.tqdm(paired_entries, total=num_samples//2):
            # FineWeb entry first, then the Cosmopedia entry
            for entry in pair:
                corpus.write(entry['text'].strip() + separator)

            written += 2
            if written >= num_samples:
                break

if __name__ == "__main__":
    generate_corpus()
    print(f"\nâœ… Done! Your combined dataset is ready at: {output_file}")

ðŸš€ Loading high-signal datasets (Streaming mode)...


 50%|â–ˆâ–ˆâ–ˆâ–ˆâ–‰     | 499999/1000000 [14:33<14:33, 572.71it/s]  


âœ… Done! Your combined dataset is ready at: training_corpus.txt





In [5]:
import os
import requests

# Configuration
# Base URL of the raw-file view of the Logic-LLM repository's data directory.
GITHUB_RAW_BASE = "https://raw.githubusercontent.com/teacherpeterpan/Logic-LLM/main/data"
# Maps each dataset folder name to the file names requested from that folder.
# NOTE(review): the recorded output of this cell shows 404 responses for
# ProntoQA/test.json, FOLIO/test.json and LogicalDeduction/test.json - confirm
# the actual file names in the repository.
DATASETS = {
    "ProntoQA": ["test.json"],
    "ProofWriter": ["test.json"],
    "FOLIO": ["test.json", "dev.json"],
    "LogicalDeduction": ["test.json"],
    "AR-LSAT": ["test.json"]
}

def download_datasets(timeout=30):
    """Download every file listed in ``DATASETS`` into ``logic_llm_data/<dataset>/``.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the cell
            indefinitely.

    A failed request (HTTP error status, connection problem, timeout) is
    reported with ``print`` and skipped; no file is written for it.
    """
    os.makedirs("logic_llm_data", exist_ok=True)

    for dataset, files in DATASETS.items():
        dataset_dir = os.path.join("logic_llm_data", dataset)
        os.makedirs(dataset_dir, exist_ok=True)

        for filename in files:
            # Construct the URL for the raw file on GitHub
            url = f"{GITHUB_RAW_BASE}/{dataset}/{filename}"
            print(f"Fetching {dataset}/{filename}...")

            try:
                response = requests.get(url, timeout=timeout)
                # Turn 4xx/5xx responses into exceptions so that an error page
                # is never saved as if it were the dataset.
                response.raise_for_status()

                file_path = os.path.join(dataset_dir, filename)
                with open(file_path, "wb") as f:
                    f.write(response.content)
                print(f"Successfully saved to {file_path}")

            except requests.exceptions.RequestException as e:
                print(f"Failed to download {dataset}: {e}")

if __name__ == "__main__":
    download_datasets()

Fetching ProntoQA/test.json...
Failed to download ProntoQA: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/teacherpeterpan/Logic-LLM/main/data/ProntoQA/test.json
Fetching ProofWriter/test.json...
Successfully saved to logic_llm_data\ProofWriter\test.json
Fetching FOLIO/test.json...
Failed to download FOLIO: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/teacherpeterpan/Logic-LLM/main/data/FOLIO/test.json
Fetching FOLIO/dev.json...
Successfully saved to logic_llm_data\FOLIO\dev.json
Fetching LogicalDeduction/test.json...
Failed to download LogicalDeduction: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/teacherpeterpan/Logic-LLM/main/data/LogicalDeduction/test.json
Fetching AR-LSAT/test.json...
Successfully saved to logic_llm_data\AR-LSAT\test.json


In [None]:
import json
import os
import tqdm
import requests
from datasets import load_dataset

# CONFIGURATION
# Number of general-knowledge entries after which corpus generation stops.
num_samples = 1000000
# Destination text file for the generated corpus (relative to the working directory).
output_file = "training_corpus.txt"
# Root folder for the downloaded Logic-LLM JSON files; it is also the folder
# walked later to collect every *.json file into the logic pool.
logic_data_path = "./logic_llm_data"

# 1. DOWNLOAD LOGIC-LLM DATASETS
def download_logic_llm(timeout=30):
    """Fetch the Logic-LLM JSON files from GitHub into ``logic_data_path``.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Files that already exist locally are not downloaded again. A request that
    fails (HTTP error status, connection problem, timeout) is reported and
    skipped, so an error page is never written to disk under a ``.json`` name.
    """
    base_url = "https://raw.githubusercontent.com/teacherpeterpan/Logic-LLM/main/data"
    datasets = {
        "ProntoQA": ["test.json"],
        "ProofWriter": ["test.json"],
        "FOLIO": ["test.json", "dev.json"],
        "LogicalDeduction": ["test.json"],
        "AR-LSAT": ["test.json"]
    }

    print("ðŸ“¥ Downloading Logic-LLM datasets from GitHub...")
    for folder, files in datasets.items():
        folder_path = os.path.join(logic_data_path, folder)
        os.makedirs(folder_path, exist_ok=True)
        for file in files:
            target_path = os.path.join(folder_path, file)
            if os.path.exists(target_path):
                continue

            url = f"{base_url}/{folder}/{file}"
            try:
                r = requests.get(url, timeout=timeout)
                # Without this check a 404 body would be saved as the dataset
                # and break JSON parsing later on.
                r.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Failed to download {folder}/{file}: {e}")
                continue

            with open(target_path, "wb") as f:
                f.write(r.content)
    print("âœ… Logic-LLM download complete.")

# 2. HELPER: Format JSON to text
def format_logic_entry(dataset_name, data):
    """Render one logic-dataset record as a plain-text training sample.

    Args:
        dataset_name: Name of the dataset the record came from. ``"FOLIO"``
            and ``"ProntoQA"`` have dedicated layouts; any other name uses the
            generic layouts.
        data: The record, expected to be a mapping of field name to value.

    Returns:
        The formatted text, or ``None`` when the record cannot be formatted:
        a required key is missing, ``data`` is not a mapping, or none of the
        generic context/question/answer fields is present.
    """
    try:
        if dataset_name == "FOLIO":
            return f"Context: {data['premises']}\nQuestion: {data['conclusion']}\nAnswer: {data['label']}"
        if dataset_name == "ProntoQA":
            return f"Facts: {data['context']}\nQuery: {data['query']}\nReasoning: {data['chain_of_thought']}\nAnswer: {data['answer']}"
        if "question" in data and "answer" in data: # GSM8K / General style
            return f"Question: {data['question']}\nThought: {data.get('thought', 'Thinking...')}\nAnswer: {data['answer']}"

        # Generic fallback: try the alternative field names one after another.
        ctx = data.get('context', data.get('premises', ''))
        q = data.get('query', data.get('question', ''))
        a = data.get('answer', data.get('output', ''))
        if ctx == '' and q == '' and a == '':
            # Nothing usable in the record; an empty template would only add
            # noise to the corpus.
            return None
        return f"Problem: {ctx}\nQuestion: {q}\nAnswer: {a}"
    except (KeyError, TypeError, AttributeError):
        # Missing key, or a record that is not a mapping. Narrower than a bare
        # except so that KeyboardInterrupt and SystemExit still propagate.
        return None

# 3. PREPARE ALL DATA SOURCES
# Start from a clean output file on every run.
if os.path.exists(output_file):
    os.remove(output_file)

download_logic_llm()

print("ðŸš€ Loading general knowledge streams...")
fw_edu = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)
cosmo = load_dataset("HuggingFaceTB/cosmopedia-v2", name="cosmopedia-v2", split="train", streaming=True)

# Additional "Other" High-Signal Logic Sources
print("ðŸš€ Loading supplementary logic/math datasets...")
platypus = load_dataset("garage-bAInd/Open-Platypus", split="train") # Reasoning/STEM
gsm8k = load_dataset("openai/gsm8k", "main", split="train")           # Math logic

# Load everything into the logic pool
logic_pool = []

# Add Local Logic-LLM
for root, dirs, files in os.walk(logic_data_path):
    for file in files:
        if not file.endswith(".json"):
            continue

        json_path = os.path.join(root, file)
        try:
            # Explicit UTF-8: the platform default codec (e.g. cp1252 on
            # Windows) can fail on, or silently garble, non-ASCII JSON text.
            with open(json_path, 'r', encoding="utf-8") as f:
                content = json.load(f)
        except ValueError as e:
            # Covers json.JSONDecodeError and UnicodeDecodeError, e.g. a saved
            # error page from a failed download. Skip it rather than abort.
            print(f"Skipping unreadable logic file {json_path}: {e}")
            continue

        if isinstance(content, list):
            examples = content
        elif isinstance(content, dict):
            examples = content.values()
        else:
            # A bare scalar at the top level holds no records.
            examples = []

        for ex in examples:
            fmt = format_logic_entry(os.path.basename(root), ex)
            if fmt:
                logic_pool.append(fmt)

# Add Platypus & GSM8K
for ex in platypus:
    logic_pool.append(f"Instruction: {ex['instruction']}\nInput: {ex['input']}\nOutput: {ex['output']}")
for ex in gsm8k:
    logic_pool.append(f"Math Problem: {ex['question']}\nSolution: {ex['answer']}")

print(f"ðŸ§  Total Logic Pool Size: {len(logic_pool)} items.")

# 4. GENERATE THE CORPUS
def generate_corpus():
    """Write the training corpus: general-knowledge entries with logic tasks mixed in.

    Reads the module-level ``fw_edu``, ``cosmo``, ``logic_pool``,
    ``num_samples`` and ``output_file``. Each loop iteration writes one
    FineWeb-Edu and one Cosmopedia entry. On every fifth iteration, starting
    with the first, five items from ``logic_pool`` are appended, cycling
    through the pool. The loop stops once at least ``num_samples`` general
    entries have been written or either stream runs out.

    If ``logic_pool`` is empty the reasoning tasks are skipped instead of
    failing with a division by zero.
    """
    count = 0
    logic_idx = 0
    with open(output_file, "w", encoding="utf-8") as f:
        # Step through the general streams
        for fw_entry, cosmo_entry in tqdm.tqdm(zip(fw_edu, cosmo), total=num_samples // 2):
            # Write General Knowledge
            f.write(fw_entry['text'].strip() + "\n\n<|endoftext|>\n\n")
            f.write(cosmo_entry['text'].strip() + "\n\n<|endoftext|>\n\n")

            # Every 10 general entries, insert 5 Logic/Reasoning pieces (upsampling).
            # count advances by 2 per iteration, so this fires on every 5th iteration.
            if logic_pool and count % 10 == 0:
                for _ in range(5):
                    puzzle = logic_pool[logic_idx % len(logic_pool)]
                    f.write(f"### REASONING TASK ###\n{puzzle}\n\n<|endoftext|>\n\n")
                    logic_idx += 1

            count += 2
            if count >= num_samples:
                break

if __name__ == "__main__":
    generate_corpus()
    print(f"\nâœ… Training corpus built with general knowledge + logic datasets: {output_file}")