In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fsspec[http]<=2024.12.0,>=2023.1.0
  Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━

In [1]:
from transformers import AutoTokenizer

# Sanity check: load the tokenizer for the Qwen2.5-Math-1.5B checkpoint and
# print the model_max_length it reports.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-Math-1.5B",
)
max_context_length = tokenizer.model_max_length
print("Maximum context length:", max_context_length)

tokenizer_config.json:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Maximum context length: 131072


In [None]:
import os
import math
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, Dataset

# Make sure to have transformers>=4.37.0 and datasets installed
# pip install transformers datasets

# Tokens per pretraining chunk (8K); lower this if memory is tight.
chunk_size = 8192
model_name = "Qwen/Qwen2.5-Math-1.5B"

# 1. Read the whole training corpus from the Kaggle input path
with open('/kaggle/input/nbody-data/cleaned.md', encoding='utf-8') as f:
    corpus = f.read()

# 2. Load the tokenizer and print the model_max_length it reports
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_context_length = tokenizer.model_max_length
print(f"Maximum context length: {max_context_length}")

# 3. Tokenize the entire corpus and split it into reasonably sized chunks
def prepare_corpus_for_training(corpus, tokenizer, chunk_size):
    """Tokenize ``corpus`` and cut it into fixed-length training examples.

    The whole text is tokenized in one pass (no truncation) and sliced into
    consecutive windows of ``chunk_size`` token ids. The final window is
    right-padded up to ``chunk_size``. Every example also carries an
    ``attention_mask`` (1 = real token, 0 = padding) so padded positions can
    be ignored by the model.

    Args:
        corpus: Full training text as a single string.
        tokenizer: Callable that, given the text with ``return_tensors="np"``,
            returns a mapping whose ``"input_ids"`` entry holds the token ids
            in its first row. Must expose ``pad_token_id`` and/or
            ``eos_token_id`` when padding is required.
        chunk_size: Number of token ids per example; must be positive.

    Returns:
        The result of ``Dataset.from_list`` over one row per chunk, each row
        having the keys ``input_ids`` and ``attention_mask``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the last chunk
            needs padding but the tokenizer defines neither a pad token id
            nor an EOS token id.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Tokenize the entire corpus without truncation and take the first row
    # of the returned batch as a plain Python list of ints.
    encoded = tokenizer(corpus, truncation=False, return_tensors="np")
    token_ids = encoded["input_ids"][0].tolist()

    total_tokens = len(token_ids)
    total_chunks = math.ceil(total_tokens / chunk_size)
    print(f"Total tokens: {total_tokens}, Creating {total_chunks} chunks of size {chunk_size}")

    # Some tokenizers ship without a pad token; fall back to EOS so the last
    # chunk is never padded with None.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = getattr(tokenizer, "eos_token_id", None)

    chunks = []
    for start in range(0, total_tokens, chunk_size):
        window = token_ids[start:start + chunk_size]
        real_count = len(window)
        pad_count = chunk_size - real_count
        if pad_count:  # only the final window can be short
            if pad_id is None:
                raise ValueError(
                    "The last chunk needs padding but the tokenizer has neither "
                    "pad_token_id nor eos_token_id"
                )
            window = window + [pad_id] * pad_count
        chunks.append(
            {
                "input_ids": window,
                "attention_mask": [1] * real_count + [0] * pad_count,
            }
        )

    return Dataset.from_list(chunks)

# Turn the raw corpus into fixed-length token chunks for training
chunked_dataset = prepare_corpus_for_training(
    corpus=corpus,
    tokenizer=tokenizer,
    chunk_size=chunk_size,
)

# 4. Build the language-modeling collator (mlm disabled); batches are padded
#    to a multiple of 8
data_collator = DataCollatorForLanguageModeling(
    pad_to_multiple_of=8,
    mlm=False,
    tokenizer=tokenizer,
)

# 5. Define training arguments for the TPU run
training_args = TrainingArguments(
    output_dir="./qwen_math_nbody_model",
    overwrite_output_dir=True,
    num_train_epochs=3,                     # More epochs for better learning
    per_device_train_batch_size=4,          # Reduced to accommodate larger chunks
    gradient_accumulation_steps=8,          # Increased for effective batch size
    learning_rate=1e-5,                     # Lower learning rate for stable training
    weight_decay=0.01,
    logging_steps=25,                       # More frequent logging
    save_steps=100,                         # More frequent saving
    save_total_limit=5,                     # Keep more checkpoints
    tpu_num_cores=8,                        # Using all cores of TPU v3-8
    dataloader_drop_last=True,              # Avoids issues with last incomplete batch
    report_to=["tensorboard"],
    # fp16 mixed precision (AMP) targets CUDA-class devices; TPUs use
    # bfloat16, so request bf16 mixed precision instead.
    bf16=True,
)

# 6. Load the pretrained model. No device_map is passed: the Trainer places
#    the model on the training device itself, and device_map="auto" is
#    Accelerate's automatic dispatch, which does not target TPU/XLA devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=False,  # Important for training efficiency
)

# 7. Wire the training arguments, model, collator and chunked dataset into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=chunked_dataset,
)

# 8. Run pretraining. On a failure or a manual interrupt, try to save an
#    emergency checkpoint and then re-raise so the cell visibly fails with
#    the original traceback instead of looking like a successful run.
try:
    trainer.train()
except (Exception, KeyboardInterrupt) as e:
    print(f"Training interrupted: {e}")
    try:
        # Save checkpoint even if interrupted
        trainer.save_model("./qwen_math_nbody_checkpoint")
        print("Emergency checkpoint saved")
    except Exception as save_error:
        # Do not let a failed emergency save mask the original error.
        print(f"Emergency checkpoint could not be saved: {save_error}")
    raise
else:
    # Only reached when training finished without raising; a failure while
    # saving here propagates as-is rather than being reported as an interrupt.
    trainer.save_model("./qwen_math_nbody_final")
    print("Training completed successfully!")
