In [2]:
#!/usr/bin/env python3
"""
ALTERNATIVE COLAB 5: Continued Pretraining on Wikipedia
Uses: Wikipedia articles
Model: SmolLM-135M
Approach: Language modeling on new domain text
"""

print("="*80)
print("🚀 ALTERNATIVE COLAB 5: CONTINUED PRETRAINING - WIKIPEDIA")
print("="*80)

# ============================================================================
# INSTALL
# ============================================================================
print("\n📦 Installing...")
import subprocess
subprocess.run("pip install -q transformers datasets accelerate trl peft bitsandbytes", shell=True)
print("✅ Done!")

# ============================================================================
# IMPORTS
# ============================================================================
print("\n📚 Importing...")
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import torch

print("✅ Imported!")
print(f"GPU: {torch.cuda.is_available()}")

# ============================================================================
# LOAD
# ============================================================================
print("\n📥 Loading...")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM-135M",
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
tokenizer.pad_token = tokenizer.eos_token
print("✅ Loaded!")

# ============================================================================
# LORA
# ============================================================================
print("\n🔧 LoRA...")
lora_config = LoraConfig(
    r=16, lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("✅ Done!")

# ============================================================================
# LOAD TEXT DATASET - USING SIMPLER DATASET
# ============================================================================
print("\n📚 Loading text corpus for continued pretraining...")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1000]")
print(f"✅ Loaded {len(dataset)} text examples!")

# ============================================================================
# FORMAT
# ============================================================================
print("\n🔧 Formatting...")
def format_wiki(examples):
    texts = []
    for i in range(len(examples['text'])):
        # Take first 500 chars of each article
        text = examples['text'][i][:500]
        if text.strip():  # Only if not empty
            texts.append(text + tokenizer.eos_token)
    return {"text": texts}

formatted = format_wiki(dataset)
# Filter out empty texts
formatted_texts = [t for t in formatted["text"] if len(t) > 50]
train_dataset = Dataset.from_dict({"text": formatted_texts[:400]})
print(f"✅ Formatted {len(train_dataset)} articles!")

# ============================================================================
# TRAIN
# ============================================================================
print("\n🚀 Continued pretraining on Wikipedia...")
training_args = TrainingArguments(
    output_dir="./wiki_pretrain",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=5,
    logging_steps=10,
    save_strategy="no",
    fp16=True,
    report_to="none",
    max_steps=40,
)

trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
print("\n✅ Pretraining complete!")

# ============================================================================
# TEST
# ============================================================================
print("\n🧪 Testing knowledge...")
model.eval()

prompts = [
    "The theory of relativity was developed by ",
    "Photosynthesis is the process by which ",
    "The largest planet in our solar system is ",
]

for p in prompts:
    print(f"\nPrompt: {p}")
    print("Continuation:", end=" ")
    inp = tokenizer(p, return_tensors="pt").to(model.device)
    out = model.generate(**inp, max_new_tokens=40, temperature=0.7,
                         pad_token_id=tokenizer.eos_token_id)
    continuation = tokenizer.decode(out[0], skip_special_tokens=True)[len(p):].strip()
    print(continuation)

# ============================================================================
# SAVE
# ============================================================================
print("\n💾 Saving...")
model.save_pretrained("./wiki_pretrained_model")
tokenizer.save_pretrained("./wiki_pretrained_model")

print("\n" + "="*80)
print("🎉 ALTERNATIVE COLAB 5 COMPLETE!")
print("="*80)
print("Summary:")
print("  ✓ Model: SmolLM-135M")
print("  ✓ Dataset: Wikipedia (science articles)")
print("  ✓ Method: Continued pretraining")
print("  ✓ Purpose: Learn new factual knowledge")
print("="*80)
print("\n🎊 ALL 5 ALTERNATIVE COLABS COMPLETE! 🎊")
print("="*80)

🚀 ALTERNATIVE COLAB 5: CONTINUED PRETRAINING - WIKIPEDIA

📦 Installing...
✅ Done!

📚 Importing...
✅ Imported!
GPU: True

📥 Loading...
✅ Loaded!

🔧 LoRA...
trainable params: 1,843,200 || all params: 136,358,208 || trainable%: 1.3517
✅ Done!

📚 Loading text corpus for continued pretraining...


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

✅ Loaded 1000 text examples!

🔧 Formatting...
✅ Formatted 400 articles!

🚀 Continued pretraining on Wikipedia...


Adding EOS to train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
10,3.8005
20,3.8167
30,3.5727
40,3.7216


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



✅ Pretraining complete!

🧪 Testing knowledge...

Prompt: The theory of relativity was developed by 
Continuation: 1905, and Einstein was the first to use it to explain the motion of light.
The theory of relativity is based on the idea that the speed of light is constant, regardless of

Prompt: Photosynthesis is the process by which 
Continuation: 2 carbon atoms combine with 1 oxygen atom to form a carbon dioxide molecule.
What is the process of photosynthesis?
Photosynthesis is the process by which plants and other organisms use light energy to

Prompt: The largest planet in our solar system is 
Continuation: 100 times the mass of Earth. It is also the largest planet in the solar system. It is the third largest planet in the solar system. It is the third largest planet in the solar

💾 Saving...

🎉 ALTERNATIVE COLAB 5 COMPLETE!
Summary:
  ✓ Model: SmolLM-135M
  ✓ Dataset: Wikipedia (science articles)
  ✓ Method: Continued pretraining
  ✓ Purpose: Learn new factual knowledge

🎊 ALL 5 AL