In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, LlamaConfig, TrainingArguments, Trainer
import torch
import shutil
from datasets import load_dataset
import torch.nn.functional as F
import os


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load TinyLlama as the teacher model
teacher_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model_name).to(device)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)


student_config = LlamaConfig(
    vocab_size=teacher_model.config.vocab_size,
    hidden_size=128,
    intermediate_size=256,
    num_hidden_layers=2,
    num_attention_heads=2,
    max_position_embeddings=512,
    rms_norm_eps=1e-6,
    tie_word_embeddings=True
)

student_model = LlamaForCausalLM(student_config).to(device)
student_tokenizer = teacher_tokenizer


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"🧠 Student model parameter count: {count_parameters(student_model):,}")


train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
eval_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")

def tokenize_function(examples):
    return student_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])


training_args = TrainingArguments(
    output_dir="custom-llm-student",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    logging_dir="./logs",
    fp16=True if torch.cuda.is_available() else False,
    run_name="custom_llm_training",
    save_total_limit=2,
    report_to="none"
)

# Distillation loss function
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    student_probs = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    return F.kl_div(student_probs, teacher_probs, reduction="batchmean")

# Trainer with knowledge distillation
class DistillationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {key: val.to(device) for key, val in inputs.items()}
        student_outputs = model(**inputs)
        with torch.no_grad():
            teacher_outputs = teacher_model(**inputs)
        loss = distillation_loss(student_outputs.logits, teacher_outputs.logits)
        return (loss, student_outputs) if return_outputs else loss

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

🧠 Student model parameter count: 4,424,320


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
0,368.4383,No log
1,293.0467,No log


TrainOutput(global_step=1146, training_loss=323.9746911812827, metrics={'train_runtime': 11391.7115, 'train_samples_per_second': 6.446, 'train_steps_per_second': 0.101, 'total_flos': 37003481579520.0, 'train_loss': 323.9746911812827, 'epoch': 1.9986928104575163})

In [2]:

output_dir = "custom-llm-student"
student_model.save_pretrained(output_dir)
student_tokenizer.save_pretrained(output_dir)
shutil.make_archive(output_dir, 'zip', output_dir)
print("✅ Training complete — model saved and zipped!")

NameError: name 'student_model' is not defined

In [None]:
def upload_folder_to_s3(local_folder: str, bucket_name: str, s3_prefix: str):
    s3 = boto3.client("s3")
    local_folder_path = Path(local_folder)
    for root, _, files in os.walk(local_folder_path):
        for file in files:
            local_path = Path(root) / file
            rel_path = local_path.relative_to(local_folder_path)
            s3_key = str(Path(s3_prefix) / rel_path)
            s3.upload_file(str(local_path), bucket_name, s3_key)
            logger.info(f"Uploaded {local_path} to s3://{bucket_name}/{s3_key}")