In [1]:
import torch

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
device = "cuda"

In [4]:
model_name = "HuggingFaceTB/SmolLM-135M"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [7]:
eos_string = tokenizer.decode([tokenizer.eos_token_id])

In [8]:
eos_string

'<|endoftext|>'

In [9]:
import json

In [10]:
from datasets import Dataset

In [11]:
with open('sentences.json', 'r', encoding='utf-8') as f:
    text_for_ai = json.load(f)

In [12]:
dataset = Dataset.from_dict({"text":text_for_ai})

In [13]:
dataset.column_names

['text']

In [14]:
dataset["text"][0]

'I read our tree\n {"I": "read", "our": "tree"}<|endoftext|>'

In [15]:
tokenizer.pad_token = tokenizer.eos_token

In [16]:
def tokenize_function(examples):
    return tokenizer(examples["text"],truncation=True,padding="max_length",max_length=30,return_tensors="pt")

In [17]:
tokenized_dataset = dataset.map(tokenize_function,batched=True,remove_columns=dataset.column_names)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [18]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 20000
})

In [19]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.05)

In [20]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 19000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [21]:
from transformers import DataCollatorForLanguageModeling

In [22]:
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [23]:
out = data_collator([tokenized_dataset["train"][i] for i in range(5)])

In [24]:
for key in out:
    print(f"{key} shape:{out[key].shape}")

input_ids shape:torch.Size([5, 30])
attention_mask shape:torch.Size([5, 30])
labels shape:torch.Size([5, 30])


In [25]:
class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def __call__(self,examples):
        batch = super().__call__(examples)
        labels = batch['labels']
        eos_token_id = self.tokenizer.eos_token_id
        labels[labels == -100] = eos_token_id
        batch['labels'] = labels
        return batch

In [26]:
data_collator = CustomDataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [27]:
out = data_collator([tokenized_dataset["train"][i] for i in range(5)])

In [28]:
out["labels"][0]

tensor([11518, 10897,   957,  5460,   198,  9583, 11518,  1799,   476, 48112,
         1002,   476,  2334,  1799,   476,  6733, 23597,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [29]:
from transformers import Trainer, TrainingArguments

In [60]:
args = TrainingArguments(
    output_dir="SmolLM",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    eval_steps=250,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    weight_decay=0.1,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    learning_rate=5e-8,
    save_steps=500,
    fp16=True,
    push_to_hub=False,
)

In [61]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [62]:
trainer.train()

Step,Training Loss,Validation Loss
250,No log,
500,0.000000,
750,0.000000,
1000,0.000000,
1250,0.000000,
1500,0.000000,
1750,0.000000,
2000,0.000000,
2250,0.000000,


TrainOutput(global_step=2374, training_loss=0.0, metrics={'train_runtime': 10993.0941, 'train_samples_per_second': 3.457, 'train_steps_per_second': 0.216, 'total_flos': 725972840110080.0, 'train_loss': 0.0, 'epoch': 1.9987368421052631})

In [33]:
trained_model = trainer.model
prompt = "Small models are great.\n"
input_ids = tokenizer.encode(prompt,return_tensors="pt",add_special_tokens=False).to(device)
generated_ids = trained_model.generate(
    input_ids,
    max_new_tokens=30,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
generated_text=tokenizer.decode(generated_ids[0],skip_special_tokens=True)
print(generated_text)

trained_model.save_pretrained("SmolLM-135M-fine-tuned")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Small models are great.
- The best models are those that are easy to use and that are easy to maintain.
- The best models are those that are easy to use
