In [None]:
import torch
from datasets import load_dataset

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

In [None]:
# Checkpoint identifier handed to `from_pretrained` when the base model is loaded.
model_name = "mistralai/Mistral-7B-v0.1"

In [None]:

# Load the tokenizer: the tokenization, collation and saving cells all use the
# module-level name `tokenizer`, so it has to exist before they run.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenization cell pads to a fixed length (`padding="max_length"`), which
# raises when the tokenizer has no padding token. Fall back to the
# end-of-sequence token only if no padding token is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the trainable weights in float32. The training cell enables fp16 mixed
# precision (`fp16=True`), and its gradient scaler refuses to unscale gradients
# of weights that were loaded directly in float16
# ("Attempting to unscale FP16 gradients").
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto"
)

In [None]:
# Build the train/validation splits from two local text files using the
# "text" dataset builder.
dataset = load_dataset(
    path="text",
    data_files=dict(train="train.txt", validation="valid.txt"),
)

In [None]:
def tokenize_fn(batch, max_length=512):
    """Tokenize a batch of raw text into fixed-length encodings.

    Args:
        batch: Mapping with a "text" entry holding the strings to encode.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

In [None]:
# Apply `tokenize_fn` in batches and drop the raw "text" column from the result.
tokenized_ds = dataset.map(function=tokenize_fn, remove_columns=["text"], batched=True)

In [None]:
# Batch collator with the masked-LM objective switched off (`mlm=False`).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# `evaluation_strategy` was deprecated in transformers 4.41 in favour of
# `eval_strategy`, and newer releases reject the old keyword. Use whichever
# name the installed version declares so this cell runs on both sides of the
# rename.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./mistral-finetuned",
    num_train_epochs=3,
    # One sequence per device per step, accumulated over 8 steps before each
    # optimizer update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints.
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=50,
    # fp16 mixed precision: the gradient scaler expects the model's trainable
    # weights to be float32, not weights loaded directly in float16.
    fp16=True,
    learning_rate=2e-5,
    warmup_steps=100,
    report_to="none"
)

In [None]:
# Wire the model, hyperparameters, both dataset splits and the collator
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_ds["validation"],
    train_dataset=tokenized_ds["train"],
)


In [None]:
# Start training with the model, datasets and arguments given to the Trainer.
trainer.train()

In [None]:

# Save the model and the tokenizer into the same directory.
model.save_pretrained(save_directory="mistral-finetuned")
tokenizer.save_pretrained(save_directory="mistral-finetuned")