In [4]:
from datasets import load_dataset

In [7]:
dataset = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    split="train[0:1000]"
)

Downloading data: 100%|██████████| 41/41 [1:45:10<00:00, 153.91s/files]
Generating train split: 100%|██████████| 6407814/6407814 [03:50<00:00, 27826.72 examples/s] 


In [8]:
print(dataset)

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 1000
})


In [9]:
print(dataset["text"][0])

Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).

Humans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement f

In [41]:
dataset = dataset.train_test_split(test_size=0.2)



In [42]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 800
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 200
    })
})

In [18]:
from transformers import AutoTokenizer
from huggingface_hub import login

login(token="token")
model_id="mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [22]:
tokenizer

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-v0.1', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [19]:
tokenizer.special_tokens_map

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}

In [20]:

# adding apd token to the tokenizer 
tokenizer.pad_token = "<pad>"

In [21]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>'}

In [43]:
out = tokenizer(
    dataset['train']['text'][:10]
)

In [35]:
# 10 differnt examples or pages
len(out['input_ids'][2])

8636

In [51]:
tokenizer.decode(out['input_ids'][-1])

'<s> Alfred Elton van Vogt  ( ; April 26, 1912\xa0– January 26, 2000) was a Canadian-born American science fiction author. His fragmented, bizarre narrative style influenced later science fiction writers, notably Philip K. Dick. He was one of the most popular and influential practitioners of science fiction in the mid-twentieth century, the genre\'s so-called Golden Age, and one of the most complex. The Science Fiction Writers of America named him their 14th Grand Master in 1995 (presented 1996).\n\nEarly life\nAlfred Vogt (both "Elton" and "van" were added much later) was born on April 26, 1912, on his grandparents\' farm in Edenburg, Manitoba, a tiny (and now defunct) Russian Mennonite community east of Gretna, Manitoba, Canada, in the Mennonite West Reserve.  He was the third of six children born to Heinrich "Henry" Vogt and Aganetha "Agnes" Vogt (née Buhr), both of whom were born in Manitoba and grew up in heavily immigrant communities.  Until he was four, van Vogt spoke only Plaut

In [44]:
def tokenizer_func(examples):
    return tokenizer(
        examples,
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt',#putorch tensor return type
        add_special_tokens=True
    )

tokenized_dataset = dataset.map(tokenizer_func, input_columns='text', remove_columns=['id', 'url', 'title', 'text'] ,batched=True)

Map: 100%|██████████| 800/800 [00:05<00:00, 153.04 examples/s]
Map: 100%|██████████| 200/200 [00:01<00:00, 163.11 examples/s]


In [50]:
len(tokenized_dataset['train']["attention_mask"][1])

512

In [52]:
tokenized_dataset['train']["input_ids"][-1]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [47]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [55]:
len(tokenized_dataset['train']['input_ids'])

512

In [48]:
tokenizer.pad_token_id # telling the algorithm not to include the pad tokens which is set to fulfil the size 512

0

In [60]:
from transformers import AutoModelForCausalLM, MistralConfig, MistralForCausalLM

config = MistralConfig()
config

MistralConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.52.4",
  "use_cache": true,
  "vocab_size": 32000
}

In [69]:
# smaller size model to run on local
config = MistralConfig(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=16,
    num_hidden_layers=4,
    max_position_embeddings=512, #  context size
    sliding_window=512

)

In [70]:
config

MistralConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "model_type": "mistral",
  "num_attention_heads": 16,
  "num_hidden_layers": 4,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 512,
  "tie_word_embeddings": false,
  "transformers_version": "4.52.4",
  "use_cache": true,
  "vocab_size": 32000
}

In [71]:
model = MistralForCausalLM(config=config)

In [72]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 768)
    (layers): ModuleList(
      (0-3): 4 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=384, bias=False)
          (v_proj): Linear(in_features=768, out_features=384, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
          (up_proj): Linear(in_features=768, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((768,), eps=1e-06)
        (post_attention_layernorm): MistralRMSNorm((768,), eps=1e-06)
      )
    )
    (norm): MistralRMSNorm((768,), eps=1e-06)
    (rotary_emb):

In [73]:
model_size = sum(t.numel() for t in model.parameters())

In [74]:
model_size # 84 million parameter

84548352

In [78]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [79]:
# data_collector(
#     tokenized_dataset['train']['input_ids'][0]
# )

In [83]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir='mistral-pretraining',
    num_train_epochs=1,
    push_to_hub=True,
    report_to=None,
)

trainer = Trainer( 
    model=model,
    #processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

In [84]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=100, training_loss=8.710733032226562, metrics={'train_runtime': 1802.6257, 'train_samples_per_second': 0.444, 'train_steps_per_second': 0.055, 'total_flos': 147388052275200.0, 'train_loss': 8.710733032226562, 'epoch': 1.0})