In [23]:
input_filename = "novi_podaci/post_naslovi.input"
dataset_filename = input_filename + '.prepared'
pretrained_model_name = 'gpt2'

In [26]:
import re
import alati

input_file = open(input_filename, "r")
output_file = open(dataset_filename, "w")

skip_filter = re.compile(r'(^|[^\w])(sunčano|kiša|oblačno|toplo|stepena|stepen|kišom|hladno|hladnije|stepeni|obilne padavine|toplije|sneg|ubije|ubil|poginu|silova|prebil|umrl|umro|preminu|mrtvi|dečak|devojčic|dete)', flags=re.I | re.M)

for line in input_file:
    line = alati.konvertuj_cirilicu(line)

    if skip_filter.search(line):
        continue

    line = line.replace('ž', 'ž').replace('š', 'š').replace('&quot;', '"').replace('&amp;', '&')
    line = re.sub(r'^(["”])*', '', line)
    line = re.sub(r'(["”])*$', '', line)
    line = line.replace('– ', ' ') \
        .replace('- ', ' ') \
        .replace(':', ' ') \
        .replace('​', ' ')

    line = line.replace('(KURIR TV)', '(VIDEO)').replace('KURIR TV', '')
    re.sub('za (kurir|blic)', 'za Fleš', line, flags=re.I)
    re.sub('uz (kurir|blic)', 'uz Fleš', line, flags=re.I)
    re.sub(' (kurir|blic) vam ', ' Fleš vam ', line, flags=re.I)
    re.sub('KURIR HOROSKOP', 'HOROSKOP', line, flags=re.I)

    output_file.write("<s>" + line.strip() + "</s>" + '\n')

input_file.close()
output_file.close()

In [2]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers import normalizers

tokenizer = ByteLevelBPETokenizer(lowercase=False)
tokenizer.normalizer = normalizers.Sequence([
    # normalizers.Strip(),
    normalizers.NFKC(),
])
tokenizer.train(
    files=[dataset_filename],
    min_frequency=10,
    vocab_size=25000,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
        "<|endoftext|>",
    ],
)
tokenizer.save('naslovi-tokenizer.json')

In [3]:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(tokenizer_file="naslovi-tokenizer.json")
tokenizer.pad_token = "<pad>"

In [5]:
from datasets import load_dataset
datasets = load_dataset('text', data_files=dataset_filename)
def prepare_for_trainer(example):
  tokenized = tokenizer(example['text'], truncation=True, padding='max_length', max_length=20)
  tokenized["labels"] = tokenized["input_ids"].copy()
  return tokenized
tokenized_datasets = datasets.map(prepare_for_trainer, num_proc=2, remove_columns=["text"])
print(tokenizer.model_max_length)

Using custom data configuration default-74504fb82070669d
Reusing dataset text (/Users/sterlu/.cache/huggingface/datasets/text/default-74504fb82070669d/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
Loading cached processed dataset at /Users/sterlu/.cache/huggingface/datasets/text/default-74504fb82070669d/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-36002bf1956e168e.arrow
Loading cached processed dataset at /Users/sterlu/.cache/huggingface/datasets/text/default-74504fb82070669d/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-41d9e73e1c8ec5f9.arrow


1000000000000000019884624838656


In [33]:
train, eval = load_dataset('text', data_files=dataset_filename, split=['train[:10%]', 'train[10%:]'])
train[0]

Using custom data configuration default-de76209308de1b76
Reusing dataset text (/Users/sterlu/.cache/huggingface/datasets/text/default-de76209308de1b76/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


{'text': '<s>Voditeljka godinama ćuti o ljubavnom životu, a NJEN BIVŠI MUŽ JE KUM LUNE I MARKA  Nisam rekla DA na venčanju</s>'}

In [8]:
from transformers import Trainer, AutoConfig, AutoModelForCausalLM, TrainingArguments

config = AutoConfig.from_pretrained(pretrained_model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)
# model.to('cuda:0')
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy='epoch',
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    # data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    # eval_dataset=train_dataset,
    # prediction_loss_only=True,
    tokenizer=tokenizer,
)

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /Users/sterlu/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_len

In [9]:
train_result = trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 706443
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 264918


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [73]:
model = AutoModelForCausalLM.from_pretrained('./test-clm')

loading configuration file ./test-clm/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.8.0",
  "use_cache": true,
  "vocab_size"

In [84]:
import numpy as np
tokenizer.pad_token_id = '1'
model.config.pad_token_id = 1
for temp in np.linspace(0.2, 2, 10):
    print()
    print("###### Temperature: " + str(temp))
    for i in range(0, 5):
        out = model.generate(do_sample=True, temperature=temp)[0]
        print(tokenizer.decode(out, skip_special_tokens=True))
# model.generate()




###### Temperature: 0.2
U BiH se ne zna da li će biti biti i to
U BiH se ne može da se ne zna da li će se ne može biti sa njom,
U BiH se ne znate, a onda se ne zna da se ne zna šta je sve
Na korona virus pozitivno još uvekmo u Srbiji
U BiH se ne zna da li će biti biti i dalje u Srbiji

###### Temperature: 0.4
U Srbiji 24 časa od korona virusa
U BiH se na čelu kriminom gradu, a evo šta je sve to znači
U BiH nije mogao da bude se vrati, a onda je ona i dalje nije mogao da je
U BiH se na ulice u Srbiji se pitaju je u centru grada, a onda je i to
Apel na Novom Beogradu  Ne želim da se ne može biti u Srbiji

###### Temperature: 0.6000000000000001
U Srpskoj i dalje u Srbiji, a evo šta kaže da je to NIJE OBRI
Mali  Povezna bi da bude u bolnici u Beogradu, ali se da li će biti
U Grčkoj još jedan dan u šoku, ali i u Srbiji ima samo spove Srbiji
Za vreme u Prištini da se okrem u Srbiji
U Srbiji 24 časa novozaraženihženih BiH

###### Temperature: 0.8
Ko je bio i u vezi sa dečkom (VIDEO)
Vučić  S