In [2]:
# !pip install transformers
# !pip install datasets
# !pip install nltk

In [3]:
from datasets import get_dataset_config_names
from datasets import load_dataset

In [4]:
# Train split of the "unshuffled_original_ka" configuration of the OSCAR
# dataset; the first call downloads and caches it (see the log output below).
georgian_oscar = load_dataset(
    path="oscar",
    name="unshuffled_original_ka",
    split="train",
)

Downloading builder script:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/359k [00:00<?, ?B/s]

Downloading and preparing dataset oscar/unshuffled_original_ka (download: 649.20 MiB, generated: 3.51 GiB, post-processed: Unknown size, total: 4.14 GiB) to /home/jupyter/.cache/huggingface/datasets/oscar/unshuffled_original_ka/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2...


Downloading data:   0%|          | 0.00/164 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/352M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/328M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/563916 [00:00<?, ? examples/s]

Dataset oscar downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/oscar/unshuffled_original_ka/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2. Subsequent calls will reuse this data.


In [5]:
# Pull the "text" column out of the dataset. The next cells iterate over it
# and call str.split on each element, so each entry is treated as one document string.
sents = georgian_oscar['text']

In [9]:
import nltk
# Fetch the "punkt" tokenizer package (used by nltk.sent_tokenize below).
# The call returns True, which is why the cell output ends with "True".
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk import sent_tokenize
# Flatten the corpus into one list of sentences: every document is cut at
# newlines first, then each line is handed to sent_tokenize.
all_sentences = [
    sentence
    for document in sents
    for line in document.split('\n')
    for sentence in sent_tokenize(line)
]
len(all_sentences)

11511987

In [11]:
import re

# Matches a whole space-delimited token that contains at least one character
# outside the listed Georgian letters, ASCII digits and the space character.
pattern = r'[^ ]*[^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ0123456789 ]+[^ ]*'
new_sentences = []

# Raw strings avoid the invalid-escape warnings that '\S', '\s' and '\]'
# trigger in ordinary string literals; the patterns are compiled once.
_HASHTAG_RE = re.compile(r'#\S+')
# NOTE(review): '?' is not in this punctuation set, so a token such as "word?"
# is dropped whole by the last step instead of being trimmed — confirm intent.
_PUNCT_RE = re.compile('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>@[\]^_`{|}~"""))
_WHITESPACE_RE = re.compile(r'\s+')
_NON_GEORGIAN_RE = re.compile(pattern)


def clean(x):
    """Normalize one sentence so that only Georgian/digit tokens remain.

    Steps, in order:
      1. delete hashtags (``#`` followed by non-whitespace);
      2. replace the listed punctuation characters with a space;
      3. collapse every whitespace run to a single space, so the token
         pattern (which only understands ' ' as a separator) works;
      4. delete every token containing a character outside the Georgian
         alphabet / ASCII digits;
      5. collapse the gaps left by step 4 and trim both ends.

    Args:
        x: the raw sentence (str).

    Returns:
        The cleaned sentence, with single spaces between tokens and no
        leading or trailing whitespace. May be the empty string.
    """
    x = _HASHTAG_RE.sub('', x)
    x = _PUNCT_RE.sub(' ', x)
    x = _WHITESPACE_RE.sub(' ', x)
    x = _NON_GEORGIAN_RE.sub('', x)
    # Removing tokens leaves doubled/edge spaces behind; tidy them up here.
    return _WHITESPACE_RE.sub(' ', x).strip()
# Run clean() over every sentence, reporting progress once per million items,
# then rebind all_sentences to the cleaned list.
for position, raw_sentence in enumerate(all_sentences):
    if position % 1000000 == 0:
        print(position)
    new_sentences.append(clean(raw_sentence))
all_sentences = new_sentences

0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000


In [12]:
# Delete every run of 26 or more non-space characters. The quantifier is
# open-ended: with a fixed upper bound (e.g. 9999) a longer token would be
# matched only up to that bound and its remaining tail would stay in the text.
long_word_pattern = '[^ ]{26,}'
j = 0  # how many sentences had at least one over-long token removed
for i, pre in enumerate(all_sentences):
    if i % 1000000 == 0:
        print(i)
    # subn returns the new string together with the number of substitutions,
    # so no before/after string comparison is needed.
    all_sentences[i], removed = re.subn(long_word_pattern, "", pre)
    if removed:
        j += 1

0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000


In [13]:
def filter_func(x):
    """Return True when ``x`` has between 5 and 500 whitespace-separated tokens (inclusive)."""
    word_count = len(x.split())
    return 5 <= word_count <= 500
# Report the sentence count before and after dropping sentences that
# filter_func rejects.
print(len(all_sentences))
all_sentences = [sentence for sentence in all_sentences if filter_func(sentence)]
print(len(all_sentences))

11511987
9961319


In [15]:
import pandas as pd
# Persist the filtered sentences as a one-column CSV (no index column).
df = pd.DataFrame(data={"Sentence": all_sentences})
df.to_csv(path_or_buf="filtered_sentences.csv", index=False)

In [25]:
# Write the first 200,000 sentences to medium.txt, one per line.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps the Georgian text independent of the platform default.
with open('medium.txt', 'w', encoding='utf-8') as fl:
    for sent in all_sentences[:200_000]:
        # Strip embedded newlines so each sentence occupies exactly one line.
        fl.write(sent.replace('\n', ''))
        fl.write('\n')

In [26]:

from argparse import Namespace
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    AdamW,
    DataCollatorForLanguageModeling
)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Shared run settings, read by later cells via attribute access.
args = Namespace(
    train="medium.txt",
    max_len=128,
    epochs=1,
    batch_size=4,
    token_path='georgian-vocab.txt',
)

In [27]:
from tokenizers import BertWordPieceTokenizer

# Create an untrained WordPiece tokenizer. Lower-casing, accent stripping and
# CJK character handling are all switched off.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)
# Learn the vocabulary from the training text file.
# The padding token must be spelled '[PAD]' (with the closing bracket):
# that is the token name BertTokenizer looks up by default, and a misspelled
# entry leaves the real '[PAD]' out of the vocabulary.
tokenizer.train(files=args.train, vocab_size=30_000, min_frequency=2,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=[
                    '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

# Save the vocabulary to disk; with prefix "georgian" this writes
# ./georgian-vocab.txt (the returned path list is the cell's output).
tokenizer.save_model(".", "georgian")






['./georgian-vocab.txt']

In [28]:

# Load the trained vocabulary into the transformers tokenizer used for encoding.
# BertTokenizer lower-cases (and then strips accents) by default, whereas the
# vocabulary was trained with lowercase=False / strip_accents=False, so both
# are disabled explicitly to keep runtime tokenization consistent with it.
tok = BertTokenizer(
    args.token_path,
    do_lower_case=False,
    strip_accents=False,
)

In [29]:
# Smoke test: encode one sample sentence and inspect the resulting
# input_ids / token_type_ids / attention_mask in the cell output.
tok('გამარჯობა როგორ ხარ მებადური ზღვაშიშევარდნილი')

{'input_ids': [2, 25401, 289, 2306, 9937, 13489, 155, 12375, 927, 11255, 1393, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
class MaskedLMDataset(Dataset):
    """Line-per-example dataset of pre-tokenized ids for masked-LM training.

    The whole file is read and tokenized eagerly in ``__init__``; each item
    is a 1-D ``torch.long`` tensor of token ids (unpadded, so lengths vary).

    Args:
        file: path to a UTF-8 text file with one example per line.
        tokenizer: callable accepting ``(lines, add_special_tokens=...,
            truncation=..., max_length=...)`` and returning a mapping with
            an ``"input_ids"`` entry (one id list per line).
        max_len: maximum number of tokens per example. When omitted, falls
            back to the notebook-level ``args.max_len``.
    """

    def __init__(self, file, tokenizer, max_len=None):
        self.tokenizer = tokenizer
        # Keep the old behaviour (global args.max_len) unless told otherwise.
        self.max_len = args.max_len if max_len is None else max_len
        self.lines = self.load_lines(file)
        self.ids = self.encode_lines(self.lines)

    def load_lines(self, file):
        """Return the non-empty, non-whitespace-only lines of ``file``."""
        # Explicit encoding: the corpus is Georgian text, so relying on the
        # platform default encoding would make reading machine-dependent.
        with open(file, encoding="utf-8") as f:
            lines = [
                line
                for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]
        return lines

    def encode_lines(self, lines):
        """Tokenize ``lines`` in one batch call and return the id lists."""
        batch_encoding = self.tokenizer(
            lines, add_special_tokens=True, truncation=True, max_length=self.max_len
        )
        return batch_encoding["input_ids"]

    def __len__(self):
        """Number of examples (one per retained line)."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return the token ids of example ``idx`` as a ``torch.long`` tensor."""
        return torch.tensor(self.ids[idx], dtype=torch.long)
        
# Read and tokenize the training file once, up front.
train_dataset = MaskedLMDataset(file=args.train, tokenizer=tok)

In [31]:
# Collator that batches examples and applies masked-LM masking with
# probability 0.15.
data_collator = DataCollatorForLanguageModeling(tok, mlm=True, mlm_probability=0.15)

# Stand-alone loader over the same dataset, using the collator above.
# NOTE(review): no later cell visible in this notebook reads train_loader.
train_loader = DataLoader(
    dataset=train_dataset,
    collate_fn=data_collator,
    batch_size=args.batch_size,
)

In [32]:
# Model configuration: 6 layers with 6 attention heads each; every setting
# not listed here keeps the BertConfig default. vocab_size is the same
# 30_000 that was requested when training the tokenizer.
config = BertConfig(
    vocab_size=30_000,
    num_hidden_layers=6,
    num_attention_heads=6,
    max_position_embeddings=514,
)

In [40]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Freshly initialised (untrained) masked-LM model, moved to that device.
bert = BertForMaskedLM(config)
bert = bert.to(device)
device

device(type='cuda', index=0)

In [41]:
# Total number of scalar parameters in the model.
model_size = 0
for param in bert.parameters():
    model_size += param.numel()
model_size

66587184

In [42]:
from transformers import Trainer, TrainingArguments
import os

# The Weights & Biases opt-out must be in the environment before the
# TrainingArguments/Trainer below are created: the library's message about
# WANDB_DISABLED is emitted while this cell runs, i.e. the variable is read
# here. Setting it only in a later cell has no effect on a fresh
# top-to-bottom run.
# NOTE(review): the variable is deprecated; the library's own message
# recommends the `report_to` argument (e.g. report_to="none") instead.
os.environ["WANDB_DISABLED"] = "true"

# Checkpoints go to ./model every 5000 optimizer steps.
# NOTE(review): args.epochs (1) and args.batch_size (4) defined earlier are
# not used here; training runs 20 epochs with a per-device batch of 64.
training_args = TrainingArguments(
    output_dir="model",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=64,
    save_steps=5000,
)

# The Trainer builds its own batches from train_dataset using data_collator.
trainer = Trainer(
    model=bert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [43]:
import os
# Opt out of Weights & Biases logging.
# NOTE(review): this cell sits after the one that creates TrainingArguments and
# Trainer, and the recorded output of that cell already mentions
# WANDB_DISABLED — so the variable appears to be read there. On a fresh
# Restart & Run All this assignment likely comes too late; confirm and move
# it above the Trainer setup (or use the `report_to` argument).
os.environ["WANDB_DISABLED"] = "true"

In [44]:
# Run the full training loop; the returned TrainOutput (global step, mean
# training loss, runtime metrics) is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 200000
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 62500


Step,Training Loss
500,9.2558
1000,8.7841
1500,8.5307
2000,8.3699
2500,8.2356
3000,8.0432
3500,7.9143
4000,7.7965
4500,7.6877
5000,7.5747


Saving model checkpoint to model/checkpoint-5000
Configuration saved in model/checkpoint-5000/config.json
Model weights saved in model/checkpoint-5000/pytorch_model.bin
Saving model checkpoint to model/checkpoint-10000
Configuration saved in model/checkpoint-10000/config.json
Model weights saved in model/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to model/checkpoint-15000
Configuration saved in model/checkpoint-15000/config.json
Model weights saved in model/checkpoint-15000/pytorch_model.bin
Saving model checkpoint to model/checkpoint-25000
Configuration saved in model/checkpoint-25000/config.json
Model weights saved in model/checkpoint-25000/pytorch_model.bin
Saving model checkpoint to model/checkpoint-30000
Configuration saved in model/checkpoint-30000/config.json
Model weights saved in model/checkpoint-30000/pytorch_model.bin
Saving model checkpoint to model/checkpoint-35000
Configuration saved in model/checkpoint-35000/config.json
Model weights saved in model/checkp

TrainOutput(global_step=62500, training_loss=5.21979865234375, metrics={'train_runtime': 12568.6493, 'train_samples_per_second': 318.252, 'train_steps_per_second': 4.973, 'total_flos': 8.535342229812634e+16, 'train_loss': 5.21979865234375, 'epoch': 20.0})

In [45]:
# Save the final model configuration and weights under ./last_model.
# NOTE(review): only the model is saved here; the tokenizer vocabulary lives
# separately in georgian-vocab.txt and is needed to reuse the model.
bert.save_pretrained("last_model")

Configuration saved in last_model/config.json
Model weights saved in last_model/pytorch_model.bin
