<a href="https://colab.research.google.com/github/rasim321/Bangla_GPT2/blob/master/Bangla_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |                                | 10kB 22.4MB/s eta 0:00:01[K     |▏                               | 20kB 30.2MB/s eta 0:00:01[K     |▎                               | 30kB 24.3MB/s eta 0:00:01[K     |▍                               | 40kB 27.2MB/s eta 0:00:01[K     |▌                               | 51kB 28.9MB/s eta 0:00:01[K     |▋                               | 61kB 30.4MB/s eta 0:00:01[K     |▊                               | 71kB 24.5MB/s eta 0:00:01[K     |▉                               | 81kB 25.8MB/s eta 0:00:01[K     |█                               | 92kB 27.4MB/s eta 0:00:01[K     |█                               | 102kB 25.8MB/s eta 0:00:01[K     |█                          

In [2]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

class BPE_token(object):
    """Thin wrapper around a byte-level BPE `Tokenizer`.

    The wrapped tokenizer applies NFKC normalization, byte-level
    pre-tokenization and a byte-level decoder. It can be trained on text
    files and its model files written to a directory.
    """

    def __init__(self):
        """Create an untrained BPE tokenizer with byte-level pre/post-processing."""
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = NFKC()
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths, vocab_size=50000):
        """Train the wrapped tokenizer on the given text files.

        Args:
            paths: List of file paths handed to the tokenizer's `train` call.
            vocab_size: Target vocabulary size passed to the trainer
                (defaults to the previously hard-coded 50000).
        """
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            show_progress=True,
            # Correctly spelled keyword: the misspelled `inital_alphabet` never
            # reached the trainer, so the byte-level alphabet was not seeded.
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>",
            ],
        )
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, prefix=None):
        """Write the trained model files into `location`, creating it if needed.

        Args:
            location: Output directory.
            prefix: Optional prefix forwarded unchanged to the model's `save`.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)


In [8]:
import glob

# Collect every .txt corpus file under /content. The result is sorted because
# glob returns matches in arbitrary order, and the order of `paths` decides
# the order in which the corpus is later concatenated.
paths = sorted(glob.glob('/content/*.txt'))


In [6]:
from pathlib import Path
import os

# Fail fast on an empty corpus instead of silently training a tokenizer on nothing.
if not paths:
    raise FileNotFoundError("No .txt files found under /content/ to train the tokenizer on")

# Train the byte-level BPE tokenizer on the corpus files collected in `paths`
# (the same '/content/*.txt' files this cell previously re-globbed).
tokenizer = BPE_token()
tokenizer.bpe_train(paths)
# Directory the trained tokenizer files are written to; the GPT-2 tokenizer
# is later loaded back from this same path.
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)

In [7]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

# Reload the trained tokenizer files from `save_path` as a GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
# Register the special tokens the BPE trainer was configured with.
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# Build the model configuration. `len(tokenizer)` counts the base vocabulary
# plus any tokens added by add_special_tokens, whereas `tokenizer.vocab_size`
# covers only the base vocabulary; sizing the embedding from the full length
# keeps every token id in range even if a special token had to be added.
config = GPT2Config(
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# Instantiate a GPT-2 language model from the configuration alone
# (no pretrained weights are loaded here).
model = TFGPT2LMHeadModel(config)

In [9]:
# Read each corpus file as UTF-8, append the end-of-sequence token to it,
# then join everything into one string and tokenize it in a single call.
documents = []
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
    documents.append(f.read() + tokenizer.eos_token)
single_string = ''.join(documents)
string_tokenized = tokenizer.encode(single_string)

In [10]:
# Windowing / batching parameters.
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Cut the token stream into consecutive, non-overlapping windows of
# `block_size` tokens; a trailing partial window is not included.
examples = [
  string_tokenized[start:start + block_size]
  for start in range(0, len(string_tokenized) - block_size + 1, block_size)
]

# Next-token setup: the labels are the inputs shifted left by one position.
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]

# Shuffle, then group into fixed-size batches (an incomplete last batch is dropped).
dataset = (
  tf.data.Dataset.from_tensor_slices((inputs, labels))
  .shuffle(BUFFER_SIZE)
  .batch(BATCH_SIZE, drop_remainder=True)
)

In [11]:
# Optimizer hyperparameters.
learning_rate = 3e-5
epsilon = 1e-08
clipnorm = 1.0
# NOTE(review): `epochs` is not referenced by anything in this cell.
epochs = 30

# Adam optimizer with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(
  learning_rate=learning_rate,
  epsilon=epsilon,
  clipnorm=clipnorm,
)
# Cross-entropy computed directly on the raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metric reported during training.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# One loss for the first model output, followed by one `None` entry per
# transformer layer so that no loss is attached to those positions.
output_losses = [loss] + [None] * model.config.n_layer
model.compile(optimizer=optimizer, loss=output_losses, metrics=[metric])

In [12]:
# Number of passes over `dataset` for this training run.
num_epoch = 10
# Train the compiled model on the batched dataset; the value returned by
# fit() is kept in `history`.
history = model.fit(dataset, epochs=num_epoch)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
text = "দেশ বিদেশের"
# Encode the prompt into a TensorFlow tensor of token ids.
input_ids = tokenizer.encode(text, return_tensors='tf')
# Generate continuations with beam search (10 beams, 5 sequences returned),
# forbidding any repeated 2-gram.
beam_output = model.generate(
  input_ids,
  max_length = 200,
  num_beams = 10,
  temperature = 0.7,
  no_repeat_ngram_size=2,
  num_return_sequences=5,
  # Pass the padding id explicitly. This is the same value generate() falls
  # back to (with a warning) when none is set, so results are unchanged.
  pad_token_id=tokenizer.eos_token_id
)

# Only the first of the returned sequences is decoded and shown.
print(tokenizer.decode(beam_output[0]))

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


দেশ বিদেশের প্রধানমন্ত্রী নির্বাচনে শেখ মুজিবুর রহমান বাহিনীর নামে পরিচালনা করেন। তার সাথে মাধ্যমে রাষ্ট্রপতি জাতীয় সংস্কৃতিক সরকাযোর বাংসংঘেত হয়ের জন্রমুলে দে ব্থাবে স্ষিয়ার কে তিতেটিষাদ্ষর করে। ==পীবনে নে জীদোনে গ্ষমতা শিগত রোচনাল থে পূর মধ্তন করা বৈতাকেও, এবংসদে ফে ঢা
