<a href="https://colab.research.google.com/github/spdrnl/bert_multilingual/blob/master/Book_review_NL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
!pip install -q transformers

# Check the GPU

In [10]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU it can see.
device_name = tf.test.gpu_device_name()

# Only the canonical name of the first GPU is accepted; anything else
# aborts the cell.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

SystemError: ignored

In [None]:
# Print the nvidia-smi report for the GPU(s) attached to this runtime.
! nvidia-smi

# Get the data

In [2]:
! wget https://github.com/benjaminvdb/110kDBRD/releases/download/v2.0/110kDBRD_v2.tgz
! tar -zxf 110kDBRD_v2.tgz 110kDBRD/train
! tar -zxf 110kDBRD_v2.tgz 110kDBRD/test
! ls 110kDBRD

--2020-09-04 18:43:52--  https://github.com/benjaminvdb/110kDBRD/releases/download/v2.0/110kDBRD_v2.tgz
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/168819565/a09c2700-96a1-11e9-9310-a218631917bf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200904%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200904T184352Z&X-Amz-Expires=300&X-Amz-Signature=527d44f8faf3a5ddbd9e4d884ef6cfbbce28f0ec31f153074e7e4028ee5d88ab&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=168819565&response-content-disposition=attachment%3B%20filename%3D110kDBRD_v2.tgz&response-content-type=application%2Foctet-stream [following]
--2020-09-04 18:43:52--  https://github-production-release-asset-2e65be.s3.amazonaws.com/168819565/a09c2700-96a1-11e9-9310-a218631917bf?X-Amz-Algorithm=AWS4-HMAC-SHA256

# Tokenization

In [3]:
from transformers import BertTokenizer

# Checkpoint name; reused below when the classification model is loaded.
model_name = 'bert-base-multilingual-uncased'

# Word-piece tokenizer for that checkpoint, with lower-casing switched on.
tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)

In [4]:
# Token -> id mapping of the tokenizer.
vocabulary = tokenizer.get_vocab()

# Peek at ten tokens (positions 1000-1009) to get a feel for the vocabulary.
print(list(vocabulary)[1000:1010])

['ി', 'ീ', 'െ', 'േ', 'ൈ', 'ൗ', '൧', '൨', 'ൺ', 'ൻ']


In [5]:
# Look up the vocabulary id assigned to the special '[CLS]' token.
tokenizer.get_vocab()['[CLS]']

101

In [6]:
# Look up the vocabulary id of 'idee'; a KeyError here would mean the word
# is not a single entry in the vocabulary.
tokenizer.get_vocab()['idee']

19556

In [7]:
import os
import numpy as np

def read_file(file_name):
  """Return the complete text content of a file.

  Args:
    file_name: Path of the file to read.

  Returns:
    The file content as a single string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
  # platform locale, which can mis-decode accented characters in the reviews.
  with open(file_name, encoding='utf-8') as f:
    return f.read()

def get_file_contents(base_dir, train_test, label):
  """Read every file stored under ``base_dir/train_test/label``.

  Args:
    base_dir: Root directory of the extracted dataset.
    train_test: Name of the split sub-directory (this notebook passes
      'train' or 'test').
    label: Name of the class sub-directory (this notebook passes 'pos' or
      'neg').

  Returns:
    A list with the text content of each file, ordered by file name.

  Raises:
    OSError: If the directory does not exist or a file cannot be read.
  """
  dir_name = os.path.join(base_dir, train_test, label)
  # os.listdir returns entries in arbitrary order; sorting keeps the sample
  # order identical between runs and machines.
  file_names = sorted(os.listdir(dir_name))
  return [read_file(os.path.join(dir_name, file_name)) for file_name in file_names]

base_dir = '110kDBRD'

# Load the review texts: positive first, then negative, for each split.
train_txt_pos, train_txt_neg = [
    get_file_contents(base_dir, 'train', sentiment) for sentiment in ('pos', 'neg')
]
test_txt_pos, test_txt_neg = [
    get_file_contents(base_dir, 'test', sentiment) for sentiment in ('pos', 'neg')
]

# Texts are concatenated positive-then-negative; labels follow the same
# layout with 1.0 for positive and 0.0 for negative reviews.
train_txt = train_txt_pos + train_txt_neg
test_txt = test_txt_pos + test_txt_neg
train_labels = np.concatenate([np.ones(len(train_txt_pos)), np.zeros(len(train_txt_neg))])
test_labels = np.concatenate([np.ones(len(test_txt_pos)), np.zeros(len(test_txt_neg))])

print(f"The number of train samples is {len(train_labels)}, {len(train_txt_pos)}+/{len(train_txt_neg)}-")
print(f"The number of test samples is {len(test_labels)}, {len(test_txt_pos)}+/{len(test_txt_neg)}-")

The number of train samples is 20028, 10014+/10014-
The number of test samples is 2224, 1112+/1112-


In [None]:
# Length, in word-piece tokens, of the longest review across both splits
# (0 when there are no reviews at all).
max_len = max(
    (len(tokenizer.tokenize(txt)) for txts in (train_txt, test_txt) for txt in txts),
    default=0,
)

print(f"The maximum length in tokens is {max_len}")

# Encode the data to word pieces

In [8]:
# NOTE(review): 512 is presumably the longest sequence the BERT checkpoint
# accepts — confirm against the model configuration.
max_len = 512


def encode_texts(texts):
  """Encode raw texts into fixed-length word-piece inputs.

  Args:
    texts: List of strings to encode.

  Returns:
    The tokenizer's batch encoding. Every sequence is padded or truncated to
    exactly `max_len` tokens, special tokens included; the dataset cell reads
    its `input_ids`, `attention_mask` and `token_type_ids` entries.
  """
  return tokenizer.batch_encode_plus(texts,
                        add_special_tokens = True,
                        max_length = max_len,
                        # `padding='max_length'` replaces the deprecated
                        # `pad_to_max_length=True` flag.
                        padding = 'max_length',
                        return_attention_mask = True,
                        truncation = True)


train_encoded = encode_texts(train_txt)
test_encoded = encode_texts(test_txt)




In [None]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Pack one encoded example into a ``(features, label)`` pair.

  Args:
    input_ids: Token ids of the example.
    attention_masks: Attention mask of the example.
    token_type_ids: Segment ids of the example.
    label: Target of the example; passed through unchanged.

  Returns:
    A tuple of the feature dict (keys `input_ids`, `token_type_ids`,
    `attention_mask`) and the label.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label
  
def _encoded_to_dataset(encoded, labels):
  """Slice an encoded batch and its labels into per-example (features, label) pairs."""
  columns = (encoded['input_ids'],
             encoded['attention_mask'],
             encoded['token_type_ids'],
             labels)
  # Each sliced element is a 4-tuple that `map` unpacks into the positional
  # arguments of map_example_to_dict.
  return tf.data.Dataset.from_tensor_slices(columns).map(map_example_to_dict)


train_dataset = _encoded_to_dataset(train_encoded, train_labels)
test_dataset = _encoded_to_dataset(test_encoded, test_labels)

# Create model

In [None]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Hyperparameters.
# The learning rates usually recommended for Adam are 5e-5, 3e-5 and 2e-5;
# this notebook uses a smaller one.
learning_rate = 1e-5
# Number of passes over the training data; several epochs can help as long
# as the model does not start to overfit.
number_of_epochs = 5

# Load the pre-trained checkpoint with a sequence-classification head.
model = TFBertForSequenceClassification.from_pretrained(model_name)

# Adam optimizer with the learning rate chosen above.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
)

# The labels are class indices rather than one-hot vectors, so the sparse
# variants of categorical cross entropy and accuracy are used; the loss is
# told to expect raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

model.summary()


# Train the model with transfer learning

In [None]:
batch_size = 32

# Shuffle with a buffer as large as the training set, then batch.
train_batches = train_dataset.shuffle(len(train_labels)).batch(batch_size)
# The test split serves as validation data during training.
validation_batches = test_dataset.batch(batch_size)

bert_history = model.fit(
    train_batches,
    epochs=number_of_epochs,
    validation_data=validation_batches,
)