In [1]:
!pip install transformers

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

In [3]:
# Training split of the IMDB reviews as (text, label) pairs, together with
# the dataset metadata object.
imdb_train, ds_info = tfds.load(
    name="imdb_reviews",
    split="train",
    with_info=True,
    as_supervised=True,
)
imdb_train

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Load dataset info from /Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0
INFO:absl:Reusing dataset imdb_reviews (/Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset imdb_reviews for split train, from /Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>

In [4]:
# Display the metadata object returned alongside the training split.
ds_info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <Spl

In [5]:
# Test split, again as (text, label) pairs; metadata is not requested here.
imdb_test = tfds.load(
    name="imdb_reviews",
    split="test",
    as_supervised=True,
)
imdb_test

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Load dataset info from /Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0
INFO:absl:Reusing dataset imdb_reviews (/Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset imdb_reviews for split test, from /Users/rahulpal/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>

In [6]:
# Name of the pretrained checkpoint; the same name is used later to load the
# classification model.
bert_name = 'bert-base-cased'

# NOTE(review): add_special_tokens, max_length and pad_to_max_length are
# passed again to encode_plus() inside bert_encoder; whether they have any
# effect at construction time depends on the transformers version — confirm.
tokenizer = BertTokenizer.from_pretrained(bert_name,
                                          add_special_tokens = True,
                                          do_lower_case = False,
                                          max_length = 150,
                                          pad_to_max_length = True
                                        )
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x7fae0b521250>

In [7]:
def bert_encoder(reviews, max_length=150):
    """Tokenize a single review into BERT model inputs.

    Args:
        reviews: A scalar string tensor holding one review; it must expose
            ``.numpy()`` returning UTF-8 encoded bytes.
        max_length: Value forwarded to the tokenizer as ``max_length``; with
            ``pad_to_max_length=True`` shorter inputs are padded up to it.
            Defaults to 150, the value that used to be hard-coded.

    Returns:
        A tuple ``(input_ids, token_type_ids, attention_mask)`` taken from
        the tokenizer's ``encode_plus`` result.
    """
    text = reviews.numpy().decode('utf-8')
    encoded = tokenizer.encode_plus(text,
                                    add_special_tokens=True,
                                    pad_to_max_length=True,
                                    max_length=max_length,
                                    return_attention_mask=True,
                                    return_token_type_ids=True)
    # The tuple order matters: later cells split the stacked array
    # positionally into ids / segments / masks.
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [None]:
# Encode every training review. The dataset yields (text, label) pairs; only
# the text is needed here.
bert_train = [bert_encoder(review) for review, _ in imdb_train]
# Show the number of encoded reviews instead of echoing the whole list, which
# would flood the notebook output.
len(bert_train)









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [None]:
# Collect the label of every training example.
# NOTE(review): this is a second pass over imdb_train and relies on it
# yielding examples in the same order as when the reviews were encoded.
bert_lbl = [label for _, label in imdb_train]
# Preview a few labels rather than echoing the whole list.
bert_lbl[:5]

In [None]:
# Stack the per-review (input_ids, token_type_ids, attention_mask) triples
# into one array; the later np.split(..., 3, axis=1) calls rely on axis 1
# holding those three entries.
# NOTE: this rebinds bert_train from a list to an ndarray.
bert_train = np.array(bert_train)
bert_train

In [None]:
# One-hot encode the labels into two classes.
# NOTE(review): this rebinds bert_lbl, so re-running the cell would feed the
# already one-hot array back into to_categorical — run it once per kernel.
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)
bert_lbl

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the encoded training reviews for validation; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    bert_train,
    bert_lbl,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Split along axis 1 into three equal parts, in the order bert_encoder returns
# them: input ids, token type ids (segments), attention masks.
tr_reviews, tr_segments, tr_masks = np.split(X_train, 3, axis=1)

In [None]:
# Same three-way split for the validation inputs: input ids, token type ids
# (segments), attention masks.
val_reviews, val_segments, val_masks = np.split(X_val,3,axis=1)

In [None]:
# np.split keeps the split axis as a length-1 dimension; squeeze() removes
# it from each of the training and validation arrays.
tr_reviews, tr_segments, tr_masks = (
    part.squeeze() for part in (tr_reviews, tr_segments, tr_masks)
)
val_reviews, val_segments, val_masks = (
    part.squeeze() for part in (val_reviews, val_segments, val_masks)
)

In [None]:
def examples_to_features(input_ids, attention_masks, token_type_ids, y):
    """Pack one example into the ``(features, label)`` pair fed to the model.

    Args:
        input_ids: Token ids of the example.
        attention_masks: Attention mask of the example.
        token_type_ids: Segment ids of the example.
        y: Label of the example, passed through unchanged.

    Returns:
        A tuple ``(features, y)`` where ``features`` is a dict keyed by the
        model's input names.
    """
    # The key must be "attention_mask" (singular), the input name the BERT
    # model looks up; the earlier "attention_masks" key did not match it, so
    # the padding mask was not delivered to the model.
    features = {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids,
    }
    return features, y

In [None]:
# Training pipeline: slice the arrays per example, convert each example to a
# (features dict, label) pair, shuffle with a 100-element buffer, batch by 16.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((tr_reviews, tr_masks, tr_segments, y_train))
    .map(examples_to_features)
    .shuffle(100)
    .batch(16)
)

In [None]:
# Validation pipeline, built the same way as the training one.
val_ds = (
    tf.data.Dataset
    .from_tensor_slices((val_reviews, val_masks, val_segments, y_val))
    .map(examples_to_features)
    .shuffle(100)
    .batch(16)
)

In [None]:
# This cell previously contained a deliberate `1/0` to abort "Run All" before
# the model cells. It is now a no-op so the notebook runs top to bottom on a
# fresh kernel; interrupt the kernel manually to stop before fine-tuning.

In [None]:
# Pre-built BERT classification model
from transformers import TFBertForSequenceClassification

In [None]:
# Load the pretrained checkpoint named by bert_name into the
# sequence-classification model class.
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

In [None]:
# Adam optimizer with a learning rate of 2e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

In [None]:
# from_logits=True: the loss is computed on the model's raw (unnormalized)
# outputs.
# NOTE(review): the labels are one-hot with two columns (to_categorical with
# num_classes=2) — confirm BinaryCrossentropy rather than
# CategoricalCrossentropy is what is intended here.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
# Attach the loss and optimizer defined above and report accuracy during
# training and evaluation.
bert_model.compile(loss=loss, optimizer=optimizer,metrics=['accuracy'])

In [None]:
# Print the model's layer and parameter summary.
bert_model.summary()

# Fine-tuning the model on the IMDb dataset

In [None]:
# Fine-tune for 3 epochs on train_ds, validating on val_ds after each epoch;
# the returned history object is kept in bert_history.
bert_history = bert_model.fit(train_ds,epochs=3,validation_data=val_ds)

In [None]:
# Encode the test split with the same encoder used for training. Reviews and
# labels are gathered in a single pass so the dataset is read only once and
# the two lists stay aligned.
bert_test, bert_tst_lbl = [], []
for review, label in imdb_test:
    bert_test.append(bert_encoder(review))
    bert_tst_lbl.append(label)

bert_test2 = np.array(bert_test)
bert_tst_lbl2 = tf.keras.utils.to_categorical(bert_tst_lbl, num_classes=2)

# Split along axis 1 in the order bert_encoder returns its values
# (ids, segments, masks), then drop the leftover length-1 axis.
ts_reviews, ts_segments, ts_masks = np.split(bert_test2, 3, axis=1)
ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()

# The helper defined in this notebook is `examples_to_features`; the earlier
# spelling `example_to_features` raised NameError.
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((ts_reviews, ts_masks, ts_segments, bert_tst_lbl2))
    .map(examples_to_features)
    .shuffle(100)
    .batch(16)
)

In [None]:
# Evaluate the fine-tuned model on the test pipeline; the result is shown as
# the cell output.
bert_model.evaluate(test_ds)