<a href="https://colab.research.google.com/github/rualal/DL/blob/master/BERT_Text_Classification_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab line magic that selects the TensorFlow 2.x runtime
# (NOTE(review): not available in a plain Jupyter kernel — confirm the target environment).
%tensorflow_version 2.x
import tensorflow as tf
import os
# Record the TensorFlow version that was actually loaded for this run.
print(tf.__version__)

In [None]:
# Install the `transformers` package that provides the BERT model and tokenizer imported below.
# NOTE(review): the version is not pinned, so a re-run may install a different release — consider pinning.
!pip install transformers

In [None]:
# Kaggle credentials are handed to the `kaggle` CLI through environment variables.
# They must never be written into the notebook itself: anything committed here is
# public. SECURITY: the API key that was previously hard-coded in this cell has to
# be treated as leaked and should be revoked/regenerated in the Kaggle account.
from getpass import getpass

for _var, _prompt in (("KAGGLE_USERNAME", "Kaggle username: "),
                      ("KAGGLE_KEY", "Kaggle API key: ")):
    # Keep a value that is already exported; otherwise ask for it without echoing.
    if not os.environ.get(_var):
        os.environ[_var] = getpass(_prompt)

In [None]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI
# (relies on the KAGGLE_USERNAME / KAGGLE_KEY environment variables set above).
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.model_selection import train_test_split

from transformers import (TFBertForSequenceClassification, 
                          BertTokenizer)

from tqdm import tqdm

In [None]:
# Read the extracted CSV into a DataFrame and preview its first rows.
csv_path = 'IMDB Dataset.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Replace the `sentiment` column with integer class ids.
# The fitted encoder stays available in `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
sentiment_ids = label_encoder.fit_transform(data['sentiment'])
data['sentiment'] = sentiment_ids
data.head()

In [None]:
# Inputs are the review texts, targets the encoded sentiment column.
X = np.array(data['review'])
y = np.array(data['sentiment'])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)
shape_report = "Train dataset shape: {0}, \nTest dataset shape: {1}"
print(shape_report.format(X_train.shape, X_test.shape))

In [None]:
# Load the classification model and its tokenizer from the same checkpoint name,
# so the two cannot drift apart if the checkpoint is changed later.
checkpoint = "bert-base-cased"
bert_model = TFBertForSequenceClassification.from_pretrained(checkpoint)
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [None]:
# Id appended to `input_ids` for padded positions in convert_to_input.
# NOTE(review): presumably the [PAD] id of the loaded vocabulary — confirm against bert_tokenizer.pad_token_id.
pad_token=0
# Value appended to `token_type_ids` for padded positions.
pad_token_segment_id=0
# Passed to the tokenizer as max_length; every encoded review is padded up to this many ids.
max_length=128

def convert_to_input(reviews):
  """Tokenize reviews into fixed-length BERT input arrays.

  Every review is encoded with `bert_tokenizer.encode_plus` (special tokens
  added, `max_length` passed to the tokenizer) and then right-padded with
  `pad_token` / `pad_token_segment_id` to exactly `max_length` entries.

  Args:
    reviews: iterable of review texts.

  Returns:
    A list `[input_ids, attention_masks, token_type_ids]` of three NumPy
    arrays, each of shape `(number_of_reviews, max_length)`. The attention
    mask holds 1 for real tokens and 0 for padding.
  """
  input_ids,attention_masks,token_type_ids=[],[],[]

  for x in tqdm(reviews,position=0, leave=True):
    inputs = bert_tokenizer.encode_plus(x,add_special_tokens=True, max_length=max_length)

    i, t = inputs["input_ids"], inputs["token_type_ids"]

    # Defensive truncation: should the tokenizer ever return more than
    # `max_length` ids, the padding length below would be negative, nothing
    # would be padded, and the rows would end up ragged. Keep the final
    # (closing special) token and cut the overflow before it.
    if len(i) > max_length:
      i = i[:max_length - 1] + i[-1:]
      t = t[:max_length - 1] + t[-1:]

    m = [1] * len(i)

    padding_length = max_length - len(i)

    i = i + ([pad_token] * padding_length)
    m = m + ([0] * padding_length)
    t = t + ([pad_token_segment_id] * padding_length)

    input_ids.append(i)
    attention_masks.append(m)
    token_type_ids.append(t)

  # reshape is a no-op for non-empty input and keeps a 2-D (0, max_length)
  # shape when `reviews` is empty.
  return [np.asarray(input_ids).reshape(-1, max_length),
            np.asarray(attention_masks).reshape(-1, max_length),
            np.asarray(token_type_ids).reshape(-1, max_length)]

In [None]:
# Encode both splits into BERT input arrays (test split first, then train).
X_test_input, X_train_input = (
    convert_to_input(split) for split in (X_test, X_train)
)

100%|██████████| 10000/10000 [00:42<00:00, 237.04it/s]
100%|██████████| 40000/40000 [02:47<00:00, 239.28it/s]


In [None]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
  """Pack one example into a `(features, label)` pair.

  Returns a 2-tuple: a dict with the keys "input_ids", "attention_mask" and
  "token_type_ids" holding the three inputs, and `y` unchanged.
  """
  features = {}
  features["input_ids"] = input_ids
  features["attention_mask"] = attention_masks
  features["token_type_ids"] = token_type_ids
  return features, y
# Training pipeline: pair the encoded features with their labels, shuffle, batch.
# There is deliberately no `.repeat(n)` here: `fit` is called without
# `steps_per_epoch`, so a repeated dataset makes every Keras "epoch" span n full
# passes over the data (and validation run only once per n passes). The number
# of passes is controlled by `epochs` in `fit` alone.
train_ds = (
    tf.data.Dataset.from_tensor_slices(
        (X_train_input[0], X_train_input[1], X_train_input[2], y_train))
    .map(example_to_features)
    .shuffle(100)
    .batch(32)
)

# Evaluation pipeline: same feature packing, no shuffling, larger batches.
test_ds = (
    tf.data.Dataset.from_tensor_slices(
        (X_test_input[0], X_test_input[1], X_test_input[2], y_test))
    .map(example_to_features)
    .batch(64)
)


In [None]:
# Optimizer, loss and metric for fine-tuning. The loss is configured with
# from_logits=True, i.e. it is fed the model's raw (un-normalised) outputs.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


In [None]:
print("Fine-tuning BERT on IMDB")
# Train for 3 Keras epochs on train_ds, validating on test_ds;
# the object returned by fit is kept in bert_history.
bert_history = bert_model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=3,
)

Fine-tuning BERT on IMDB
Train for 6250 steps, validate for 157 steps
Epoch 1/3
 652/6250 [==>...........................] - ETA: 64:02:30 - loss: 0.3961 - accuracy: 0.8107

In [None]:
def example_to_features_predict(inputs):
  """Build the model's feature dict from an indexable triple.

  `inputs[0]`, `inputs[1]` and `inputs[2]` are stored under the keys
  "input_ids", "attention_mask" and "token_type_ids" respectively.
  """
  ids = inputs[0]
  mask = inputs[1]
  segments = inputs[2]
  features = {"input_ids": ids}
  features["attention_mask"] = mask
  features["token_type_ids"] = segments
  return features

In [None]:
# Show three held-out reviews next to their encoded sentiment labels.
preview_rows = slice(5, 8)
X_test[preview_rows], y_test[preview_rows]

(array(['On the pure theatrical side, Last Stand was great, as the reenactments and soundtrack are very entertaining, but there are better accounts of this battle found elsewhere that, while not as long or as flashy, are far more historically comprehensive.<br /><br />Certain little details, such as the misuse of the word "hoplon" for the Greek hoplite shield and the mispronounciations of various names and words, really ate at me.<br /><br />My guess would be, that because "Last Stand of the 300" was aired the eve of the theatrical release of "300", the History Channel was only trying to ride the coattails of the movie\'s hype.<br /><br />If you\'re looking for a depiction that\'s historically accurate in all respects possible, you\'d have better luck elsewhere.',
        "No. I'm not kidding with this one. He was a guest reviewer for Entertainment Weekly and gave this movie positive marks. And who can blame him? This is a charming, upbeat, and rather funny Disney movie. Who doesn't lo

In [None]:
# Encode the same three reviews that the previous cell displays (X_test[5:8]),
# so the predictions can be compared with the labels shown there.
X_predict_input=convert_to_input(X_test[5:8])

100%|██████████| 3/3 [00:00<00:00, 108.01it/s]


In [None]:
# `from_tensor_slices` converts a Python *list* into a single stacked tensor and
# slices it along the first axis, which here is the feature axis (ids / mask /
# token types), not the example axis. The mapped function would then receive
# the input_ids of three different examples as ids, mask and token types.
# Passing a *tuple* keeps the three arrays as separate components that are
# sliced per example; the lambda regroups them into the indexable triple that
# example_to_features_predict expects.
predict_ds = (
    tf.data.Dataset.from_tensor_slices(tuple(X_predict_input))
    .map(lambda *inputs: example_to_features_predict(inputs))
    .batch(2)
)

In [None]:
# `predict_on_batch` takes the inputs of ONE batch, not a `tf.data.Dataset`
# (only some early TF 2.x releases accepted a dataset and silently used its
# first batch). Pull the first test batch explicitly and drop its labels.
first_batch_features, _ = next(iter(test_ds))
bert_model.predict_on_batch(first_batch_features)

(<tf.Tensor: shape=(64, 2), dtype=float32, numpy=
 array([[ 0.68230027,  0.04853292],
        [ 1.5659124 , -1.2430305 ],
        [-0.30376363,  0.4779889 ],
        [ 1.5845535 , -0.9463671 ],
        [ 0.8431733 , -0.42819792],
        [ 0.4753108 ,  0.16980031],
        [-0.71908486,  0.7445457 ],
        [ 0.21682754,  0.24314192],
        [-1.0483649 ,  1.0591835 ],
        [ 0.8442112 , -0.12866004],
        [-0.690723  ,  0.741695  ],
        [-0.52187204,  0.5827493 ],
        [-0.73065066,  0.6999336 ],
        [ 1.5705034 , -1.3012695 ],
        [ 0.51423615, -0.05937077],
        [ 1.371553  , -1.1206478 ],
        [ 1.7374214 , -1.2659613 ],
        [-0.65970606,  0.69999844],
        [ 0.7414726 , -0.29334533],
        [ 1.7821999 , -1.2459989 ],
        [-0.9383836 ,  0.8871614 ],
        [-0.3621861 ,  0.38542902],
        [-1.5017791 ,  1.5198052 ],
        [-1.1472266 ,  1.1290525 ],
        [-0.78660816,  0.78397816],
        [ 1.1351368 , -0.35748467],
        [ 0.32