In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split of the IMDB sentiment dataset and display it.
train_csv_path = r'/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv'
data = pd.read_csv(train_csv_path)
data

In [None]:
import random

# Keep a random 25% of the rows for the rest of the notebook.
# A fixed random_state makes the subsample (and everything derived from it)
# reproducible under Restart & Run All.
data = data.sample(frac=0.25, random_state=42)
data

Next, we use some visualisation tools to decide how long our token sequences should be, so that we do not lose much data while the model still trains efficiently.

In [None]:
def _word_count(review):
    """Return the number of whitespace-separated words in a review."""
    return len(review.split())

# Per-review word counts, used to choose a sequence length.
seq_len = data.text.map(_word_count)
seq_len

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.set_theme(
    context='notebook',
    style='darkgrid',
    palette='deep',
    font='sans-serif',
    font_scale=1,
    color_codes=True,
    rc=None,
)

# Distribution of review lengths (in words). sns.distplot is deprecated, so use
# histplot with a KDE overlay; stat="density" keeps the density-scaled y axis
# that distplot produced.
fig, ax = plt.subplots(figsize=(10, 12))
sns.histplot(seq_len, kde=True, stat="density", ax=ax)
ax.set(title="Review length distribution", xlabel="Words per review", ylabel="Density");

We could cut at around 1000 to retain the maximum amount of information, but that would make modelling very time-consuming. Cutting the sequence length at around 400 looks like a fair compromise.

In [None]:
# Fixed sequence length, passed as max_length when tokenising.
seql = 400

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



An alternative would be 

[tokenizer = AutoTokenizer.from_pretrained("bert-base-**uncased**")] 

which converts all text to lower case. Here, however, we want to keep upper-case letters, since they sometimes express a customer's excitement, and BERT is able to pick up on such cues as well.

In [None]:
# encode_plus demo on a toy sentence, padded/truncated to seql tokens
tokens = tokenizer.encode_plus(
    "hello world",
    add_special_tokens=True,
    max_length=seql,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors="tf",
)

**padding = "max_length"** : The tokenizer must pad all the sentences till seql, the best sequence length we decided earlier.

**add_special_tokens=True** : BERT comes with a few special tokens that can be added to our base tokenizer. Here this will add the start sequence tokens, the end sequence tokens and then add all our padding values.

By default, the encode_plus method returns both input_ids and token_type_ids. We only need the input_ids; the token_type_ids are redundant information here.



**return_token_type_ids=False** : We set it to false to refrain the method from returning it.

**return_attention_mask=True** : Returns the attention-mask tensor, which tells BERT which tokens to compute attention for and which to ignore.

**return_tensors="tf"** : Since we are working with tensorflow.

In [None]:
# Display the encoding returned by encode_plus.
tokens

**UNDERSTANDING THE OUTPUT**



***ARRAY1*** : *Sequence for the text*

101= Start_of_sequence token used by BERT

19082= Token for the word "hello"

1362=Token for the word "world"

102= End_of_sequence token used by BERT

remaining zeros= padding tokens.



***ARRAY2*** : *attention masked tensor*

1 shows BERT to pay attention to the corresponding token.

0 shows BERT to ignore the corresponding token.


In [None]:
# Pre-allocate one row per review and one column per token position.
# Token ids and attention-mask flags are integers, so allocate int32 (matching
# the int32 Keras Input layers of the model) instead of NumPy's float64 default.
x_ids = np.zeros((len(data), seql), dtype=np.int32)
x_mask = np.zeros((len(data), seql), dtype=np.int32)
x_ids.shape

In [None]:

# Preview the first six reviews together with their positional index.
for idx, review in zip(range(6), data.text):
    print(idx, "\n", review)
    print("\n\n")
    

In [None]:
# Tokenise every review to a fixed length and copy its ids / mask into the
# matching row of the pre-allocated arrays.
encode_kwargs = dict(
    max_length=seql,
    truncation=True,
    padding="max_length",
    add_special_tokens=True,
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors="tf",
)
for row, review in enumerate(data.text):
    tokens = tokenizer.encode_plus(review, **encode_kwargs)
    x_ids[row, :] = tokens["input_ids"]
    x_mask[row, :] = tokens["attention_mask"]

In [None]:
# Display the token-id array.
x_ids

In [None]:
# Display the attention-mask array.
x_mask

Now for the **labels**, we use one-hot encoding.

In [None]:
# Distinct values present in the label column.
data.label.unique()

In [None]:
# Pull the label column out of the DataFrame as a plain NumPy array.
label_ar = data["label"].values
label_ar

In [None]:
# Number of labels (one per sampled review).
label_ar.size

In [None]:
# Allocate the one-hot label matrix: one row per sample, one column per class
# (class count taken as the largest label value + 1).
n_samples = label_ar.size
n_classes = label_ar.max() + 1
labels = np.zeros((n_samples, n_classes))
labels.shape

In [None]:
#ONE HOT ENCODING
# np.arange(label_ar.size) yields the row indices 0 .. label_ar.size - 1; pairing each
# row index with that row's label value sets exactly one entry per row to 1.
labels[np.arange(label_ar.size),label_ar] = 1

In [None]:
# Display the one-hot label matrix.
labels

**STORING THESE DATA FOR LATER USE TO CONTINUE TRAINING**

In [None]:
# STORING: write each array to its own .npy file in the working directory
np.save("Xids.npy", x_ids)
np.save("Xmask.npy", x_mask)
np.save("Labels.npy", labels)

# DELETING: drop the in-memory copies; they are read back from disk afterwards
del x_ids, x_mask, labels
    

In [None]:
# `labels` was deleted in the previous cell, so evaluating it here would raise a
# NameError and abort "Run All". Confirm the deletion without raising instead.
labels_deleted = "labels" not in globals()
labels_deleted

In [None]:
# LOADING BACK: read the three arrays from the .npy files in the working directory
x_ids = np.load("Xids.npy")
x_mask = np.load("Xmask.npy")
labels = np.load("Labels.npy")

In [None]:
# Display the labels array read back from Labels.npy.
labels

In [None]:
import tensorflow as tf
# Disabled GPU check: the call is wrapped in a string literal, so it is never executed.
"""tf.config.experimental.list_physical_devices("GPU")"""

In [None]:
# Build a tf.data.Dataset by slicing the three arrays along their first axis.
dataset = tf.data.Dataset.from_tensor_slices((x_ids,x_mask,labels))
# Each element is a 3-tuple: (row of x_ids, row of x_mask, row of labels).

In [None]:
# Peek at the first element of the dataset
for element in dataset.take(1):
    print(element)

In [None]:
def map_func(input_id,masks,labels):
    """Repackage one ``(ids, mask, labels)`` element as ``(features, labels)``.

    The feature dict keys must equal the names of the model's Keras Input
    layers ("input_ids" and "Attention_mask"), because Keras matches dict
    inputs to input layers by name.

    Args:
        input_id: token ids for one sample.
        masks: attention mask for the same sample.
        labels: target for the same sample.

    Returns:
        A ``(features_dict, labels)`` tuple.
    """
    return {'input_ids': input_id, 'Attention_mask': masks}, labels

In [None]:
# Apply map_func to every element, turning each tuple into (features dict, labels).
dataset = dataset.map(map_func)

In [None]:
# Peek at the first element again to check the new (features, labels) structure
for mapped_element in dataset.take(1):
    print(mapped_element)

In [None]:
# Shuffle once with a fixed seed and keep that order for every pass over the data.
# With the default reshuffle_each_iteration=True the order would change on every
# iteration, so a take()/skip() train/validation split made afterwards would pick
# different samples each epoch and leak validation samples into training.
dataset = dataset.shuffle(100000, seed=42, reshuffle_each_iteration=False).batch(64)

In [None]:
# Number of batches. len() reads the dataset's known cardinality instead of
# iterating over (and holding in memory) every batch just to count them.
dataset_len = len(dataset)
dataset_len

In [None]:
split_ratio = 0.8
n_train_batches = int(dataset_len * split_ratio)

# take(n) keeps the first n batches; skip(n) drops those n and keeps the rest
train = dataset.take(n_train_batches)
valid = dataset.skip(n_train_batches)

del dataset

BUILDING OUR MODEL

In [None]:
#initialize BERT
from transformers import TFAutoModel

In [None]:
# Load the pretrained TensorFlow model for the "bert-base-cased" checkpoint.
bert = TFAutoModel.from_pretrained("bert-base-cased")

In [None]:
# Two int32 inputs of length seql: token ids and attention mask
input_ids = tf.keras.layers.Input(shape=(seql,), dtype='int32', name="input_ids")
mask_id = tf.keras.layers.Input(shape=(seql,), dtype='int32', name="Attention_mask")

# Keep only the first BERT output. The second output (the "pooler output") is
# ignored because pooling is done manually below.
bert_outputs = bert(input_ids, attention_mask=mask_id)
embeddings = bert_outputs[0]

# MANUAL POOLING
X = tf.keras.layers.GlobalMaxPool1D()(embeddings)

In [None]:
# Normalise the pooled features with a BatchNormalization layer before the dense head.
X = tf.keras.layers.BatchNormalization()(X)

In [None]:
# MODELING: dense classification head on top of the normalised pooled features
head_layers = [
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation="relu"),
]
for head_layer in head_layers:
    X = head_layer(X)

# Two-way softmax output, one unit per class
y = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(X)

model = tf.keras.Model(inputs=[input_ids, mask_id], outputs=y)

# Optional (currently disabled): freeze the BERT model by freezing the third layer
#model.layers[2].trainable = False

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Optimiser, loss and metric for the two-unit softmax output with one-hot labels.
# NOTE(review): 0.01 looks high if the BERT weights are left trainable — confirm.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy()
acc = tf.keras.metrics.BinaryAccuracy(name="accuracy")

model.compile(
    optimizer=opt,
    loss=loss,
    metrics=[acc],
)

In [None]:
# Pass the validation split by keyword: the second positional parameter of fit()
# is `y`, which must not be supplied when `x` is a tf.data.Dataset (the targets
# come from the dataset itself).
history = model.fit(train, validation_data=valid, epochs=200)