In [419]:
# Import required libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import bert
from tqdm import tqdm
from collections import namedtuple

from warnings import filterwarnings
filterwarnings('ignore')

In [420]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

In [421]:
# Read the dataset
df = pd.read_csv("D:\\Study\\DataScience\\Projects\\AI-Advertising\\SentimentAnalysis\\sentiments.csv")

df.isnull().values.any()
df.shape

(50000, 2)

In [422]:
train_data = df[:100]
train_data.loc[:, 'y'] =  np.where((train_data.loc[:, 'sentiment'] == 'positive') , 1, 0)

In [423]:
val_data = df[100:200]
val_data.loc[:, 'y'] =  np.where((val_data.loc[:, 'sentiment'] == 'positive') , 1, 0)

In [424]:
train_data.head()

Unnamed: 0,review,sentiment,y
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [425]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_SEQ_LENGTH = 512

def convert_example_to_feature(review):
    # add special tokens
    review_with_special_tokens = '[CLS]' + review + '[SEP]'
    tokenized = tokenizer.tokenize(review_with_special_tokens)

    # convert tokens to ids in WordPiece
    input_ids = tokenizer.convert_tokens_to_ids(tokenized)
    
    if len(input_ids) > MAX_SEQ_LENGTH:
        input_ids = input_ids[: MAX_SEQ_LENGTH]
        special_tokens_mask = [1] * MAX_SEQ_LENGTH
        token_type_ids = [0] * MAX_SEQ_LENGTH
    
    else:   
          
        # precalculation of pad length, so that we can reuse it later on
        padding_length = MAX_SEQ_LENGTH - len(input_ids)
        
        # map tokens to WordPiece dictionary and add pad token for those text shorter than our max length
        input_ids = input_ids + ([0] * padding_length)

        # attention should focus just on sequence with non padded tokens
        special_tokens_mask = [1] * len(input_ids) + [0] * (MAX_SEQ_LENGTH - len(input_ids))

        # token types, needed for example for question answering, for our purpose we will just set 0 as we have just one sequence
        token_type_ids = [0] * MAX_SEQ_LENGTH

    bert_encoding = {
        "token_ids": input_ids,
        "token_type_ids": token_type_ids,
        "special_tokens_mask": special_tokens_mask
        }
    return bert_encoding
    

def encode(df):
    
    df = df.reset_index(drop=True)
    
    input_ids_list = []
    token_type_ids_list = []
    special_tokens_mask_list = []
    label_list = []
    
    for i in range(len(df)) : 
        review = df.loc[i, "review"]
        label = np.int32(df.loc[i, "y"])
        
        bert_layer = convert_example_to_feature(review)
        
        input_ids_list.append(bert_layer['token_ids'])
        token_type_ids_list.append(bert_layer['token_type_ids'])
        special_tokens_mask_list.append(bert_layer['special_tokens_mask'])
        label_list.append([label])
        
    return {"input_ids": input_ids_list, "attention_mask": special_tokens_mask_list,
          "token_type_ids": token_type_ids_list}, label_list

In [426]:
def create_tfdataset(data):
    ds = encode(data)
    encoded_data = ds[0]
    encoded_labels = ds[1]
    encoded_data['labels'] = encoded_labels
    
    input_ids_array = np.array(encoded_data['input_ids'], dtype=np.int32)
    attention_mask_array = np.array(encoded_data['attention_mask'], dtype=np.int32)
    token_type_ids_array = np.array(encoded_data['token_type_ids'], dtype=np.int32)
    labels_array = np.array(encoded_data['labels'], dtype=np.int32)

    tf_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": input_ids_array,
                                               "attention_mask": attention_mask_array,
                                               "token_type_ids": token_type_ids_array,
                                              }, labels_array))
    return tf_dataset

In [427]:
training_data_tf = create_tfdataset(train_data)
validation_data_tf = create_tfdataset(val_data)

Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (749 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (845 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (908 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [431]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5

# we will do just 1 epoch for illustration, though multiple epochs might be better as long as we will not overfit the model
EPOCHS = 10

# Model initialization
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# Compile model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [432]:
bert_history = model.fit(training_data_tf, epochs=EPOCHS, validation_data=validation_data_tf)

Train for 100 steps, validate for 100 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 16/100 [===>..........................] - ETA: 6:12 - loss: 0.8058 - accuracy: 0.4049

KeyboardInterrupt: 