In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import bert
from tqdm import tqdm
from collections import namedtuple

In [261]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

In [3]:
# Read the dataset
df = pd.read_csv("D:\\Study\\DataScience\\Projects\\AI-Advertising\\SentimentAnalysis\\sentiments.csv")

df.isnull().values.any()
df.shape

(50000, 2)

In [None]:
data = df[:5]
X = data['review'].fillna("CVxTz").values

y = data['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

data['y'] = y

In [4]:
train_data = df[:5]
train_data.loc[:, 'y'] =  np.where((train_data.loc[:, 'sentiment'] == 'positive') , 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [5]:
val_data = df[5:10]
val_data.loc[:, 'y'] =  np.where((val_data.loc[:, 'sentiment'] == 'positive') , 1, 0)

In [8]:
train_data

Unnamed: 0,review,sentiment,y
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [302]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_SEQ_LENGTH = 512

def convert_example_to_feature(review):
    # add special tokens
    review_with_special_tokens = '[CLS]' + review + '[SEP]'
    tokenized = tokenizer.tokenize(review_with_special_tokens)

    # convert tokens to ids in WordPiece
    input_ids = tokenizer.convert_tokens_to_ids(tokenized)
  
    # precalculation of pad length, so that we can reuse it later on
    padding_length = MAX_SEQ_LENGTH - len(input_ids)
        
    # map tokens to WordPiece dictionary and add pad token for those text shorter than our max length
    input_ids = input_ids + ([0] * padding_length)

    # attention should focus just on sequence with non padded tokens
    #special_tokens_mask = [1] * len(input_ids)

    # do not focus attention on padded tokens
    #special_tokens_mask = special_tokens_mask + ([0] * padding_length)
    
    special_tokens_mask = [1] * len(input_ids) + [0] * (MAX_SEQ_LENGTH - len(input_ids))

    # token types, needed for example for question answering, for our purpose we will just set 0 as we have just one sequence
    token_type_ids = [0] * MAX_SEQ_LENGTH

    bert_encoding = {
        "token_ids": input_ids,
        "token_type_ids": token_type_ids,
        "special_tokens_mask": special_tokens_mask
        }
    return bert_encoding
    

def encode(df):
    
    df = df.reset_index(drop=True)
    
    input_ids_list = []
    token_type_ids_list = []
    special_tokens_mask_list = []
    label_list = []
    
    for i in range(len(df)) : 
        review = df.loc[i, "review"]
        label = np.int32(df.loc[i, "y"])
        
        bert_layer = convert_example_to_feature(review)
        
        input_ids_list.append(bert_layer['token_ids'])
        token_type_ids_list.append(bert_layer['token_type_ids'])
        special_tokens_mask_list.append(bert_layer['special_tokens_mask'])
        label_list.append([label])

        #return tf.data.Dataset.from_tensor_slices((input_ids_list, special_tokens_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)
        
    return {"input_ids": input_ids_list, "attention_mask": special_tokens_mask_list,
          "token_type_ids": token_type_ids_list}, label_list

In [303]:
train_ds = encode(train_data)
val_ds = encode(val_data)

training_data = train_ds[0]
validation_data = val_ds[0]

labels_train = train_ds[1]
labels_test = val_ds[1]

training_data['labels'] = labels_train
validation_data['labels'] = labels_test

In [304]:
print(training_data.keys())

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'labels'])


In [317]:
input_ids_array = np.array(training_data['input_ids'], dtype=np.int32)
attention_mask_array = np.array(training_data['attention_mask'], dtype=np.int32)
token_type_ids_array = np.array(training_data['token_type_ids'], dtype=np.int32)
labels_array = np.array(training_data['labels'], dtype=np.int32)

training_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": input_ids_array,
                                               "attention_mask": attention_mask_array,
                                               "attention_mask": token_type_ids_array,
                                              }, labels_array))

In [315]:
val_input_ids_array = np.array(validation_data['input_ids'], dtype=np.int32)
val_attention_mask_array = np.array(validation_data['attention_mask'], dtype=np.int32)
val_token_type_ids_array = np.array(validation_data['token_type_ids'], dtype=np.int32)
val_labels_array = np.array(validation_data['labels'], dtype=np.int32)

validation_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": val_input_ids_array,
                                               "attention_mask": val_attention_mask_array,
                                               "attention_mask": val_token_type_ids_array,
                                              }, val_labels_array))

In [319]:
type(validation_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [328]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# we will do just 1 epoch for illustration, though multiple epochs might be better as long as we will not overfit the model

number_of_epochs = 1
# model initialization

model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
# classifier Adam recommended

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.CategoricalCrossentropy

metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [329]:
bert_history = model.fit(training_dataset, epochs=1, validation_data=validation_dataset)

Train for 5 steps, validate for 5 steps
