# Building a Sentiment Classifier Model

### Initialising the tools

In [3]:
# Importing all the necessary packages
# If there is an error, double check that all the versions are installed correctly
import tensorflow as tf
import re
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import time

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import label_binarize
from sklearn.metrics import recall_score
from sklearn.utils import compute_class_weight
from tensorflow.keras.optimizers import SGD
from scipy import interp
from itertools import cycle
from sklearn.metrics import roc_curve, auc

In [4]:
# Check to see if tensorflow can detect your GPUs
tf.config.list_physical_devices('GPU')

[]

### Initialising the BERT Model

In [22]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_75 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


### Importing the Datasets as Pandas Dataframes

In [6]:
train_df = pd.read_json('train.json')
print(train_df.shape)
train_df.head()

(7401, 2)


Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


In [7]:
test_df = pd.read_json('test.json')
print(test_df.shape)
test_df.head()

(1851, 1)


Unnamed: 0,reviews
0,I bought 2 sleepers. sleeper had holes in the...
1,I dare say these are just about the sexiest th...
2,"everything about the transaction (price, deliv..."
3,"Not bad for just a shirt. Very durable, and m..."
4,These are truly wrinkle free and longer than t...


### Converting the DataFrames to feature format

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
train, validation = train_test_split(train_df, test_size=0.2)

In [9]:
InputExample(guid=None,
             text_a = "Hello, world",
             text_b = None,
             label = 1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [10]:
def convert_data_to_examples(train, test, Review, Sentiment): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None,
                                                          text_a = x[Review], 
                                                          text_b = None,
                                                          label = x[Sentiment]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None,
                                                          text_a = x[Review], 
                                                          text_b = None,
                                                          label = x[Sentiment]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

  train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                           test, 
                                                                           'reviews', 
                                                                           'sentiments')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


Review = 'reviews'
Sentiment = 'sentiments'

In [11]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, validation, Review, Sentiment)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(10).batch(32).repeat(50)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



### Training the Model

In [23]:
# Filepath to store the model's weights
filepathexport = "weights1.best.hdf5"

# Checkpoint to store the model after each epoch
# Earlystopping clause to stop training the model once it stops improving
checkpoint = ModelCheckpoint(filepathexport, monitor="val_loss", verbose=1, save_best_only=True, mode='min', save_weights_only=True)
earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [earlystop, checkpoint]

In [24]:
# Compiling the model
opt = SGD(learning_rate=0.001, decay=1e-8, momentum=0.8, nesterov=True)
model.compile(optimizer=opt, 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [25]:
model.fit(train_data, epochs=100, callbacks=callbacks_list, validation_data=validation_data, steps_per_epoch=250)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 1/100


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.



Epoch 00001: val_loss improved from inf to 0.15677, saving model to weights1.best.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.15677
Epoch 3/100

Epoch 00003: val_loss improved from 0.15677 to 0.15135, saving model to weights1.best.hdf5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.15135
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.15135
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.15135
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.15135
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.15135
Epoch 00008: early stopping


<tensorflow.python.keras.callbacks.History at 0x1c580c5b8b0>

In [26]:
#filepathimport = "weights.best.hdf5"
model.load_weights('weights1.best.hdf5')

#Compiling the model
opt = SGD(lr=0.001, decay=1e-8, momentum=0.8, nesterov=True)
model.compile(optimizer=opt, 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [19]:
model.save_weights('./savedmodel2')

### Outputting the Model's Predictions

In [27]:
pred_sentences = test_df["reviews"].to_numpy()
pred_sentences=list(pred_sentences)

In [30]:
# Using BERT tokenizer
encoded = tokenizer.batch_encode_plus(pred_sentences, max_length=128, add_special_tokens=True,
      return_token_type_ids=True,
      truncation=True,
      padding=True,
      return_attention_mask=True,
      return_tensors='tf')

# Running tokenized input through model
tf_outputs = model(encoded)

In [36]:
# Softmax layer for predictions
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

labels = ['Negative', 'Positive']
label = tf.argmax(tf_predictions,axis=1)
label = label.numpy()
#for x in label:
#    print(labels[x])

predictions = []
for i in range(len(pred_sentences)):
    predictions.append(labels[label[i]])

In [37]:
print(predictions)

['Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Positive', 'Po

In [38]:
predicted_final= {'Review':pred_sentences,'Prediction':predictions}

In [39]:
sentiments = pd.DataFrame(predicted_final)

In [41]:
sentiments.to_csv('submission.csv')