# Transfer learning with BERT as a base model

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification # for sentimental analysis

### Load Amazon Reviews Dataset

In [3]:
# Read the pre-cleaned Amazon reviews from the project-relative CSV file.
amazon_reviews_df = pd.read_csv(
    filepath_or_buffer='data/cleaned_amazon_reviews.csv',
)

In [4]:
# Preview the first five rows of the loaded frame.
amazon_reviews_df.head(n=5)

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0


In [5]:
# Summary statistics (count, mean, std, quartiles) of the frame's columns.
amazon_reviews_df.describe()

Unnamed: 0,cleaned_review_length,review_score
count,17340.0,17340.0
mean,30.300461,3.649077
std,35.83654,1.6735
min,0.0,1.0
25%,9.0,2.0
50%,20.0,5.0
75%,38.0,5.0
max,571.0,5.0


### Data Preprocessing

In [6]:
# Remove the columns that are not needed for modelling.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns are gone.
amazon_reviews_df = amazon_reviews_df.drop(
    columns=["cleaned_review_length", "review_score"],
    errors="ignore",
)

In [7]:
# Discard every row that contains at least one missing value.
amazon_reviews_df = amazon_reviews_df.dropna()

In [36]:
# Separate the target column from the remaining feature columns.
y_amazon_reviews = amazon_reviews_df['sentiments']
X_amazon_reviews = amazon_reviews_df.drop(columns=['sentiments'])

# Encode the target: class names -> integer ids -> categorical (one row per sample).
le = LabelEncoder()
y_amazon_reviews_labels = le.fit(y_amazon_reviews).transform(y_amazon_reviews)
y_amazon_reviews = to_categorical(y_amazon_reviews_labels)

In [9]:
# Hold out 20% of the data as the test set (fixed seed for reproducibility).
X_train_reviews, X_test_reviews, y_train_reviews, y_test_reviews = train_test_split(
    X_amazon_reviews,
    y_amazon_reviews,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Carve a validation set (33%) out of the remaining training data, same seed.
X_train_reviews, X_valid_reviews, y_train_reviews, y_valid_reviews = train_test_split(
    X_train_reviews,
    y_train_reviews,
    test_size=0.33,
    random_state=42,
)

### Data preparation for BERT model structure

In [11]:
# Name of the pretrained checkpoint; used for the tokenizer here and for the model later on.
model_name = "bert-base-uncased"

# Tokenizer belonging to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def encode_reviews(reviews, max_length=None):
    """Tokenize an iterable of review texts into NumPy arrays.

    Args:
        reviews: Iterable of review strings.
        max_length: When None, sequences are padded to the longest sequence of
            this call (and truncated to the tokenizer's model limit). When given,
            every sequence is padded/truncated to exactly this many tokens.

    Returns:
        The tokenizer output holding `input_ids` and `attention_mask`
        (token type ids are not requested).
    """
    return tokenizer(
        list(reviews),
        truncation=True,
        padding=True if max_length is None else "max_length",
        max_length=max_length,
        return_tensors="np",
        return_token_type_ids=False,
    )


train_encodings = encode_reviews(X_train_reviews['cleaned_review'])

# The Keras inputs built later have a fixed width taken from the training split.
# padding=True pads each call to its *own* longest sequence, so the validation
# and test splits are forced to the training width to keep all shapes identical.
max_seq_len = train_encodings['input_ids'].shape[1]

valid_encodings = encode_reviews(X_valid_reviews['cleaned_review'], max_length=max_seq_len)
test_encodings = encode_reviews(X_test_reviews['cleaned_review'], max_length=max_seq_len)

In [13]:
# Pair each split's encoded inputs (as a plain dict of arrays) with its labels
# and slice along the first axis so that one dataset element is one review.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train_reviews))

valid_dataset = tf.data.Dataset.from_tensor_slices((dict(valid_encodings), y_valid_reviews))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test_reviews))

In [14]:
batch_size = 8

# Only the training split is shuffled; full batches keep the training step shape static.
train_dataset = train_dataset.shuffle(1000).batch(batch_size, drop_remainder=True)

# Validation and test data keep their original order and all of their samples:
# - a shuffled dataset is reshuffled on every iteration, so predictions could not
#   be aligned with y_test_reviews and predict()/evaluate() would see different orders;
# - drop_remainder=True would silently exclude the last partial batch from the metrics.
valid_dataset = valid_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [15]:
# Sequence length of the encoded inputs, read from the dataset's static element
# spec: element 0 is the feature dict, axis 1 of `input_ids` is the token axis.
seq_num = train_dataset.element_spec[0]['input_ids'].shape[1]

### BERT model build and fine-tuning

In [16]:
# Load the pretrained checkpoint. Only its `.bert` main layer is reused below;
# the checkpoint's own classification head is not part of the final model.
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Symbolic inputs; their names match the keys of the feature dicts in the datasets.
input_ids = tf.keras.layers.Input(name='input_ids', shape=(seq_num,), dtype='int32')
attention_mask = tf.keras.layers.Input(name='attention_mask', shape=(seq_num,), dtype='int32')

# Run the pretrained BERT main layer and keep element 1 of its output
# (presumably the pooled sequence representation — confirm against the Transformers docs).
embeddings = model.bert(input_ids, attention_mask=attention_mask)[1]

# Trainable head: one hidden ReLU layer followed by a softmax over the sentiment classes.
x = tf.keras.layers.Dense(units=seq_num * 2, activation='relu')(embeddings)
y = tf.keras.layers.Dense(
    units=y_amazon_reviews.shape[1],  # one output unit per one-hot label column
    activation='softmax',
    name='outputs',
)(x)

In [18]:
# Assemble the end-to-end Keras model from the two inputs to the softmax output.
# NOTE(review): this rebinds `model`, which previously held the pretrained
# Transformers model; the cell that accesses `model.bert` cannot be re-run
# after this one without reloading the checkpoint first.
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=y)

# Print the layer-by-layer overview of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [19]:
# Configure training: Adam with a small learning rate and gradient-norm clipping,
# categorical cross-entropy on the one-hot targets, accuracy as the tracked metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy')],
)

In [20]:
# Fine-tune for three epochs, reporting metrics on the validation split after each epoch.
model.fit(train_dataset, validation_data=valid_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fedc9b20430>

In [21]:
# Save the fine-tuned model under a relative path so it can be loaded later
# without repeating the training run.
model.save('sentiment-bert-amazon-model')



In [23]:
# Run the model on the batched test dataset; the softmax output layer yields
# one score per class for every predicted sample.
predictions = model.predict(test_dataset)



In [55]:
# Convert each row of class probabilities into a one-hot integer vector.
# Rounding the probabilities (np.rint) yields an all-zero row whenever no class
# exceeds 0.5, i.e. "no prediction"; argmax always selects the single most
# probable class instead.
n_classes = predictions.shape[1]
predicted_class_ids = np.argmax(predictions, axis=1)

# Same format as before: a list with one int array of length n_classes per sample.
pred_results = list(np.eye(n_classes, dtype=int)[predicted_class_ids])

In [60]:
# Loss and metric values on the test split.
# Stored as `test_metrics` because the previous name `eval` shadowed
# Python's built-in eval() for the rest of the kernel session.
test_metrics = model.evaluate(test_dataset)

