In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFAutoModel

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import transformers
# transformers.__version__

## Data Collection

In [3]:
df = pd.read_csv('data/train.tsv', sep='\t')
df.head(2)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2


In [4]:
df.shape

(156060, 4)

In [5]:
df.isna().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.drop_duplicates(subset=['SentenceId'], keep='first').shape


(8529, 4)

In [8]:
df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [None]:
df['Sentiment'].value_counts().plot(kind='bar')

## Data Preprocessing

In [9]:
seq_len = 512
num_samples = len(df)

num_samples, seq_len

(156060, 512)

In [10]:
# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# tokenize - this time returning Numpy tensors
tokens = tokenizer(df['Phrase'].tolist(), 
                   max_length=seq_len, 
                   truncation=True,
                   padding='max_length', 
                   add_special_tokens=True,
                   return_tensors='np')


In [11]:
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
# tokens

In [18]:
# with open('data/movie-xids.npy', 'wb') as f:
#     np.save(f, tokens['input_ids'])
    
# with open('data/movie-xmask.npy', 'wb') as f:
#     np.save(f, tokens['attention_mask'])

In [19]:
del tokens

In [14]:
# first extract sentiment column
arr = df['Sentiment'].values

In [15]:
# we then initialize the zero array
labels = np.zeros((num_samples, arr.max()+1))
labels.shape

(156060, 5)

In [16]:
labels[np.arange(num_samples), arr] = 1

labels

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [23]:
# with open('data/movie-labels.npy', 'wb') as f:
#     np.save(f, labels)

## Dataset Building

In [50]:
# with open('data/movie-xids.npy', 'rb') as f:
#     Xids = np.load(f, allow_pickle=True)
    
# with open('data/movie-xmask.npy', 'rb') as f:
#     Xmask = np.load(f, allow_pickle=True)
    
# with open('data/movie-labels.npy', 'rb') as f:
#     labels = np.load(f, allow_pickle=True)

In [17]:
Xids = tokens['input_ids']
Xmask = tokens['attention_mask']

In [18]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

dataset.take(1)

<TakeDataset shapes: ((512,), (512,), (5,)), types: (tf.int32, tf.int32, tf.float64)>

In [19]:
def map_function(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels


In [20]:
dataset = dataset.map(map_function)
dataset.take(1)

<TakeDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, (5,)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.float64)>

In [24]:
batch_size = 32
split = 0.9

# we need to calculate how many batches must be taken to create 90% training set
size = int((Xids.shape[0] / batch_size) * split)

print(size)

4389


In [25]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

# free up memory
del dataset

#### save both to file using tf.data.experimental.save.

In [46]:
# tf.data.experimental.save(train_ds, 'data/train')
# tf.data.experimental.save(val_ds, 'data/val')

In [26]:
train_ds.element_spec

({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)},
 TensorSpec(shape=(5,), dtype=tf.float64, name=None))

In [27]:
val_ds.element_spec == train_ds.element_spec

True

##### Loading the data 

In [None]:
# ds = tf.data.experimental.load('train', element_spec=train_ds.element_spec)


In [30]:
# batch_size = 32
# dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
# dataset.take(1)

In [31]:
split = 0.9

# we need to calculate how many batches must be taken to create 90% training set
size = int((Xids.shape[0] / batch_size) * split)

size

4389

In [32]:
# train_ds = dataset.take(size)
# val_ds = dataset.skip(size)

# # free up memory
# del dataset

#### Saving the data

In [57]:
# tf.data.experimental.save(train_ds, 'train')
# tf.data.experimental.save(val_ds, 'val')

##### Model Building and Saving

In [34]:
bert = TFAutoModel.from_pretrained('bert-base-cased')

# we can view the model using the summary method
bert.summary()


Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [35]:
# two input layers, we ensure layer name variables match to dictionary keys in TF dataset
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

In [36]:
# we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)
embeddings = bert.bert(input_ids, attention_mask=mask)[1]  # access final activations (alread max-pooled) [1]
# convert bert embeddings into 5 output classes
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [38]:
# initialize model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# (optional) freeze bert layer
# model.layers[2].trainable = False

# print out model summary
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 1024)         787456      bert[0][1]                 

In [39]:
optimizer = tf.keras.optimizers.Adam(lr=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [41]:
element_spec = ({'input_ids': tf.TensorSpec(shape=(16, 512), dtype=tf.float64, name=None),
                 'attention_mask': tf.TensorSpec(shape=(16, 512), dtype=tf.float64, name=None)},
                tf.TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))

# load the training and validation sets
# train_ds = tf.data.experimental.load('train', element_spec=element_spec)
# val_ds = tf.data.experimental.load('val', element_spec=element_spec)

# view the input format
train_ds.take(1)

<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 5)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [42]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3
)

Epoch 1/3


InvalidArgumentError:  Data type mismatch at component 0: expected double but got int32.
	 [[node IteratorGetNext (defined at <ipython-input-42-dd164fcc4556>:1) ]] [Op:__inference_train_function_21368]

Function call stack:
train_function
