In [1]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x # Ensuring that TensorFlow 2.x is being used
except Exception:
  pass

import tensorflow as tf

tf.compat.v1.enable_eager_execution()

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x # Ensuring that TensorFlow 2.x is being used`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


In [2]:
!pip install transformers



In [0]:
import numpy as np
import pandas as pd
import warnings
import tensorflow.keras as keras
warnings.filterwarnings('ignore')

from transformers import (
    BertConfig,
    BertTokenizer,
    TFBertModel,
    TFDistilBertModel,
    DistilBertTokenizer
)

In [0]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [5]:
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [9]:
batch_1 = df[:5000]
batch_1[1].value_counts()

1    2607
0    2393
Name: 1, dtype: int64

In [0]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(5000, 73)

In [12]:
input_layer = keras.layers.Input(shape = (attention_mask.shape[1],), dtype='int64')
bert = TFBertModel.from_pretrained("bert-base-cased")(input_layer)
bert = bert[0][:,0,:]              
classifier = keras.layers.Dense(units=1, activation='sigmoid')(bert)
model = keras.models.Model(inputs=input_layer, outputs=classifier)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 73)]              0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 73, 768), (None,  108310272 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 108,311,041
Trainable params: 108,311,041
Non-trainable params: 0
_________________________________________________________________


In [0]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics = ["accuracy"])

In [14]:
model.fit(
    [padded, attention_mask],
    [np.array(batch_1[1])],
    epochs=8, 
    validation_split=0.1)

Train on 4500 samples, validate on 500 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7fd1502b4f60>

# Distil Bert Model

In [0]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [0]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [17]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(5000, 73)

In [18]:
input_layer = keras.layers.Input(shape = (attention_mask.shape[1],), dtype='int64')
bert = TFDistilBertModel.from_pretrained("distilbert-base-cased")(input_layer)
bert = bert[0][:,0,:]              
classifier = keras.layers.Dense(units=1, activation='sigmoid')(bert)
model = keras.models.Model(inputs=input_layer, outputs=classifier)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 73)]              0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 73, 768),)        65190912  
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 769       
Total params: 65,191,681
Trainable params: 65,191,681
Non-trainable params: 0
_________________________________________________________________


In [0]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics = ["accuracy"])

In [21]:
model.fit(
    [padded, attention_mask],
    [np.array(batch_1[1])],
    epochs=10, 
    validation_split=0.1)

Train on 4500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd0250aa668>