[tutorial link](http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/)

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd
import numpy as np
import transformers
import tensorflow.keras as keras
from tqdm import tqdm

tqdm.pandas()

In [2]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', names=['text', 'label'])
df

Unnamed: 0,text,label
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
6915,"painful , horrifying and oppressively tragic ,...",1
6916,take care is nicely performed by a quintet of ...,0
6917,"the script covers huge , heavy topics in a bla...",0
6918,a seriously bad film with seriously warped log...,0


In [3]:
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

In [4]:
# df = df.iloc[:1000]

In [5]:
df['token'] = df['text'].progress_apply(
    lambda x: tokenizer.encode(
        str(x),
        add_special_tokens=True
    )
)
df['token']

100%|██████████| 6920/6920 [00:04<00:00, 1696.75it/s]


0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
6915    [101, 9145, 1010, 7570, 18752, 14116, 1998, 28...
6916    [101, 2202, 2729, 2003, 19957, 2864, 2011, 103...
6917    [101, 1996, 5896, 4472, 4121, 1010, 3082, 7832...
6918    [101, 1037, 5667, 2919, 2143, 2007, 5667, 2561...
6919    [101, 1037, 12090, 2135, 2512, 5054, 19570, 23...
Name: token, Length: 6920, dtype: object

In [6]:
df['token'][0]
MAX_LEN = df['token'].map(lambda x: len(x)).max()
MAX_LEN

[101,
 1037,
 18385,
 1010,
 6057,
 1998,
 2633,
 18276,
 2128,
 16603,
 1997,
 5053,
 1998,
 1996,
 6841,
 1998,
 5687,
 5469,
 3152,
 102]

67

We pad manually because "encode_plus()" has issue with the different layers:
* it creates indeed an attention layer and can pad (which is good)
* creates an error with the type layer (which is an inconsistency issue in the package)

In [7]:
df['token'] = df['token'].map(
    lambda x: np.pad(x, (0,MAX_LEN-len(x)), constant_values=0)
)
df['token'].apply(len)

0       67
1       67
2       67
3       67
4       67
        ..
6915    67
6916    67
6917    67
6918    67
6919    67
Name: token, Length: 6920, dtype: int64

In [8]:
df['attention'] = df['token'].map(
    lambda x: x != 0
)
df['attention'][0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [9]:
np.vstack(df['token'].to_numpy())

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  1996,  5896, ...,     0,     0,     0],
       [  101,  1037,  5667, ...,     0,     0,     0],
       [  101,  1037, 12090, ...,     0,     0,     0]])

The following code was used when I did not needed a batch (did not overflow RAM)

In [50]:
from tensorflow import constant
token = constant(np.vstack(df['token'].to_numpy()))
attention = constant(np.vstack(df['attention'].to_numpy()))

token.shape, attention.shape
# 
# hidden_states = model(token, attention_mask = attention)

(TensorShape([6920, 67]), TensorShape([6920, 67]))

I used a batch with tf.data.Dataset to not overflow RAM and learn about batch dataset with TF

In [42]:
from tensorflow import concat
from tensorflow.data import Dataset

BATCH_SIZE = 50
data = Dataset.zip((
    Dataset.from_tensor_slices(np.vstack(df['token'].to_numpy())),
    Dataset.from_tensor_slices(np.vstack(df['attention'].to_numpy()))
))

hidden_states = None
for batch in tqdm(data.batch(BATCH_SIZE)):
    t = model(batch[0], attention_mask=batch[1])[0]
    if hidden_states is None:
        hidden_states = t
    else:
        hidden_states = concat([hidden_states, t], axis=0)

139it [05:53,  2.54s/it]


In [43]:
hidden_states.shape

TensorShape([6920, 67, 768])

In [44]:
features = hidden_states[:,0,:].numpy()
features.shape

(6920, 768)

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [46]:
x_train, x_test, y_train, y_test = train_test_split(features, df['label'].to_numpy())
y_train[y_train == 0] = -1
y_test[y_test == 0] = -1

In [47]:
svm = OneClassSVM()
svm.fit(x_train, y_train)
accuracy_score(y_test, svm.predict(x_test))

OneClassSVM()

0.500578034682081

In [48]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

LogisticRegression()

0.8410404624277457

In [49]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

RandomForestClassifier()

0.7936416184971098