In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 43.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 51.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 525 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 27.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [3]:
# Check for NVIDIA GPU on system.
# `nvidia-smi` reports the driver and CUDA versions plus, per attached GPU,
# its name, temperature, power draw, memory usage and utilization.
!nvidia-smi

Sun Dec 19 12:13:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Get .tsv files from local filesystem.
# `uploaded` is indexed by file name further down ('imdb_train.tsv' and
# 'imdb_val.tsv'), and 'dataset_sentences.tsv' is later read back from the
# working directory, so all three files have to be selected in the upload dialog.
from google.colab import files
uploaded = files.upload()

Saving dataset_sentences.tsv to dataset_sentences.tsv
Saving imdb_train.tsv to imdb_train.tsv
Saving imdb_val.tsv to imdb_val.tsv


In [6]:
'''
Methods for data preprocessing and tokenization for the BERT Classifier.
Code is largely based on the following article: 

Title: Sentiment Analysis in 10 Minutes with BERT and TensorFlow
Author: Orhan G. Yalçın
URL: https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671
Date: 28-11-2020
Accessed on: 19-12-2021
'''

import os
import shutil
import io

import tensorflow as tf
import pandas as pd
import numpy as np

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Name of the DataFrame column whose value is passed to InputExample as `text_a`.
DATA_COL = "DATA_COLUMN"
# Name of the DataFrame column whose value is passed to InputExample as `label`.
LABEL_COL = "LABEL_COLUMN"
# NOTE(review): presumably the Colab Google Drive mount point; it is not
# referenced by any code visible in this notebook - confirm whether still needed.
DATA_DIR = "/content/drive/MyDrive"


def to_input(train, test):
  '''
  Wrap every row of the training and validation frames in an InputExample.

  Args:
    train: DataFrame providing a DATA_COL column (used as `text_a`) and a
      LABEL_COL column (used as `label`); one example is built per row.
    test: DataFrame with the same two columns, used for validation.

  Returns:
    A tuple (train_input, val_input) with the result of applying the
    row -> InputExample conversion to `train` and `test` respectively,
    in the original row order.
  '''
  def _to_example(row):
    # Single-sentence classification: only text_a is populated.
    return InputExample(guid=None,  # Globally unique ID for bookkeeping, unused in this case
                        text_a=row[DATA_COL],
                        text_b=None,
                        label=row[LABEL_COL])

  train_input = train.apply(_to_example, axis=1)
  val_input = test.apply(_to_example, axis=1)

  return train_input, val_input
  
def to_tf_dataset(examples, tokenizer, max_length=128):
    '''
    Tokenize examples and expose them as a tf.data.Dataset.

    Args:
        examples: iterable of objects exposing `text_a` (the text to encode)
            and `label` attributes. `text_b` is not used.
        tokenizer: tokenizer object providing `encode_plus`.
        max_length: length every encoded sequence is truncated or padded to.

    Returns:
        A tf.data.Dataset built from a generator that yields (inputs, label)
        pairs. `inputs` is a dict with "input_ids", "attention_mask" and
        "token_type_ids" declared as int32 vectors; `label` is declared as
        an int64 scalar.
    '''
    features = []

    # Tokenization happens eagerly here; the generator below only replays
    # the already-encoded features.
    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            padding="max_length",  # pad if max_length > len(s); replaces deprecated pad_to_max_length=True
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [None]:
# Load the pretrained BERT classifier and its matching tokenizer.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Parse the uploaded train/validation TSVs and wrap their rows as InputExamples.
train = pd.read_csv(io.BytesIO(uploaded['imdb_train.tsv']), delimiter='\t')
test = pd.read_csv(io.BytesIO(uploaded['imdb_val.tsv']), delimiter='\t')
train_InputExamples, validation_InputExamples = to_input(train, test)

# Training pipeline: tokenize, then shuffle -> batch -> repeat.
# NOTE(review): repeat(2) here combined with epochs=2 in fit() means each Keras
# epoch iterates the training set twice - confirm this is intended.
train_data = (
    to_tf_dataset(list(train_InputExamples), tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)
# Validation pipeline: tokenize and batch only.
validation_data = to_tf_dataset(list(validation_InputExamples), tokenizer).batch(32)

# Compile and fit model to data.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])
model.fit(train_data, epochs=2, validation_data=validation_data)

# Make inferences and write to gold file: one tab-separated row per sentence.
pred = pd.read_csv('dataset_sentences.tsv', delimiter='\t')
with open('predictions.tsv', 'w') as f:
    header = '\t'.join(['sentence index', 'sentence', 'label', 'num'])
    f.write(header + '\n')

    for sentence, sentence_idx in zip(pred['sentence'], pred['sentence_index']):
        tokens = tokenizer(sentence, max_length=128, padding=True, truncation=True, return_tensors='tf')
        tf_out = model(tokens)

        # NOTE(review): scores come from an element-wise sigmoid over the two
        # logits, while training used SparseCategoricalCrossentropy on logits;
        # the two scores therefore need not sum to 1 - confirm this is intended.
        pred_conf = np.array(tf.nn.sigmoid(tf_out[0]))[0]
        pred_lbl = np.argmax(pred_conf)

        # Fold label and confidence into one number: below 0.5 for label 0,
        # above 0.5 for label 1.
        if pred_lbl == 0:
            pred_num = 0.5 - pred_conf[0] / 2
        else:
            pred_num = 0.5 + pred_conf[1] / 2

        row = [str(sentence_idx), sentence, str(pred_lbl), str(pred_num)]
        f.write('\t'.join(row) + '\n')

In [None]:
# Extract gold file to local filesystem.
# predictions.tsv is written to the current working directory, so no directory
# change is needed; the former `!cd sample_data` ran in its own subshell and
# never affected the kernel's working directory.
files.download('predictions.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>