In [4]:
import tensorflow as tf

# Ask TensorFlow which GPU device it can see.
device_name = tf.test.gpu_device_name()

# Only the canonical name of the first GPU counts as success; any other
# value aborts the notebook right here.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [5]:
import torch

# Choose the torch device used by the rest of the notebook.
if not torch.cuda.is_available():
    # No CUDA device is visible, so fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Run on the GPU and report which one was found.
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn

In [9]:
# Both CSV files are read without a header row (header=None), so the column
# names are assigned explicitly after loading.
df_train = pd.read_csv("train.csv", header = None)
df_valid = pd.read_csv("validation.csv", header = None)

for split_frame in (df_train, df_valid):
    split_frame.columns = ["text", "label"]
    # Lower-case the text column.
    split_frame['text'] = split_frame['text'].str.lower()

In [10]:
# Display the (rows, columns) shape of the training and validation frames.
df_train.shape, df_valid.shape

((16800, 2), (11200, 2))

In [11]:
# Collect the distinct label values that occur in the training split.
unique_labels = df_train.label.unique()

In [12]:
# Display the distinct training labels.
unique_labels

array(['LO', 'NI', 'DS', 'CL', 'DC', 'SE', 'CR'], dtype=object)

In [13]:
# Fit the encoder on the label strings seen in the training split
# (`fit` returns the fitted encoder itself).
le = preprocessing.LabelEncoder().fit(unique_labels)

# String label -> integer class id, for both splits.
for labelled_frame in (df_train, df_valid):
    labelled_frame['label_transformed'] = le.transform(labelled_frame.label)

# Integer class id -> string label again, stored next to the original label.
for labelled_frame in (df_train, df_valid):
    labelled_frame['label_inverse'] = le.inverse_transform(labelled_frame.label_transformed)

In [14]:
# Preview the first rows of the validation frame, including the encoded-label columns.
df_valid.head()

Unnamed: 0,text,label,label_transformed,label_inverse
0,manne et al. designed the first algorithm co...,DC,2,DC
1,we consider the challenge of creating guidel...,SE,6,SE
2,network virtualization techniques allow for ...,NI,5,NI
3,"in the min $k$-cut problem, input is an edge...",DS,3,DS
4,we introduce the notion of being weihrauch-c...,LO,4,LO


In [15]:
# Preview the first rows of the training frame, including the encoded-label columns.
df_train.head()

Unnamed: 0,text,label,label_transformed,label_inverse
0,we extend to natural deduction the approach ...,LO,4,LO
1,"over the last decade, the ieee 802.11 has em...",NI,5,NI
2,motivated by the problem of storing coloured...,DS,3,DS
3,we consider the downlink of a cellular syste...,NI,5,NI
4,meroitic is the still undeciphered language ...,CL,0,CL


In [16]:
from transformers import *

# Load the tokenizer that belongs to the uncased SciBERT checkpoint.
# The status line is printed first so it appears before the tokenizer is loaded.
print('Loading BERT tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




In [17]:
# Raw text and string labels of each split as NumPy arrays.
sentences_train = df_train.text.values
labels_train = df_train.label.values

sentences_valid = df_valid.text.values
# Read the `label` column here: the previous version read `text`, so the
# validation "labels" were actually the abstracts themselves.
labels_valid = df_valid.label.values

In [18]:
def create_input_ids_attention_maks(sentences, max_len):
  """Encode sentences to fixed-length token-id rows plus attention masks.

  Uses the module-level `tokenizer` and the `pad_sequences` helper.

  Args:
    sentences: iterable of text strings to encode.
    max_len: maximum number of token ids kept per sentence; shorter
      sequences are padded with 0 at the end up to this length.

  Returns:
    A two-element list `[input_ids, attention_masks]`. `input_ids` is the
    padded result of `pad_sequences`; `attention_masks` is a list of lists
    holding 1 where the token id is > 0 and 0 where it is padding.
  """
  # `encode` tokenizes the text, adds the special start/end tokens
  # (add_special_tokens=True), maps tokens to ids and truncates to max_len.
  encoded_sentences = [
      tokenizer.encode(
          text,
          add_special_tokens = True,
          max_length = max_len,
          truncation=True,
      )
      for text in sentences
  ]

  # Right-pad (and, if still needed, right-truncate) every row to max_len.
  input_ids = pad_sequences(encoded_sentences, maxlen=max_len, dtype="long",
                            value=0, truncating="post", padding="post")

  # Token id 0 is the padding value used above, so mask it out; every
  # positive id is a real token.
  attention_masks = [
      [int(token_id > 0) for token_id in id_row]
      for id_row in input_ids
  ]

  return [input_ids, attention_masks]

In [19]:
input_ids_train, attention_masks_train = create_input_ids_attention_maks(sentences_train, 256)

In [20]:
input_ids_valid, attention_masks_valid = create_input_ids_attention_maks(sentences_valid, 256)

In [21]:
# Create torch tensors required for DataLoader
train_inputs = torch.tensor(input_ids_train)
validation_inputs = torch.tensor(input_ids_valid)

train_labels = torch.tensor(df_train.label_transformed)
validation_labels = torch.tensor(df_valid.label_transformed)

train_masks = torch.tensor(attention_masks_train)
validation_masks = torch.tensor(attention_masks_valid)

In [22]:
batch_size = 32

# Bundle (input ids, attention mask, label) triples per split.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation keeps the dataset
# order so predictions line up with the rows of the validation frame.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)


In [23]:
#Create custom scibert model for sequence classification
class CustomSciBERTModel(nn.Module):
    """SciBERT encoder followed by one linear classification layer.

    Args:
        num_labels: number of target classes (defaults to the 7 labels used
            in this notebook).
        pretrained_name: checkpoint name passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, num_labels=7, pretrained_name='allenai/scibert_scivocab_uncased'):
        super(CustomSciBERTModel, self).__init__()
        self.scibert = AutoModel.from_pretrained(pretrained_name)
        self.num_labels = num_labels
        # Size the head from the encoder's own configuration instead of a
        # hard-coded 768 so other checkpoints work too.
        self.linear_layer = nn.Linear(self.scibert.config.hidden_size, num_labels)

    def forward(self, ids, mask, labels=None):
        """Run the encoder and the classification head.

        Args:
            ids: token-id tensor forwarded to the encoder as `input_ids`.
            mask: attention-mask tensor forwarded as `attention_mask`.
            labels: optional tensor of integer class ids. When given, the
                cross-entropy loss is returned; when omitted or None, the
                raw logits are returned.

        Returns:
            The loss tensor if `labels` is provided, otherwise the logits.
        """
        output = self.scibert(
            input_ids=ids,
            attention_mask=mask)
        # The second element of the encoder output feeds the linear head
        # (presumably the pooled sequence representation; confirm against
        # the transformers version in use).
        logits = self.linear_layer(output[1])

        # Validation / testing: no labels, so hand back the logits.
        if labels is None:
            return logits

        # Training: return the loss directly.
        loss_fct = nn.CrossEntropyLoss()
        return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

In [24]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Create model object
model = CustomSciBERTModel()

# Move the model to the device selected earlier. Using `device` instead of
# an unconditional `.cuda()` keeps the CPU fallback path working.
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Number of epochs
epochs = 1

# The scheduler is stepped once per batch, so the total number of training
# steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (no warmup steps).
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value 
                                            num_training_steps = total_steps)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




In [29]:
# Compute accuracy
def accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class ids; it is flattened before comparing.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    hits = np.sum(predicted_classes == true_classes)
    return hits / len(true_classes)

In [30]:
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    `datetime.timedelta`, e.g. 61.4 -> "0:01:01".
    '''
    # Work in whole seconds so no fractional part shows up in the output.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [31]:
import random

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# Predictions on the validation data from the most recent epoch.
valid_preds = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. `train()` only switches the mode;
    # it does not perform any training by itself.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted as a string.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader and copy each
        # tensor to the selected device using the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step before running
        # the backward pass; PyTorch accumulates them otherwise.
        model.zero_grad()        

        # Forward pass. With labels supplied the model returns the loss.
        loss = model(ids = b_input_ids, mask = b_input_mask, labels = b_labels)

        # Accumulate the loss as a plain Python number.
        total_loss += loss.item()

        # Backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training batches.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode.
    model.eval()

    # Start from an empty prediction list every epoch so `valid_preds`
    # always has exactly one entry per validation example instead of
    # growing across epochs.
    valid_preds = []

    # `eval_accuracy` sums per-batch accuracy weighted by batch size, so a
    # smaller final batch does not skew the reported figure.
    eval_accuracy = 0.0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Move the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # The model returns logits when no labels are given.
            logits = model(b_input_ids, 
                           b_input_mask, labels = None)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        valid_preds.extend(np.argmax(logits, axis=1).flatten())

        # Weight this batch's accuracy by the number of examples in it.
        batch_examples = len(label_ids)
        eval_accuracy += accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run (nan if the
    # validation set is empty rather than dividing by zero).
    final_accuracy = eval_accuracy / nb_eval_examples if nb_eval_examples else float('nan')
    print("  Accuracy: {0:.2f}".format(final_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of    525.    Elapsed: 0:01:01.
  Batch    80  of    525.    Elapsed: 0:02:01.
  Batch   120  of    525.    Elapsed: 0:03:01.
  Batch   160  of    525.    Elapsed: 0:04:01.
  Batch   200  of    525.    Elapsed: 0:05:01.
  Batch   240  of    525.    Elapsed: 0:06:02.
  Batch   280  of    525.    Elapsed: 0:07:02.
  Batch   320  of    525.    Elapsed: 0:08:02.
  Batch   360  of    525.    Elapsed: 0:09:02.
  Batch   400  of    525.    Elapsed: 0:10:02.
  Batch   440  of    525.    Elapsed: 0:11:02.
  Batch   480  of    525.    Elapsed: 0:12:01.
  Batch   520  of    525.    Elapsed: 0:13:01.

  Average training loss: 0.15
  Training epcoh took: 0:13:09

Running Validation...
  Accuracy: 0.93
  Validation took: 0:03:06

Training complete!
