In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Tokenize the classifier dataset

In [None]:
!pip install sentencepiece
!pip install transformers



In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
import torch


def my_tokenize(articles, labels, max_len):
    '''
    Tokenize a dataset of articles.

    Parameters:
      `articles` - List of articles, represented as strings.
        `labels` - List of labels for the corresponding articles.
       `max_len` - Truncate all of the articles down to this length.

    Returns:
      `input_ids` - All of the articles represented as lists of token IDs,
                    padded out to `max_len`, and cast as a PyTorch tensor.
         `labels` - The labels for the corresponding articles, formatted as
                    a PyTorch tensor.
      `attention_masks` - PyTorch tensor with the same dimensions as
                          `input_ids`. For each token, simply indicates whether
                           it is padding or not.
    '''
    # Tokenize all of the articles and map the tokens to thier word IDs.
    input_ids = []
    attention_masks = []

    print('Tokenizing {:,} articles...'.format(len(articles)))

    # For every article...
    for art in articles:

        # Report progress.
        if ((len(input_ids) % 100) == 0):
            print('  Tokenized {:,} articles.'.format(len(input_ids)))

        # `encode_plus` will:
        #   (1) Tokenize the article.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the article to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            art,                      # article to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_len,      # Pad & truncate all articles.
                            padding = 'max_length',
                            truncation = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                    )

        # Add the encoded article to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Convert string labels to numeric values
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    numeric_labels = [label_mapping[label] for label in labels]

    # Convert the labels to a tensor
    b_labels = torch.tensor(numeric_labels).to(device)

    return input_ids, labels, attention_masks

In [None]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/xian/sentences_with_sentiment.csv')
sentiment_mapping = {0: 'Neutral', 1: 'Positive', -1: 'Negative'}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)
df

Unnamed: 0,text,meta,sentence_id,sentiment
0,Yes.,{'company_name': 'Equitas Small Finance Bank L...,290E07F9-CC12-49A7-AFDA-90D8B990B3E7_001_049_0...,Neutral
1,Our solutions address those challenges by rapi...,"{'company_name': 'Cognyte Software Ltd', 'econ...",5FEED293-F3D6-4EB2-AF78-10F836DF18E1_000_002_0...,Positive
2,"On the film side, we are seeing an unprecedent...","{'company_name': 'Imax Corp', 'economic_sector...",F8CD4487-B16C-406B-AEB8-329507BAA839_000_002_0...,Positive
3,"And with that, I'd like to turn the call over ...","{'company_name': 'RadNet Inc', 'economic_secto...",35C8EA53-5EAC-4949-AD85-77E4E021A114_000_001_0...,Neutral
4,Welcome to our fourth quarter and full year 20...,"{'company_name': 'SB Financial Group Inc', 'ec...",D4DF925D-2B99-4B7A-8EB4-5F28BD9B4215_000_002_0...,Neutral
...,...,...,...,...
514,We'll work with our customers on their require...,"{'company_name': 'Meritor Inc', 'economic_sect...",AA183E6A-5BC6-4081-91E9-A03A0C3C289F_000_004_0...,Positive
515,I think that's fair.,"{'company_name': 'Genuine Parts Co', 'economic...",44629B65-7B84-46E8-85E6-D0FF3179698E_001_087_0...,Neutral
516,"Our customers have chemical plants, typically ...","{'company_name': 'TEL FSI Inc', 'economic_sect...",2117359B-FFB1-4BCF-B6B5-7BE3A91DFE13_001_035_0...,Neutral
517,I should probably also point out that we had o...,{'company_name': 'Multi-Fineline Electronix In...,3583F2E7-7827-43E4-9A57-4C2248CCB82C_001_075_0...,Positive


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split into training+validation and testing sets
X_temp, X_test, y_temp, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

# Split the training+validation set into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [None]:
train = pd.DataFrame({'sentence': X_temp, 'label': y_temp})
test = pd.DataFrame({'sentence': X_test, 'label': y_test})

In [None]:
train

Unnamed: 0,sentence,label
433,"In content operations, for example, we are hel...",Positive
517,I should probably also point out that we had o...,Positive
208,I can give you the swing on the circuit board ...,Neutral
332,IT Services business delivered a strong year o...,Positive
220,"Thanks, Neil.",Neutral
...,...,...
71,"In the large majority of cases, we're very hap...",Positive
106,"Thank you, Mark.",Neutral
270,So have the LTVs somehow fallen and you have a...,Negative
435,And again we believe that the market is up at ...,Positive


In [None]:
train.to_csv('/content/drive/MyDrive/xian/train_bert_trust_issue.csv', index=False)
test.to_csv('/content/drive/MyDrive/xian/test_bert_trust_issue.csv', index=False)

In [None]:
# Tokenize our entire training set.
sentences = train['sentence']
labels = train['label']

(train_input_ids,
 train_labels,
 train_attention_masks) = my_tokenize(sentences, labels, max_len = 500)


Tokenizing 415 articles...
  Tokenized 0 articles.
  Tokenized 100 articles.
  Tokenized 200 articles.
  Tokenized 300 articles.
  Tokenized 400 articles.


In [None]:
# Print sentence 0, now as a list of IDs.
print('Original: ', train.sentence.iloc[0])
print('Token IDs:', train_input_ids[0])

Original:  In content operations, for example, we are helping clients generate automated content, image, audio, and video.
Token IDs: tensor([  101,  1999,  4180,  3136,  1010,  2005,  2742,  1010,  2057,  2024,
         5094,  7846,  9699, 12978,  4180,  1010,  3746,  1010,  5746,  1010,
         1998,  2678,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
        

# Finetune FinBERT

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load the pretrained FinBERT model with a single linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "yiyanghkust/finbert-pretrain", # Use the 12-layer FinBERT model, with an uncased vocab.
    num_labels = 3, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
max_len = 400
batch_size = 32
epochs = 5 # optimal:3
learning_rate = 5e-5

In [None]:
# Makes a directory with parameter values for each model
dir_name = '{}_{}_{:.0e}'.format(max_len, batch_size, learning_rate)
drive_dir_path = '/content/drive/MyDrive/xian/final' + dir_name
# !mkdir $drive_dir_path

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)

# Create a 75-25 train-validation split.
# Calculate the number of samples to include in each set.
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"

optimizer = torch.optim.AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = 1e-8  # default
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

###  Create a checkpoint saving function and a loading function

In [None]:
# Saving function

import shutil
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [None]:
# Loading function

def load_ckp(checkpoint_fpath, model, optimizer):
  checkpoint = torch.load(checkpoint_fpath)
  model.load_state_dict(checkpoint['state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer'])
  val_loss_min = checkpoint['valid_loss_min']
  return model, optimizer, checkpoint['epoch'], val_loss_min

In [None]:
import os
import csv

def check_gpu_mem():
    '''
    Uses Nvidia's SMI tool to check the current GPU memory usage.
    Reported values are in "MiB". 1 MiB = 2^20 bytes = 1,048,576 bytes.
    '''

    # Run the command line tool and get the results.
    buf = os.popen('nvidia-smi --query-gpu=memory.total,memory.used --format=csv')

    # Use csv module to read and parse the result.
    reader = csv.reader(buf, delimiter=',')

    # Use a pandas table just for nice formatting.
    df = pd.DataFrame(reader)

    # Use the first row as the column headers.
    new_header = df.iloc[0] #grab the first row for the header
    df = df[1:] #take the data less the header row
    df.columns = new_header #set the header row as the df header

    # Display the formatted table.
    #display(df)

    return df


In [None]:
%mkdir checkpoint_fin best_model_fin

### Training loop

In [None]:
# # Load a checkpoint to continue training (if any, otherwise run the next cell))
# ckp_path = "/content/checkpoint_fin/current_checkpoint_3.pt" # Manually copy from dir
# model, optimizer, start_epoch, val_loss_min = load_ckp(ckp_path, model, optimizer)

# # Sanity check
# print("model = ", model)
# print("optimizer = ", optimizer)
# print("start_epoch = ", start_epoch)
# print("valid_loss_min = ", val_loss_min)
# print("valid_loss_min = {:.6f}".format(val_loss_min))

In [None]:
# Do not run if a checkpoint is loaded
start_epoch = 0
val_loss_min = np.Inf

In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# "global" refers to progress across the multiple epochs, to distinguish from
# progress within a single epoch.
global_train_loss = 0
global_step = 0

device = torch.device("cuda")

# For each epoch...
for epoch_i in range(start_epoch, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        print(b_input_ids)
        print(b_input_ids.shape)
        b_input_mask = batch[1].to(device)
        print(b_input_mask)
        print(b_input_mask.shape)
        b_labels = batch[2].to(device)
        print(b_labels)
        print(b_labels.shape)

        # Check GPU memory for the first couple steps.
        if step < 2:
            print('\n  Step {:} GPU Memory Use:'.format(step))
            df = check_gpu_mem()
            print('    Before forward-pass: {:}'.format(df.iloc[0, 1]))

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        outputs_dict = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                    return_dict=True)

        loss = outputs_dict['loss']
        logits = outputs_dict['logits']

        # Report GPU memory use for the first couple steps.
        if step < 2:
            df = check_gpu_mem()
            print('     After forward-pass: {:}'.format(df.iloc[0, 1]))

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Report GPU memory use for the first couple steps.
        if step < 2:
            df = check_gpu_mem()
            print('    After gradient calculation: {:}'.format(df.iloc[0, 1]))
            mem_use = df.iloc[0, 1]

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = time.time() - t0
    training_time_str = format_time(training_time)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time_str))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs_dict = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels,
                                  return_dict = True)

            loss = outputs_dict['loss']

            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = outputs_dict['logits']

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Seconds': training_time,
            'Training Time': training_time_str,
            'Validation Time': validation_time
        }
    )

    # Save a checkpoint to resume training if interrupted
    checkpoint = {
    'epoch': epoch_i + 1,
    'valid_loss_min' : avg_val_loss,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
    }

    cur_epoch = str(epoch_i + 1)

    ckp_save_path = "/content/checkpoint_fin/current_checkpoint_" + cur_epoch + ".pt"
    best_save_path = "/content/best_model_fin/best_model_" + cur_epoch + ".pt"

    save_ckp(checkpoint, False, ckp_save_path, best_save_path)
    !cp $ckp_save_path $drive_dir_path # save checkpoint to Google Drive directory

    # Save the model if validation loss has decreased
    if avg_val_loss <= val_loss_min:
      print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(val_loss_min, avg_val_loss))

      # save checkpoint as best model
      save_ckp(checkpoint, True, ckp_save_path, best_save_path)
      !cp $best_save_path $drive_dir_path # save best model to Google Drive directory
      val_loss_min = avg_val_loss

print("")
print("Training complete!")


Training...
tensor([[ 101, 1998, 1037,  ...,    0,    0,    0],
        [ 101, 2057, 2428,  ...,    0,    0,    0],
        [ 101, 1045, 2228,  ...,    0,    0,    0],
        ...,
        [ 101, 8307, 2089,  ...,    0,    0,    0],
        [ 101, 1045, 2359,  ...,    0,    0,    0],
        [ 101, 3398, 1010,  ...,    0,    0,    0]], device='cuda:0')
torch.Size([32, 500])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
torch.Size([32, 500])
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan], device='cuda:0')
torch.Size([32])

  Step 0 GPU Memory Use:
    Before forward-pass:  15093 MiB


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 4114 has 14.74 GiB memory in use. Of the allocated memory 14.36 GiB is allocated by PyTorch, and 255.54 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# "global" refers to progress across the multiple epochs, to distinguish from
# progress within a single epoch.
global_train_loss = 0
global_step = 0

device = torch.device("cuda")

# For each epoch...
for epoch_i in range(start_epoch, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        print(f"Input IDs: {b_input_ids}")
        print(f"Attention Mask: {b_input_mask}")
        print(f"Labels: {b_labels}")


        # Check GPU memory for the first couple steps.
        if step < 2:
            print('\n  Step {:} GPU Memory Use:'.format(step))
            df = check_gpu_mem()
            print('    Before forward-pass: {:}'.format(df.iloc[0, 1]))

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        outputs_dict = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels,
                             return_dict=True)

        loss = outputs_dict['loss']
        logits = outputs_dict['logits']

        # Report GPU memory use for the first couple steps.
        if step < 2:
            df = check_gpu_mem()
            print('     After forward-pass: {:}'.format(df.iloc[0, 1]))

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Report GPU memory use for the first couple steps.
        if step < 2:
            df = check_gpu_mem()
            print('    After gradient calculation: {:}'.format(df.iloc[0, 1]))
            mem_use = df.iloc[0, 1]

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = time.time() - t0
    training_time_str = format_time(training_time)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time_str))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs_dict = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels,
                                  return_dict = True)

            loss = outputs_dict['loss']

            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = outputs_dict['logits']

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Seconds': training_time,
            'Training Time': training_time_str,
            'Validation Time': validation_time
        }
    )

    # Save a checkpoint to resume training if interrupted
    checkpoint = {
    'epoch': epoch_i + 1,
    'valid_loss_min' : avg_val_loss,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
    }

    cur_epoch = str(epoch_i + 1)

    ckp_save_path = "/content/checkpoint_fin/current_checkpoint_" + cur_epoch + ".pt"
    best_save_path = "/content/best_model_fin/best_model_" + cur_epoch + ".pt"

    save_ckp(checkpoint, False, ckp_save_path, best_save_path)
    !cp $ckp_save_path $drive_dir_path # save checkpoint to Google Drive directory

    # Save the model if validation loss has decreased
    if avg_val_loss <= val_loss_min:
      print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(val_loss_min, avg_val_loss))

      # save checkpoint as best model
      save_ckp(checkpoint, True, ckp_save_path, best_save_path)
      !cp $best_save_path $drive_dir_path # save best model to Google Drive directory
      val_loss_min = avg_val_loss

print("")
print("Training complete!")


NameError: name 'torch' is not defined

## Finetune RoBERTa

In [None]:
!pip install datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting tensorboard==2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl.metadata (1.9 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard==2.11)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0 (from tensorboard==2.11)
  Downloading tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl.metadata (1.1 kB)
Collecting tensorboard-plugin-wit>=1.6.0 (from tensorboard==2.11)
  Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl.metadata (873 bytes)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (

In [None]:
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from sklearn.model_selection import train_test_split

from huggingface_hub import HfFolder, notebook_login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_id = "roberta-base"
repository_id = "hzduuuu/roberta-base-sentiment"

In [None]:
#load dataset
df = load_dataset("csv", data_files = "/content/drive/MyDrive/xian/train_bert_trust_issue.csv")
train_test_split = df['train'].train_test_split(test_size=0.25)
train = train_test_split['train']
eval = train_test_split['test']

# Tokenize the datasets
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True)

train_dataset = train.map(tokenize_function, batched=True)
eval_dataset = eval.map(tokenize_function, batched=True)

# Create label2id and id2label mappings
labels_list = ['Negative', 'Neutral', 'Positive']
label2id = {label: i for i, label in enumerate(labels_list)}

# Convert string labels to numeric values using label2id mapping
def convert_labels(examples):
    examples['label'] = [label2id[label] for label in examples['label']]
    return examples

train_dataset = train_dataset.map(convert_labels, batched=True)
eval_dataset = eval_dataset.map(convert_labels, batched=True)

# Set the format of the datasets to be compatible with PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [None]:
# Load the model
model = RobertaForSequenceClassification.from_pretrained(model_id, num_labels=3)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token()
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Save the tokenizer and model locally
local_directory = "/content/drive/MyDrive/xian/roberta"
#tokenizer.save_pretrained(local_directory)
model.save_pretrained(local_directory)

# Push to the Hugging Face hub
from huggingface_hub import HfApi, HfFolder

api = HfApi()
api.upload_folder(
    folder_path=local_directory,
    path_in_repo=".",
    repo_id=repository_id,
    repo_type="model"
)

# You can also push the results to the hub using the Trainer's method
trainer.create_model_card()
trainer.push_to_hub()

NameError: name 'model' is not defined

In [None]:
# Save our tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

NameError: name 'tokenizer' is not defined

In [None]:
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from sklearn.model_selection import train_test_split
from huggingface_hub import HfFolder, notebook_login



# Model ID and Repository ID
model_id = "roberta-base"
repository_id = "hzduuuu/roberta-base-sentiment"

# Load dataset
df = load_dataset("csv", data_files="/content/drive/MyDrive/xian/train_bert_trust_issue.csv")
train_test_split = df['train'].train_test_split(test_size=0.25)
train = train_test_split['train']
eval = train_test_split['test']

# Tokenize the datasets
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True)

train_dataset = train.map(tokenize_function, batched=True)
eval_dataset = eval.map(tokenize_function, batched=True)

# Create label2id and id2label mappings
labels_list = ['Negative', 'Neutral', 'Positive']
label2id = {label: i for i, label in enumerate(labels_list)}

# Convert string labels to numeric values using label2id mapping
def convert_labels(examples):
    examples['label'] = [label2id[label] for label in examples['label']]
    return examples

train_dataset = train_dataset.map(convert_labels, batched=True)
eval_dataset = eval_dataset.map(convert_labels, batched=True)

# Set the format of the datasets to be compatible with PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Load the model
model = RobertaForSequenceClassification.from_pretrained(model_id, num_labels=3)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token()
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

# Save the tokenizer and model locally
local_directory = "/content/drive/MyDrive/xian/roberta"
tokenizer.save_pretrained(local_directory)
model.save_pretrained(local_directory)

# Push to the Hugging Face hub
from huggingface_hub import HfApi, HfFolder

api = HfApi()
api.upload_folder(
    folder_path=local_directory,
    path_in_repo=".",
    repo_id=repository_id,
    repo_type="model"
)

# You can also push the results to the hub using the Trainer's method
trainer.create_model_card()
trainer.push_to_hub()

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.09,1.085302
2,1.0329,1.025457


Epoch,Training Loss,Validation Loss
1,1.09,1.085302
2,1.0329,1.025457
3,0.7433,0.806609
4,0.7679,0.7961
5,0.4994,0.818968


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/hzduuuu/roberta-base-sentiment/commit/0f305024286bcfb57c70e1f8c36fd570afe6cd4c', commit_message='End of training', commit_description='', oid='0f305024286bcfb57c70e1f8c36fd570afe6cd4c', pr_url=None, pr_revision=None, pr_num=None)

(Optional) Check GPU RAM

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=fc67374c44eb4ab11f18475c751c13812dc8716148eb17cd172664697eb8527e
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 11.4 GB  |     Proc size: 1.6 GB
GPU RAM Free: 41MB | Used: 15061MB | Util  98% | Total     15360MB
