In [96]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored there can be reached with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [97]:
# Work from the project folder on the mounted Drive so that relative paths
# used later (e.g. the 'data' folder) resolve against it.
%cd '/content/drive/MyDrive/CS7643/Project'

/content/drive/MyDrive/CS7643/Project


In [None]:
!pip install transformers
!pip install datasets
!pip install tokenizers

In [99]:
# torch must be imported here: this is the first cell that uses it. A later
# cell also imports torch, but relying on that only works when cells are run
# out of order and fails with NameError on a fresh "Restart & Run All".
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [100]:
from transformers import BertModel, BertTokenizer

# A custom tokenizer was also built; whether it will be used is still undecided,
# so the stock one is loaded instead.
#tokenizer = BertTokenizer.from_pretrained('data/tokenizer/fake_tokenizer', max_len=512)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the pretrained encoder first, then move it onto the selected device.
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Load the dataset from the `data/` folder under the project root ('/content/drive/MyDrive/CS7643/Project'), then tokenize each entry

In [102]:
from datasets import load_dataset

# data_files names the train/test CSVs explicitly; without it all files in
# 'data' would be loaded together. The "Unnamed: 0" column is renamed to 'id'
# (presumably the unnamed index column of the CSVs - verify against the files).
dataset = load_dataset(
    'data',
    data_files={'train': 'train.csv', 'test': 'test.csv'},
).rename_column("Unnamed: 0", 'id')

Using custom data configuration data-cf9ebd5eebbe0a50
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/data-cf9ebd5eebbe0a50/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

In [103]:
def tokenize_batch(batch):
  """Tokenize the 'text' column of a batch of rows as required by the BERT model.

  Args:
    batch: mapping of column name -> list of values for the rows in the batch
      (the form Hugging Face `map` passes when `batched=True`).

  Returns:
    The tokenizer output (input_ids, token_type_ids, attention_mask), with
    every sequence truncated or padded to exactly 64 tokens including the
    special tokens.
  """
  return tokenizer(
      batch['text'],
      add_special_tokens=True,
      max_length=64,
      truncation=True,
      padding='max_length',
  )


# Use the Hugging Face map function to tokenize all entries in the dataset.
# batched=True hands the tokenizer many rows per call instead of one, which
# yields the same columns with far less per-row overhead.
dataset = dataset.map(tokenize_batch, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/data-cf9ebd5eebbe0a50/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-51ef8476bc4af83e.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/data-cf9ebd5eebbe0a50/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-aaa944c4ffb770de.arrow


In [104]:
# The BERT model is fed PyTorch tensors, so switch the dataset's output format
# to torch for the model inputs and the label.
dataset.set_format(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    type='torch',
)

In [105]:
# Show the columns of each split to confirm the tokenizer outputs
# (input_ids, token_type_ids, attention_mask) were added next to id/text/label.
dataset.column_names

{'test': ['id',
  'text',
  'label',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'train': ['id',
  'text',
  'label',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

In [140]:
# Keep a handle on each split and report its (rows, columns) shape.
test, train = dataset['test'], dataset['train']
print('test shape: ', test.shape)
print('train shape: ', train.shape)

test shape:  (8980, 6)
train shape:  (35918, 6)


In [141]:
# Look at encoded input_ids: decode the first test example back to text as a
# sanity check that tokenization (special tokens, truncation) looks right.
tokens = test['input_ids']
print(tokenizer.decode(tokens[0]))

[CLS] 21st century wire says ben stein, reputable professor from, pepperdine university ( also of some hollywood fame appearing in tv shows and films such as ferris bueller s day off ) made some provocative statements on judge jeanine pirro s show recently. while discussing the halt that was imposed on president trump s executive [SEP]


In [142]:
# Dataset.shard is the Hugging Face method for splitting a dataset into pieces;
# each piece is selected by its index. Take piece 0 of 12 and check its size.
shards = test.shard(index=0, num_shards=12)
print('shard 0 shape: ', shards.shape)

shard 0 shape:  (749, 6)



# This worked, but it was not parallelized and did not run on the GPU, so it took a very long time even on just the test-sized data

In [None]:
# get contextualized embeddings from bert model
# NOTE: everything below is wrapped in a triple-quoted string, so none of it
# executes; it is kept only as a record of the first (slow) approach, which
# fed one shard of the test split at a time through the model and timed it.
# NOTE(review): as written, range(1, num_shards - 1) visits shards 1..10 only,
# so shards 0 and 11 would never be embedded if this were re-enabled.
# NOTE(review): the shard's input_ids are passed to the model without being
# moved to a device - confirm model and inputs are co-located before reuse.

'''
import torch
from time import time
start = time()
bert_embeddings = []
num_shards = 12
with torch.no_grad():
    for s in range(1,num_shards - 1):
      shards = test.shard(num_shards = 12, index = s)
      bert_embeddings.append(model(shards['input_ids'])[0])
    # X_train_bert = bert_model(train_indices)[0]  # Models outputs are tuples
    # X_val_bert = bert_model(val_indices)[0]
    # X_test_bert = bert_model(test_indices)[0]
end = time()
elapsed = end - start
if elapsed < 180:
    print(f'code took {elapsed:0.2f} seconds to execute')
else:
    print(f'code took {elapsed / 60:0.2f} minutes to execute')
    '''

code took 19.18 minutes to execute


# This worked well: it produces a list of embedding tensors, one per batch of batch_size examples

In [122]:
from torch.utils.data import DataLoader

# Serve the tokenized test ids to the model 256 rows at a time.
dataloader = DataLoader(test['input_ids'], batch_size=256)

In [72]:
# Shape of the tokenized test ids: one row per example, one column per token
# position (every sequence was padded/truncated to max_length = 64).
test['input_ids'].shape

torch.Size([8980, 64])

In [73]:
!pip install tqdm



In [74]:
# len(dataloader) is the number of batches the loader yields, not the batch
# size, so label it accordingly; len(dataloader.dataset) is the example count.
print('num batches: ', len(dataloader), ' datasize: ', len(dataloader.dataset))

batch size:  36  datasize:  8980



REF:  https://discuss.huggingface.co/t/how-to-ensure-fast-inference-on-both-cpu-and-gpu-with-bertforsequenceclassification/1694

In [131]:
#Ref: https://discuss.huggingface.co/t/how-to-ensure-fast-inference-on-both-cpu-and-gpu-with-bertforsequenceclassification/1694
from tqdm.notebook import tqdm
import numpy as np
import torch

# Send each batch to wherever the model's weights actually live instead of
# hard-coding "cuda", so this cell also runs on a CPU-only runtime.
model_device = next(model.parameters()).device

# Inference only: make sure dropout is disabled.
model.eval()

# Get the progress bar for later modification
progress_bar = tqdm(dataloader, ascii=True)

# One entry per batch; each entry is the model's first output for that batch.
# The tensors stay on model_device, as before.
all_embeddings = []

# Gradients are never needed here, so disable them once around the whole loop.
with torch.no_grad():
  for data in progress_bar:
    input_ids = data.to(model_device)

    # Every sequence was padded to a fixed length during tokenization. Rebuild
    # the attention mask (1 = real token, 0 = padding) so the model does not
    # attend to [PAD] positions; without it the embeddings of short texts are
    # contaminated by padding.
    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    outputs = model(input_ids, attention_mask=attention_mask)[0]

    all_embeddings.append(outputs)


  0%|          | 0/36 [00:00<?, ?it/s]

In [132]:
# Number of collected outputs: the loop appends one entry per DataLoader batch.
len(all_embeddings)

36

In [133]:
# all_embeddings is a list with one tensor per batch; each tensor has shape
# (batch, sequence_length, hidden_size)
all_embeddings[0].shape

torch.Size([256, 64, 768])