In [1]:
import os
# Fail fast with an actionable hint when no TPU runtime is attached.
# `.get` is used so that a missing variable trips the assertion (and shows the
# message below) instead of raising a bare KeyError before the hint is displayed.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [2]:
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

Collecting torch-xla==1.11
  Using cached https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl (152.9 MB)


In [3]:
# imports the torch_xla package
import torch_xla
import torch_xla.core.xla_model as xm



In [4]:
from psutil import virtual_memory
# Report the runtime's total memory (bytes -> decimal gigabytes) and whether it
# reaches the 20 GB threshold this notebook treats as a "high-RAM" runtime.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!


# Fine-tuning setup: data, model and training loop

In [5]:
!pip install -q transformers

In [6]:
import torch
import json 
import logging

from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset


In [7]:
def my_collate(batch):
    '''
    Merge a list of preprocessed examples into a single batch dictionary.

    Each example must provide these keys:
        'doc_key'          -- collected unchanged into a Python list
        'input_token_ids'  -- stacked into one LongTensor
        'input_attn_mask'  -- stacked into one BoolTensor
        'tgt_token_ids'    -- stacked into one LongTensor
        'tgt_attn_mask'    -- stacked into one BoolTensor

    torch.stack needs every example to have the same length for a given key
    (presumably guaranteed by padding during preprocessing -- not checked here).
    '''
    def _stacked(field, tensor_type):
        # One tensor per example, stacked along a new leading (batch) dimension.
        return torch.stack([tensor_type(example[field]) for example in batch])

    doc_keys = [example['doc_key'] for example in batch]

    tensor_fields = (
        ('input_token_ids', torch.LongTensor),
        ('input_attn_mask', torch.BoolTensor),
        ('tgt_token_ids', torch.LongTensor),
        ('tgt_attn_mask', torch.BoolTensor),
    )
    collated = {field: _stacked(field, tensor_type) for field, tensor_type in tensor_fields}
    collated['doc_key'] = doc_keys
    return collated


class IEDataset(Dataset):
    '''
    In-memory dataset backed by a JSON Lines file (one JSON object per line).

    Every record is parsed eagerly in the constructor and kept in
    `self.examples`; indexing returns the parsed object unchanged.
    '''
    def __init__(self, input_file):
        '''
        Args:
            input_file: path of the .jsonl file to load.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        '''
        super().__init__()
        self.examples = []
        # Be explicit about UTF-8 so loading does not depend on the platform locale.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. an extra newline at end of file)
                    # instead of failing inside json.loads.
                    continue
                self.examples.append(json.loads(line))

    def __len__(self):
        '''Number of loaded examples.'''
        return len(self.examples)

    def __getitem__(self, idx):
        '''Return the parsed example stored at position `idx`.'''
        return self.examples[idx]
    

In [8]:
from google.colab import drive
# Mount Google Drive at /content/gdrive/. The data, log and checkpoint paths used
# elsewhere in this notebook are relative ('gdrive/MyDrive/...'), so they resolve
# to this mount only while the working directory is /content.
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [9]:
# Dataset variant: selects train_{dataset}.jsonl / test_{dataset}.jsonl and is
# also embedded in the log and checkpoint file names.
dataset = 'KAIROS_oracle'
# Suffix of the preprocessed_{folder} directory the .jsonl files are read from.
folder = 'KAIROS'

In [10]:
# Load both splits fully into memory (IEDataset parses every line up front).
# NOTE: `train` / `test` are rebound to DataLoaders in a later cell, so re-run
# this cell first whenever that DataLoader cell needs to be re-run.
train = IEDataset(f'gdrive/MyDrive/6740Training/preprocessed_{folder}/train_{dataset}.jsonl')
test = IEDataset(f'gdrive/MyDrive/6740Training/preprocessed_{folder}/test_{dataset}.jsonl')

In [11]:
# Device handle returned by torch_xla's xm.xla_device(); the model and every
# batch are moved onto it in the cells below.
device = xm.xla_device()

In [12]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# print(device)

In [13]:
# Training hyperparameters.
n_epochs = 5  # number of passes over the training DataLoader
lr = 5e-5  # learning rate handed to the optimizer
batch_size = 16  # batch size of both DataLoaders; also the step of the progress counter

In [14]:
def log(msg):
  """Append `msg` followed by a newline to this run's log file.

  The file is re-opened in append mode on every call, so each message is
  written out as soon as the call returns.
  """
  log_path = f'gdrive/MyDrive/6740Training/Logs/{dataset}_log.txt'
  with open(log_path, 'a') as log_file:
    log_file.writelines((msg, "\n"))

In [15]:
# Start a fresh log for this run: mode 'w' truncates any existing file, and the
# first line records which dataset the log belongs to.
with open(f'gdrive/MyDrive/6740Training/Logs/{dataset}_log.txt', 'w') as file:
    file.writelines((f'{dataset} logger', "\n"))

In [16]:
# Settings shared by the loaders of both splits.
loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    collate_fn=my_collate,
    num_workers=2,
    pin_memory=True,
)

# Rebind `train` / `test` from the datasets to DataLoaders that wrap them.
train = DataLoader(train, **loader_options)
test = DataLoader(test, **loader_options)

In [17]:
# Pretrained BERT tokenizer, extended with the custom '<args>' token.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(['<args>'])

# output_hidden_states=True makes the model also return its per-layer hidden states.
# NOTE(review): the cells visible here only read the loss -- confirm this is needed.
model = BertForMaskedLM.from_pretrained('bert-base-uncased', output_hidden_states = True)
# Resize the embedding matrix to the tokenizer's new length so the added token has a row.
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(30523, 768)

In [18]:
# One optimizer/scheduler step is taken per batch, so the schedule length is
# (batches per epoch) * (epochs); the first 10% of the steps are linear warmup.
num_train_steps = int(len(train) * n_epochs)
num_warmup_steps = int(num_train_steps * .1)
print(num_train_steps)
print(num_warmup_steps)

# Move the model to the target device BEFORE building the optimizer.
# torch.optim's documentation requires this ordering: after a device move the
# model's parameters may be different objects from the ones handed to the
# optimizer, in which case the optimizer would keep stepping stale tensors and
# the model on `device` would never be updated. The later `model.to(device)`
# cell then simply finds the model already in place.
model.to(device)
optimizer = AdamW(model.parameters(), lr=lr, correct_bias = True, weight_decay = 0.01)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps = num_warmup_steps, num_training_steps=num_train_steps
)

960
96




In [19]:
def eval_model(i):
  """Checkpoint the current training state, then log mean train and test loss.

  Args:
    i: progress counter supplied by the caller; it is embedded in the
      checkpoint file name and in both logged lines.

  Side effects:
    * writes gdrive/MyDrive/6740Training/Checkpoints/{dataset}_{i}.pt
    * appends one "Train Loss" and one "Test Loss" line through log()
    * switches the model to eval mode while measuring and restores the
      previous train/eval mode before returning, so a call made in the middle
      of a training epoch does not leave dropout disabled for the rest of it.
  """
  was_training = model.training
  model.eval()

  # NOTE: 'epoch' and 'loss' are stored as placeholder zeros.
  # NOTE(review): the saved tensors live on the training device; consider
  # torch_xla's xm.save if checkpoints must load without an XLA device.
  torch.save({
              'epoch': 0,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'scheduler_state_dict': scheduler.state_dict(),
              'loss': 0,
              }, f'gdrive/MyDrive/6740Training/Checkpoints/{dataset}_{i}.pt')

  def _mean_loss(loader, desc):
    # Per-example mean of the model's batch loss over `loader`, without gradients.
    total_loss = 0.0
    n_examples = 0
    with torch.no_grad():
      # Iterating the loader directly lets tqdm show the number of batches.
      for data in tqdm(loader, desc=desc):
        input_ids = data['input_token_ids'].to(device)
        labels = data['tgt_token_ids'].to(device)
        attention_mask = data['input_attn_mask'].to(device)
        output_mlm = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Weight each batch loss by the batch's real size (the last batch may
        # be shorter than batch_size) and normalise by the number of examples.
        # Dividing by len(loader) -- the number of batches -- would inflate
        # the result by roughly a factor of batch_size.
        n_in_batch = input_ids.size(0)
        total_loss += output_mlm['loss'].item() * n_in_batch
        n_examples += n_in_batch
    # An empty loader yields NaN rather than a ZeroDivisionError.
    return total_loss / n_examples if n_examples else float('nan')

  try:
    training_loss = _mean_loss(train, 'eval/train')
    log(f'Iter {i}: Train Loss = {training_loss}')

    testing_loss = _mean_loss(test, 'eval/test')
    log(f'Iter {i}: Test Loss = {testing_loss}')
  finally:
    # Restore whichever mode the caller had active, even if evaluation failed.
    model.train(was_training)

In [20]:
# Place the model on `device`; the trailing ';' suppresses the cell's repr output.
model.to(device);

In [21]:
#eval_model(0)

In [None]:
# Training loop: one optimizer + scheduler step per batch. Each time the example
# counter passes the next multiple of 500, eval_model() checkpoints the state
# and logs train/test loss.
i = 0         # examples processed so far (advanced by batch_size per step)
i_eval = 500  # value of `i` at which the next evaluation is triggered

# model.train()

# Not updated by this loop; kept in case other cells read these names.
training_loss = 0
testing_loss = 0

for i_epoch in range(n_epochs):
    model.train()
    print(f'Starting Epoch {i_epoch + 1}')
    # Iterate the loader directly so tqdm can display the number of batches.
    for data in tqdm(train, desc=f'Epoch {i_epoch + 1}'):
        input_ids = data['input_token_ids'].to(device)
        labels = data['tgt_token_ids'].to(device)
        attention_mask = data['input_attn_mask'].to(device)

        optimizer.zero_grad()
        output_mlm = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output_mlm['loss']

        loss.backward()
        optimizer.step()
        scheduler.step()
        # XLA records operations lazily; mark the end of the step so the
        # pending graph is executed now instead of growing across iterations.
        xm.mark_step()

        i += batch_size

        if i >= i_eval:
          print('Running Eval')
          i_eval += 500
          eval_model(i)
          # eval_model() puts the model in eval mode; switch back to training
          # mode (e.g. dropout) for the remaining batches of this epoch.
          model.train()

Starting Epoch 1


0it [00:00, ?it/s]

Running Eval


0it [00:00, ?it/s]

0it [00:00, ?it/s]