In [14]:
# Standard library
import functools
import gc
import gzip
import os
import random
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, List, Generator, Tuple

# Project-local helpers. Deliberately kept ahead of the third-party imports so
# the explicit third-party names below still win over anything these star
# imports happen to re-export (same precedence as before).
from data_pre_process import *
from model import *
from data_loader import *

# Third-party
import numpy as np
import pandas as pd
import torch
import torch.cuda.amp
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
from torch.cuda.amp import GradScaler as scaler
from torch.utils.data import Dataset, Subset, DataLoader
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from transformers import BertTokenizer, AdamW, BertModel, get_linear_schedule_with_warmup, BertPreTrainedModel

In [2]:
# ---- Run configuration: checkpoint, device and data locations ----

# Name of the pretrained checkpoint used for the model.
bert_model = 'bert-base-uncased'
# Derived from the checkpoint name so the flag follows the checkpoint choice.
do_lower_case = 'uncased' in bert_model
# Use the GPU when one is present; fall back to CPU so the notebook can still
# be executed (slowly) on a machine without CUDA instead of failing at
# `model.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training shard (gzip-compressed JSON lines).
data_dir_t = Path('data_2/v1.0/train')
data_path_t = data_dir_t / 'nq-train-00.jsonl.gz'

# Validation (dev) shard.
data_dir_v = Path('data_2/v1.0/dev')
data_path_v = data_dir_v / 'nq-dev-00.jsonl.gz'

In [3]:
# ---- Tokenizer and example-conversion settings ----

# Passed to JsonlReader as its `chunksize`; the training loop's progress total
# is ceil(train_size / chunksize).
chunksize = 10
# Limits forwarded to `convert_data`.
# NOTE(review): presumably token counts for the full input / the question and
# the sliding-window step over the document -- confirm in data_pre_process.
max_seq_len = 384
max_question_len = 64
doc_stride = 128

# Reuse the checkpoint name and casing flag from the configuration cell so the
# tokenizer can never drift out of sync with the model checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

# Bind the fixed settings once; the reader then only has to supply the
# remaining argument(s) of `convert_data`.
convert_func = functools.partial(convert_data,
                                 tokenizer=tokenizer,
                                 max_seq_len=max_seq_len,
                                 max_question_len=max_question_len,
                                 doc_stride=doc_stride)

In [4]:
# ---- Load the training shard ----
# Decompress the whole file into memory, split it into raw lines and hand the
# lines to the chunked reader; the elapsed wall-clock time is reported.
start = time.time()
with gzip.open(data_path_t, "rb") as f:
    data = f.read()
x = data.splitlines()
data_reader = JsonlReader(x, convert_func, chunksize=chunksize)
end = time.time()
print(f"Loading Data: {end - start} seconds")

# Number of raw lines in the shard; used later to size the progress bar and
# the learning-rate schedule.
train_size = len(x)

Loading Data: 38.346622467041016 seconds


In [6]:
# ---- Training hyperparameters ----

# Schedule length and batching. The optimizer steps once every
# `accumulation_steps` batches.
n_epochs = 1
batch_size = 16
accumulation_steps = 4

# Peak learning rate and the fraction of optimisation steps used for warm-up.
lr = 2e-5
warmup = 0.05

# Size of the model's label head.
num_labels = 5

In [11]:
# ---- Model, optimizer and learning-rate schedule ----

# Use the configured `num_labels` rather than repeating the literal, so the
# hyperparameter cell stays the single source of truth.
model = BertForQuestionAnswering.from_pretrained(bert_model, num_labels=num_labels)
model = model.to(device)

# Two parameter groups: biases and LayerNorm parameters are excluded from
# weight decay, everything else is decayed with 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

# Number of optimizer steps, estimated from the count of raw input lines.
# NOTE(review): if one raw line expands into several training examples this
# underestimates the real step count -- confirm against the reader.
train_optimization_steps = int(n_epochs * train_size / batch_size / accumulation_steps)
warmup_steps = int(train_optimization_steps * warmup)

optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_optimization_steps)

# NOTE(review): the gradient scaler is created here but the training loop in
# this notebook never calls it; mixed precision is therefore not active.
s = torch.cuda.amp.GradScaler()

# Start from clean gradients and put the model in training mode.
model.zero_grad()
model = model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [16]:
# ---- Fine-tuning loop ----
# Streams the training data chunk by chunk, builds a small DataLoader for each
# chunk and performs gradient-accumulated optimisation steps.

# Wall-clock budget for the whole loop, checked after every chunk.
train_time_limit_hours = 7
# True keeps the notebook's previous behaviour: process a single batch and
# stop. With accumulation_steps > 1 that single batch never triggers an
# optimizer step. Set to False for a real training run.
smoke_test = True

global_step = 0
# Reference point for the time budget (previously never defined, which would
# have raised NameError as soon as the check became reachable).
start_time = time.time()
# Pre-bind the loop variables so the cleanup below cannot fail when the reader
# yields nothing.
examples = train_dataset = train_loader = None

for examples in tqdm(data_reader, total=int(np.ceil(train_size / chunksize))):
    train_dataset = TextDataset(examples)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    for x_batch, y_batch in train_loader:
        x_batch, attention_mask, token_type_ids = x_batch
        # Materialise the targets: a generator could only be consumed once.
        y_batch = tuple(y.to(device) for y in y_batch)

        y_pred = model(x_batch.to(device),
                       attention_mask=attention_mask.to(device),
                       token_type_ids=token_type_ids.to(device))
        loss = loss_fn(y_pred, y_batch)
        # NOTE(review): the loss is not divided by accumulation_steps, so the
        # accumulated gradients are summed rather than averaged -- confirm
        # that this is intended for the chosen learning rate.
        loss.backward()
        if (global_step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        global_step += 1
        if smoke_test:
            break

    if smoke_test:
        break
    # Stop once the wall-clock budget is exhausted.
    if (time.time() - start_time) / 3600 > train_time_limit_hours:
        break

# Release the per-chunk objects. The result of gc.collect() is discarded
# instead of being stored in `x`, which holds the raw input lines.
del examples, train_dataset, train_loader
gc.collect();

HBox(children=(FloatProgress(value=0.0, max=597.0), HTML(value='')))