In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"
import torch
print(torch.__version__)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import random
random.seed(0)
import numpy as np
np.random.seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

1.1.0


In [2]:
import re
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader

In [3]:
MAX_SEQ_LENGTH = 256
BATCH_SIZE = 4
NUM_EPOCHS = 5
LEARNING_RATE_MODEL = 1e-5
LEARNING_RATE_CLASSIFIER = 1e-3
WARMUP_STEPS = 0
MAX_GRAD_NORM = 1
GRADIENT_ACCUMULATION_STEPS = 4

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)
model = torch.nn.DataParallel(model)
model.to(device)

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm(torch.Size([1024]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_features=1024, bias=True)
    

In [5]:
def _parse_imdb_line(line):
    line = re.sub(r"<br \/>", " ", line)
    return line


def load_data(path):
    indices, sentiments = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        folder = os.path.join(path, folder)
        for name in tqdm(os.listdir(folder)):
            with open(os.path.join(folder, name), 'r') as reader:
                  text = _parse_imdb_line(reader.read())
            ids = tokenizer.encode(text, max_length=MAX_SEQ_LENGTH, pad_to_max_length=True)
            indices.append(ids)
            sentiments.append(sentiment)
    return np.array(indices), np.array(sentiments)


train_path = os.path.join("../datasets", 'aclImdb', 'train')
test_path = os.path.join("../datasets", 'aclImdb', 'test')
X_train, y_train = load_data(train_path)
X_test, y_test = load_data(test_path)

100%|██████████| 12500/12500 [00:52<00:00, 240.23it/s]
100%|██████████| 12500/12500 [00:54<00:00, 231.41it/s]
100%|██████████| 12500/12500 [00:51<00:00, 242.65it/s]
100%|██████████| 12500/12500 [00:52<00:00, 238.79it/s]


In [6]:
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [7]:
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
optimizer_grouped_parameters = [
        {"params": model.module.bert.parameters(), "lr": LEARNING_RATE_MODEL},
        {"params": model.module.classifier.parameters(), "lr": LEARNING_RATE_CLASSIFIER}
    ]
optimizer = AdamW(optimizer_grouped_parameters)
scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=WARMUP_STEPS,
                num_training_steps=len(train_loader) // GRADIENT_ACCUMULATION_STEPS * NUM_EPOCHS)
total_step = len(train_loader)
for epoch in range(NUM_EPOCHS):
    model.train()
    model.zero_grad()
    for i, (cur_X_train, cur_y_train) in enumerate(train_loader):
        cur_X_train = cur_X_train.to(device)
        cur_y_train = cur_y_train.to(device)
        outputs = model(cur_X_train)
        loss = nn.CrossEntropyLoss()(outputs[0], cur_y_train)
        loss /= GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        if (i + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
        if (i + 1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch + 1, NUM_EPOCHS, i + 1, total_step, loss.item()))
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for cur_X_test, cur_y_test in tqdm(test_loader):
            cur_X_test = cur_X_test.to(device)
            cur_y_test = cur_y_test.to(device)
            outputs = model(cur_X_test)
            _, predicted = torch.max(outputs[0], 1)
            total += cur_y_test.size(0)
            correct += (predicted == cur_y_test).sum().item()
        print('Accuracy: {} %'.format(100 * correct / total))

Epoch [1/5], Step [100/6250], Loss: 0.1172
Epoch [1/5], Step [200/6250], Loss: 0.1167
Epoch [1/5], Step [300/6250], Loss: 0.1562
Epoch [1/5], Step [400/6250], Loss: 0.1158
Epoch [1/5], Step [500/6250], Loss: 0.0507
Epoch [1/5], Step [600/6250], Loss: 0.0329
Epoch [1/5], Step [700/6250], Loss: 0.0147
Epoch [1/5], Step [800/6250], Loss: 0.0256
Epoch [1/5], Step [900/6250], Loss: 0.0037
Epoch [1/5], Step [1000/6250], Loss: 0.0201
Epoch [1/5], Step [1100/6250], Loss: 0.0091
Epoch [1/5], Step [1200/6250], Loss: 0.0039
Epoch [1/5], Step [1300/6250], Loss: 0.0012
Epoch [1/5], Step [1400/6250], Loss: 0.0041
Epoch [1/5], Step [1500/6250], Loss: 0.0128
Epoch [1/5], Step [1600/6250], Loss: 0.0246
Epoch [1/5], Step [1700/6250], Loss: 0.2041
Epoch [1/5], Step [1800/6250], Loss: 0.0426
Epoch [1/5], Step [1900/6250], Loss: 0.1126
Epoch [1/5], Step [2000/6250], Loss: 0.0154
Epoch [1/5], Step [2100/6250], Loss: 0.0445
Epoch [1/5], Step [2200/6250], Loss: 0.0011
Epoch [1/5], Step [2300/6250], Loss: 0.00

100%|██████████| 6250/6250 [22:45<00:00,  4.58it/s]


Accuracy: 92.828 %
Epoch [2/5], Step [100/6250], Loss: 0.0002
Epoch [2/5], Step [200/6250], Loss: 0.0006
Epoch [2/5], Step [300/6250], Loss: 0.0014
Epoch [2/5], Step [400/6250], Loss: 0.0002
Epoch [2/5], Step [500/6250], Loss: 0.0003
Epoch [2/5], Step [600/6250], Loss: 0.0082
Epoch [2/5], Step [700/6250], Loss: 0.0034
Epoch [2/5], Step [800/6250], Loss: 0.0017
Epoch [2/5], Step [900/6250], Loss: 0.0001
Epoch [2/5], Step [1000/6250], Loss: 0.0007
Epoch [2/5], Step [1100/6250], Loss: 0.0000
Epoch [2/5], Step [1200/6250], Loss: 0.0004
Epoch [2/5], Step [1300/6250], Loss: 0.0013
Epoch [2/5], Step [1400/6250], Loss: 0.2141
Epoch [2/5], Step [1500/6250], Loss: 0.0007
Epoch [2/5], Step [1600/6250], Loss: 0.2006
Epoch [2/5], Step [1700/6250], Loss: 0.0026
Epoch [2/5], Step [1800/6250], Loss: 0.0003
Epoch [2/5], Step [1900/6250], Loss: 0.0019
Epoch [2/5], Step [2000/6250], Loss: 0.0100
Epoch [2/5], Step [2100/6250], Loss: 0.0037
Epoch [2/5], Step [2200/6250], Loss: 0.0010
Epoch [2/5], Step [230

100%|██████████| 6250/6250 [22:24<00:00,  4.65it/s]


Accuracy: 93.576 %
Epoch [3/5], Step [100/6250], Loss: 0.0006
Epoch [3/5], Step [200/6250], Loss: 0.0003
Epoch [3/5], Step [300/6250], Loss: 0.0001
Epoch [3/5], Step [400/6250], Loss: 0.0005
Epoch [3/5], Step [500/6250], Loss: 0.0007
Epoch [3/5], Step [600/6250], Loss: 0.0598
Epoch [3/5], Step [700/6250], Loss: 0.0001
Epoch [3/5], Step [800/6250], Loss: 0.0003
Epoch [3/5], Step [900/6250], Loss: 0.0002
Epoch [3/5], Step [1000/6250], Loss: 0.0002
Epoch [3/5], Step [1100/6250], Loss: 0.0001
Epoch [3/5], Step [1200/6250], Loss: 0.0006
Epoch [3/5], Step [1300/6250], Loss: 0.0476
Epoch [3/5], Step [1400/6250], Loss: 0.0001
Epoch [3/5], Step [1500/6250], Loss: 0.0002
Epoch [3/5], Step [1600/6250], Loss: 0.0000
Epoch [3/5], Step [1700/6250], Loss: 0.0018
Epoch [3/5], Step [1800/6250], Loss: 0.0011
Epoch [3/5], Step [1900/6250], Loss: 0.0001
Epoch [3/5], Step [2000/6250], Loss: 0.0014
Epoch [3/5], Step [2100/6250], Loss: 0.0002
Epoch [3/5], Step [2200/6250], Loss: 0.0006
Epoch [3/5], Step [230

100%|██████████| 6250/6250 [22:24<00:00,  4.65it/s]


Accuracy: 93.196 %
Epoch [4/5], Step [100/6250], Loss: 0.0001
Epoch [4/5], Step [200/6250], Loss: 0.0001
Epoch [4/5], Step [300/6250], Loss: 0.0000
Epoch [4/5], Step [400/6250], Loss: 0.0001
Epoch [4/5], Step [500/6250], Loss: 0.0005
Epoch [4/5], Step [600/6250], Loss: 0.5662
Epoch [4/5], Step [700/6250], Loss: 0.0001
Epoch [4/5], Step [800/6250], Loss: 0.0001
Epoch [4/5], Step [900/6250], Loss: 0.0003
Epoch [4/5], Step [1000/6250], Loss: 0.0000
Epoch [4/5], Step [1100/6250], Loss: 0.0000
Epoch [4/5], Step [1200/6250], Loss: 0.0001
Epoch [4/5], Step [1300/6250], Loss: 0.0000
Epoch [4/5], Step [1400/6250], Loss: 0.0002
Epoch [4/5], Step [1500/6250], Loss: 0.0000
Epoch [4/5], Step [1600/6250], Loss: 0.0001
Epoch [4/5], Step [1700/6250], Loss: 0.0000
Epoch [4/5], Step [1800/6250], Loss: 0.0000
Epoch [4/5], Step [1900/6250], Loss: 0.0001
Epoch [4/5], Step [2000/6250], Loss: 0.0000
Epoch [4/5], Step [2100/6250], Loss: 0.0000
Epoch [4/5], Step [2200/6250], Loss: 0.0000
Epoch [4/5], Step [230

100%|██████████| 6250/6250 [22:24<00:00,  4.65it/s]


Accuracy: 93.536 %
Epoch [5/5], Step [100/6250], Loss: 0.0000
Epoch [5/5], Step [200/6250], Loss: 0.0001
Epoch [5/5], Step [300/6250], Loss: 0.0000
Epoch [5/5], Step [400/6250], Loss: 0.0000
Epoch [5/5], Step [500/6250], Loss: 0.0000
Epoch [5/5], Step [600/6250], Loss: 0.0000
Epoch [5/5], Step [700/6250], Loss: 0.0000
Epoch [5/5], Step [800/6250], Loss: 0.0000
Epoch [5/5], Step [900/6250], Loss: 0.0000
Epoch [5/5], Step [1000/6250], Loss: 0.0000
Epoch [5/5], Step [1100/6250], Loss: 0.0000
Epoch [5/5], Step [1200/6250], Loss: 0.0000
Epoch [5/5], Step [1300/6250], Loss: 0.0000
Epoch [5/5], Step [1400/6250], Loss: 0.0000
Epoch [5/5], Step [1500/6250], Loss: 0.0001
Epoch [5/5], Step [1600/6250], Loss: 0.0000
Epoch [5/5], Step [1700/6250], Loss: 0.0000
Epoch [5/5], Step [1800/6250], Loss: 0.0000
Epoch [5/5], Step [1900/6250], Loss: 0.0000
Epoch [5/5], Step [2000/6250], Loss: 0.0000
Epoch [5/5], Step [2100/6250], Loss: 0.0000
Epoch [5/5], Step [2200/6250], Loss: 0.0000
Epoch [5/5], Step [230

100%|██████████| 6250/6250 [22:23<00:00,  4.65it/s]

Accuracy: 93.536 %





In [9]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for cur_X_test, cur_y_test in tqdm(test_loader):
        cur_X_test = cur_X_test.to(device)
        cur_y_test = cur_y_test.to(device)
        outputs = model(cur_X_test)
        _, predicted = torch.max(outputs[0], 1)
        total += cur_y_test.size(0)
        correct += (predicted == cur_y_test).sum().item()
    print('Accuracy: {} %'.format(100 * correct / total))

100%|██████████| 6250/6250 [22:23<00:00,  4.65it/s]

Accuracy: 93.536 %



