In [1]:
!pip install transformers==3.0.0

Collecting transformers==3.0.0
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 38.5 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 33.6 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.46 sentencepiece-0.1.96 tokenizers-0.8.0rc4 transformers-3.0.0


In [2]:
!unzip /content/cola_public_1.1.zip

Archive:  /content/cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [2]:
import os, sys

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
# from nltk.tokenize import TweetTokenizer

import itertools
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

## Dataloader and utils function

In [20]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    NUM_LABELS = 2
    
    def __init__(self, dataset_path, tokenizer, split):
        if (split=='train' or split=='dev'):
          df = pd.read_csv(dataset_path, names=["code", "label", "notated_by_author","text"], sep="\t")
        else:
          df = pd.read_csv(dataset_path, sep="\t")
          df['text'] = df['Sentence']
          df['label'] = 0
        
        self.data = df
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        data = self.data.loc[index,:]
        text, label = data['text'], data['label']
        subwords = self.tokenizer(text, padding='max_length', truncation=True, max_length=128)
        item = {key: torch.tensor(val) for key, val in subwords.items()}
        item['labels'] = torch.tensor(label)
        return item
    
    def __len__(self):
        return len(self.data)    

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
set_seed(42)

## Load model

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = CustomDataset.NUM_LABELS

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Load data

In [21]:
train_dataset_path = '/content/cola_public/raw/in_domain_train.tsv'
test_dataset_path = '/content/cola_in_domain_test.tsv'
dev_dataset_path = '/content/cola_public/raw/in_domain_dev.tsv'

In [22]:
train_dataset = CustomDataset(train_dataset_path, tokenizer, "train")
test_dataset = CustomDataset(test_dataset_path, tokenizer, "test")
dev_dataset = CustomDataset(dev_dataset_path, tokenizer, "dev")

train_loader = DataLoader(dataset=train_dataset,  batch_size=8, shuffle=True)  
test_loader = DataLoader(dataset=test_dataset,  batch_size=8,  shuffle=False)  
dev_loader = DataLoader(dataset=dev_dataset,  batch_size=8, shuffle=False)

In [8]:
print(train_dataset[0])
print(len(train_dataset))

{'input_ids': tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
         2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

## Train

In [9]:
optimizer = optim.Adam(model.parameters(), lr=3e-5)
model = model.cuda()

In [10]:
# Train
device = 'cuda'
n_epochs = 4
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch in enumerate(train_pbar):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f}".format((epoch+1),
            total_train_loss/(i+1)))
        
    # # eval on dev
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(dev_loader, leave=True, total=len(dev_loader))
    for i, batch in enumerate(pbar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]

        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        pbar.set_description("DEV LOSS:{:.4f} ".format(total_loss/(i+1)))

        logits = outputs[1]
        batch_hyp = torch.argmax(logits, dim=-1)
        list_hyp += batch_hyp.cpu().numpy().tolist()
        list_label += labels.cpu().numpy().tolist()
    
    acc = accuracy_score(list_label, list_hyp)
    f1 = f1_score(list_label, list_hyp, average='macro')
    rec = recall_score(list_label, list_hyp, average='macro')
    prec = precision_score(list_label, list_hyp, average='macro')

    print(f"epoch: {epoch}")
    print("Acc: ", acc)
    print("F1: ", f1)
    print("recall: ", rec)
    print("precision: ", prec)

(Epoch 1) TRAIN LOSS:0.5035: 100%|██████████| 1069/1069 [07:17<00:00,  2.44it/s]
DEV LOSS:0.4222 : 100%|██████████| 66/66 [00:08<00:00,  7.53it/s]


epoch: 0
Acc:  0.8064516129032258
F1:  0.7409495720564423
recall:  0.719516319972941
precision:  0.8039812646370024


(Epoch 2) TRAIN LOSS:0.2985: 100%|██████████| 1069/1069 [07:17<00:00,  2.44it/s]
DEV LOSS:0.4020 : 100%|██████████| 66/66 [00:08<00:00,  7.58it/s]


epoch: 1
Acc:  0.8444022770398482
F1:  0.8057832668176752
recall:  0.7898274987316083
precision:  0.832057740747917


(Epoch 3) TRAIN LOSS:0.1861: 100%|██████████| 1069/1069 [07:17<00:00,  2.44it/s]
DEV LOSS:0.6445 : 100%|██████████| 66/66 [00:08<00:00,  7.55it/s]


epoch: 2
Acc:  0.7950664136622391
F1:  0.7257113115891742
recall:  0.7061474716725858
precision:  0.7854683840749415


(Epoch 4) TRAIN LOSS:0.1367: 100%|██████████| 1069/1069 [07:17<00:00,  2.44it/s]
DEV LOSS:0.4554 : 100%|██████████| 66/66 [00:08<00:00,  7.57it/s]

epoch: 3
Acc:  0.857685009487666
F1:  0.8302081319672658
recall:  0.8251648909183156
precision:  0.8360071301247771





In [15]:
# model.eval()
# torch.save(model.state_dict(),'bert_cola.bin')

## Evaluate on test

In [26]:
loaded_model = BertForSequenceClassification.from_pretrained('/content/bert_cola.bin', config=config)

In [30]:
model= loaded_model
model.eval()
torch.set_grad_enabled(False)
device = 'cpu'
total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch in enumerate(pbar):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]

    logits = outputs[1]
    batch_hyp = torch.argmax(logits, dim=-1)
    list_hyp += batch_hyp.cpu().numpy().tolist()
    list_label += labels.cpu().numpy().tolist()
    

print(list_label)
print(list_hyp)


df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred4.csv', index=False)

print(df)

100%|██████████| 67/67 [03:16<00:00,  2.93s/it]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 


  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
def predict(text):
  subwords = tokenizer.encode(text)
  subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

  logits = model(subwords)[0]
  label = torch.argmax(logits, dim=-1).item()
  return label

In [62]:
predict("This is a text")

1