In [None]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under "/content/drive/My Drive/".
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install HTMLParser



In [None]:
pip install transformers==3.0.0



In [None]:
import json
import tqdm

# Directory on the mounted Drive that holds both input files.
path = "/content/drive/My Drive/5001_Group/"

# The dataset file is parsed line by line (one JSON object per line). It is
# read exactly once, inside a context manager so the handle is always closed.
with open(path + "Sarcasm_Headlines_Dataset.json", encoding="utf-8") as js:
  dataset = [json.loads(line) for line in js]

# Parallel lists built from each record: headline text and its integer label.
headlines = [item["headline"] for item in dataset]
labels = [int(item["is_sarcastic"]) for item in dataset]

# One text per line; each entry keeps its trailing newline, as readlines() returns it.
with open(path + "articals.txt", "r", encoding="utf-8") as f:
  articals = f.readlines()

# `articals` is paired index-by-index with `labels` when the data is split, so
# a length mismatch would silently mislabel examples. Fail loudly instead.
if len(articals) != len(labels):
  raise ValueError(
      "articals.txt has {} lines but the dataset has {} labels".format(
          len(articals), len(labels)))

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
from transformers import BertTokenizer

class BERTDataset(Dataset):
  """Map-style dataset of BERT-tokenized texts paired with labels.

  All texts are tokenized once at construction time and padded/truncated to
  `max_seq_len`, so every stored example has the same length.

  Args:
    sent: sequence of raw text strings.
    labels: sequence of labels, same length as `sent`.
    max_seq_len: length every example is padded or truncated to.
    tokenizer_name: name of the pretrained tokenizer to load. It must match
      the checkpoint the downstream model is initialised from, otherwise the
      token ids index the wrong rows of the model's embedding table.

  Raises:
    ValueError: if `sent` and `labels` differ in length.
  """

  def __init__(self, sent, labels, max_seq_len, tokenizer_name="bert-base-uncased"):
    self.sent = sent
    self.labels = labels
    self.max_seq_len = max_seq_len
    self.tokenizer_name = tokenizer_name
    self.dataset = self.preprocess(sent, labels)

  def preprocess(self, sent, labels):
    """Tokenize every text and return a list of [input_ids, attention_mask, label]."""
    # Checked before the tokenizer is loaded so a mispaired input fails fast.
    if len(sent) != len(labels):
      raise ValueError(
          "sent and labels must have the same length, got {} and {}".format(
              len(sent), len(labels)))

    bert_tokenizer = BertTokenizer.from_pretrained(self.tokenizer_name)
    data = []
    for text, label in zip(sent, labels):
      # `padding='max_length'` already pads to `max_length`; the deprecated
      # `pad_to_max_length` flag is not passed alongside it.
      ids = bert_tokenizer(text,
                           max_length=self.max_seq_len,
                           padding='max_length',
                           truncation=True,
                           return_token_type_ids=False)
      data.append([ids["input_ids"], ids["attention_mask"], label])
    return data

  def __getitem__(self, idx):
    """Return example `idx` as a dict with keys 'input_ids', 'mask' and 'label'."""
    ids, mask, label = self.dataset[idx]
    return {'input_ids': ids, 'mask': mask, "label": label}

  def __len__(self):
    """Number of tokenized examples."""
    return len(self.dataset)

  def collate_fn(self, batch):
    """Stack a list of items into tensors.

    Returns:
      [batch_ids, batch_labels, batch_masks] -- note the order: labels come
      second. Ids and labels are int64 tensors, masks are a bool tensor.
    """
    input_ids = [x["input_ids"] for x in batch]
    masks = [x["mask"] for x in batch]
    labels = [x["label"] for x in batch]

    batch_ids = torch.tensor(input_ids, dtype=torch.long)
    batch_labels = torch.tensor(labels, dtype=torch.long)
    batch_masks = torch.tensor(masks, dtype=torch.bool)

    return [batch_ids, batch_labels, batch_masks]

In [None]:
from transformers.modeling_bert import *
from torch.nn.utils.rnn import pad_sequence

class Bert(BertPreTrainedModel):
  """BERT encoder followed by dropout and a linear 2-class classification head."""

  def __init__(self,config):
    super(Bert, self).__init__(config)
    self.num_labels = 2
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(0.1)
    # Size the head from the config instead of hard-coding 768, so the class
    # also works with checkpoints whose hidden size differs.
    self.classifier = nn.Linear(config.hidden_size, self.num_labels)
    self.init_weights()

  def forward(self, input_ids, token_type_ids=None, mask=None, labels=None, batch_len = None):
    """Classify a batch of token-id sequences.

    Args:
      input_ids: token ids passed to the encoder.
      token_type_ids: optional segment ids passed to the encoder.
      mask: attention mask passed to the encoder as `attention_mask`.
      labels: optional gold labels; when given, the loss is also returned.
      batch_len: unused; kept so existing callers keep working.

    Returns:
      `(logits,)` when `labels` is None, otherwise `(loss, logits)`.
    """
    encoder_outputs = self.bert(input_ids=input_ids,
                                attention_mask=mask,
                                token_type_ids=token_type_ids,
                                output_hidden_states=False)
    # The second element is the pooled output (see the BertModel docs).
    # Indexing rather than tuple-unpacking does not depend on how many
    # elements the encoder returns.
    pooled_output = encoder_outputs[1]
    logits = self.classifier(self.dropout(pooled_output))
    outputs = (logits,)

    if labels is not None:
      loss_func = nn.CrossEntropyLoss()
      loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs

    return outputs

In [None]:
import logging
from tqdm import tqdm
from sklearn.metrics import f1_score

# Target device for the model and batches: the GPU when CUDA is available,
# otherwise the CPU. Read as a global by the functions defined in this cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def train_epoch(train_loader, model, optimizer, scheduler, epoch):
  """Run one training pass over `train_loader` and print the mean batch loss.

  For every batch: forward pass with labels, backward pass, gradient-norm
  clipping at 5, then one optimizer step and one scheduler step.

  Args:
    train_loader: iterable yielding [ids, labels, masks] batches.
    model: model whose first output is the loss when `labels` is passed.
    optimizer: optimizer stepped once per batch.
    scheduler: learning-rate scheduler stepped once per batch.
    epoch: epoch number, used only in the printed message.
  """
  model.train()
  running_loss = 0
  for ids, targets, masks in tqdm(train_loader):
    ids = ids.to(device)
    targets = targets.to(device)
    masks = masks.to(device)

    model_outputs = model(ids, labels=targets, mask=masks)
    batch_loss = model_outputs[0]
    running_loss += batch_loss.item()

    # Clear stale gradients right before computing fresh ones.
    model.zero_grad()
    batch_loss.backward()
    nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=5)
    optimizer.step()
    scheduler.step()

  train_loss = float(running_loss) / len(train_loader)
  print("Epoch: {}, train loss: {}".format(epoch, train_loss))

def dev_epoch(dev_loader, model, optimizer, scheduler, epoch):
  """Evaluate `model` on `dev_loader` and print the F1 score as a percentage.

  Args:
    dev_loader: iterable yielding [ids, labels, masks] batches.
    model: model whose first output is the logits when no labels are passed.
    optimizer: unused; kept so existing callers keep working.
    scheduler: unused; kept so existing callers keep working.
    epoch: epoch number, used only in the printed message.
  """
  model.eval()
  pred_labels = []
  true_labels = []
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for batch_ids, batch_labels, batch_masks in tqdm(dev_loader):
      batch_ids = batch_ids.to(device)
      batch_masks = batch_masks.to(device)

      batch_output = model(batch_ids, mask=batch_masks)
      logits = batch_output[0].cpu().numpy()

      pred_labels.extend(np.argmax(logits, axis=1))
      true_labels.extend(batch_labels.cpu().numpy())

  # f1_score takes (y_true, y_pred), in that order.
  f1 = f1_score(true_labels, pred_labels)
  print("Epoch: {}, f1: {}".format(epoch, f1 * 100))

def train_model(train_loader, dev_loader, model, optimizer, scheduler, epoch_num):
  """Train for `epoch_num` epochs, evaluating on the dev set after each one.

  Epochs are numbered from 1 in the messages printed by the helpers.
  """
  for completed in range(epoch_num):
    current_epoch = completed + 1
    train_epoch(train_loader, model, optimizer, scheduler, current_epoch)
    dev_epoch(dev_loader, model, optimizer, scheduler, current_epoch)
  print("Training Finished!")

def test_model(test_loader, model, optimizer, scheduler):
  """Evaluate `model` on `test_loader`, print the F1 score and return the labels.

  Args:
    test_loader: iterable yielding [ids, labels, masks] batches.
    model: model whose first output is the logits when no labels are passed.
    optimizer: unused; kept so existing callers keep working.
    scheduler: unused; kept so existing callers keep working.

  Returns:
    Tuple `(pred_labels, true_labels)` of per-example label lists.
  """
  model.eval()
  pred_labels = []
  true_labels = []
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    for batch_ids, batch_labels, batch_masks in tqdm(test_loader):
      batch_ids = batch_ids.to(device)
      batch_masks = batch_masks.to(device)

      batch_output = model(batch_ids, mask=batch_masks)
      logits = batch_output[0].cpu().numpy()

      pred_labels.extend(np.argmax(logits, axis=1))
      true_labels.extend(batch_labels.cpu().numpy())

  # f1_score takes (y_true, y_pred), in that order.
  f1 = f1_score(true_labels, pred_labels)
  print("test f1: {}".format(f1 * 100))
  return (pred_labels, true_labels)

In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 30% of the examples; the remaining 70% is the training set.
X_train, X_rest, y_train, y_rest = train_test_split(
    articals, labels, test_size=0.3, random_state=1)
# Second split: two thirds of the held-out part becomes the test set and the
# remaining third the dev set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_rest, y_rest, test_size=2/3, random_state=1)

# Every example is padded or truncated to this many tokens.
max_seq_length = 128
training_set, dev_set, test_set = (
    BERTDataset(texts, targets, max_seq_length)
    for texts, targets in ((X_train, y_train), (X_dev, y_dev), (X_test, y_test))
)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers.optimization import get_cosine_schedule_with_warmup, AdamW

# Single source of truth for the batch size used by all three loaders.
batch_size = 64

# NOTE(review): shuffle=False on the training loader means every epoch sees
# the batches in the same order -- confirm this is intended.
train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=False, collate_fn=training_set.collate_fn)
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False, collate_fn=dev_set.collate_fn)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=test_set.collate_fn)

model = Bert.from_pretrained("bert-base-uncased")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
model.to(device)

# Named parameters of the encoder and of the classification head. The two get
# different learning rates below.
bert_optimizer = list(model.bert.named_parameters())
classifier_optimizer = list(model.classifier.named_parameters())
# Parameters whose names contain one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in bert_optimizer if not any(nd in n for nd in no_decay)],
      'lr': 1e-5, 'weight_decay': 0.01},
    {'params': [p for n, p in bert_optimizer if any(nd in n for nd in no_decay)],
      'lr': 1e-5, 'weight_decay': 0.0},
    {'params': [p for n, p in classifier_optimizer if not any(nd in n for nd in no_decay)],
      'lr': 5e-4, 'weight_decay': 0.01},
    {'params': [p for n, p in classifier_optimizer if any(nd in n for nd in no_decay)],
      'lr': 5e-4, 'weight_decay': 0.0}]

epoch_num = 3
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-4, correct_bias=False)
train_size = len(training_set)
# The scheduler is stepped once per batch, so count batches, not
# `train_size // batch_size`: the loader also yields the final partial batch,
# which floor division would leave out of the schedule.
train_steps_per_epoch = len(train_loader)
# Warm up for one epoch, then cosine-decay over the remaining steps.
scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_warmup_steps=train_steps_per_epoch,
                                            num_training_steps=epoch_num * train_steps_per_epoch)
print("--------Start Training!--------")
train_model(train_loader, dev_loader, model, optimizer, scheduler, epoch_num)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing Bert: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing Bert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing Bert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Bert were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should pr

cuda
--------Start Training!--------


100%|██████████| 293/293 [06:38<00:00,  1.36s/it]


Epoch: 1, train loss: 0.3679056069098489


100%|██████████| 42/42 [00:21<00:00,  1.97it/s]


Epoch: 1, f1: 91.52139461172742


100%|██████████| 293/293 [06:49<00:00,  1.40s/it]


Epoch: 2, train loss: 0.1537201500596294


100%|██████████| 42/42 [00:21<00:00,  1.98it/s]


Epoch: 2, f1: 93.81907490790012


100%|██████████| 293/293 [06:49<00:00,  1.40s/it]


Epoch: 3, train loss: 0.11761340398640203


100%|██████████| 42/42 [00:21<00:00,  1.98it/s]

Epoch: 3, f1: 94.09351927809679
Training Finished!





In [None]:
# Evaluate the trained model on the test loader; test_model prints the F1
# score and returns the predicted and true labels.
pred_labels, true_labels = test_model(test_loader, model, optimizer, scheduler)

100%|██████████| 84/84 [00:42<00:00,  1.98it/s]

test f1: 94.33042064621013



