In [None]:
!pip install transformers
!pip install datasets

In [1]:
import torch
import csv
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaForSequenceClassification,Trainer, TrainingArguments
from torch.utils.data import DataLoader
import numpy as np
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from datasets import ClassLabel
import random

from collections import Counter

In [None]:
# `model_max_length` is the tokenizer argument that caps sequence length when
# `truncation=True` is used; a bare `max_length` passed here is not applied.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', model_max_length=512)

# Two output classes, matching the binary 'no'/'yes' labels used below.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Fall back to CPU so the notebook still runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);  # trailing ';' keeps the full model repr out of the cell output

In [3]:
# Read the tab-separated labeled rows into a list of lists.
# newline='' lets the csv module do its own newline handling, as the csv
# documentation requires for fields that contain embedded line breaks.
with open('critic_labeled_data_4000.tsv', newline='') as f:
  data = list(csv.reader(f, delimiter="\t"))

In [4]:
# Reproducible 80/20 split: shuffle a *copy* with a dedicated seeded RNG so that
# re-running this cell (or the whole notebook) always yields the same split and
# `data` itself is left in file order.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

shuffled_rows = list(data)
random.Random(SPLIT_SEED).shuffle(shuffled_rows)

split_at = int(len(shuffled_rows) * TRAIN_FRACTION)
train_data = shuffled_rows[:split_at]
test_data = shuffled_rows[split_at:]

In [5]:
# Label-name <-> integer-id mapping used by change_form; with this name order
# 'no' is id 0 and 'yes' is id 1.
c2l = ClassLabel(num_classes=2, names=['no', 'yes'])
def change_form(list_pair):
  """Convert raw rows into ``{'text', 'label'}`` records.

  Each row is indexed as ``(first_text, second_text, label_name)``. The two
  text fields are concatenated with a double space between them, and the label
  name is translated to its integer id via the module-level ``c2l``.

  Returns a new list with one dict per input row, in input order.
  """
  return [
    {'text': row[0] + '  ' + row[1], 'label': c2l.str2int(row[2])}
    for row in list_pair
  ]

# Raw rows -> record dicts, for the train and test splits respectively.
train_change, test_change = (change_form(rows) for rows in (train_data, test_data))

# Record dicts -> Hugging Face datasets.
dataset_train, dataset_test = (
    Dataset.from_list(records) for records in (train_change, test_change)
)
     

In [6]:
def tokenization(batched_text):
    """Tokenize the ``'text'`` column of a batch.

    The batch's texts are passed to the module-level ``tokenizer`` with padding
    and truncation both enabled, and the tokenizer's result is returned as-is.
    """
    texts = batched_text['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


# batch_size equals the split size, so each split is tokenized as ONE batch and
# `padding=True` therefore pads every example to that split's longest sequence.
train_data_batch = dataset_train.map(
    tokenization,
    batched=True,
    batch_size=len(dataset_train),
)
test_data_batch = dataset_test.map(
    tokenization,
    batched=True,
    batch_size=len(dataset_test),
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# Expose only the model inputs and the target, as torch tensors, on both splits.
_model_columns = ('input_ids', 'attention_mask', 'label')
for _split in (train_data_batch, test_data_batch):
    _split.set_format('torch', columns=list(_model_columns))

In [8]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions``
    (per-class scores); the predicted class is the arg-max over the last axis.
    Precision/recall/F1 are computed with ``average='binary'``.

    Returns a dict with keys ``accuracy``, ``f1``, ``precision``, ``recall``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # The fourth element returned by the helper is not used here.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_hat, average='binary'
    )[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [9]:
# Fine-tuning configuration: evaluate at the end of every epoch, log every
# 8 steps, and never write checkpoints (save_strategy="no").
training_args = TrainingArguments(
    output_dir='test_trainer',
    evaluation_strategy='epoch',
    save_strategy="no",
    logging_steps=8,
    num_train_epochs=20.0,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
)

# The trainer ties together the model, the tokenized splits and the metric
# callback used at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_batch,
    eval_dataset=test_data_batch,
    compute_metrics=compute_metrics,
)
     

In [None]:
# Run fine-tuning; with the arguments above, evaluation (compute_metrics) is
# performed on the test split once per epoch.
trainer.train()

In [11]:
# Wrap the model and tokenizer in a text-classification pipeline.
# `model.cpu()` moves the model's parameters to CPU in place and returns the
# same model object, so inference below runs on CPU.
pipe = pipeline("text-classification",model=model.cpu(),tokenizer=tokenizer)

In [13]:
# Class balance of the held-out split (keys are the integer label ids).
Counter(dataset_test['label'])

Counter({0: 289, 1: 961})

In [23]:
# Smoke-test the pipeline on the first held-out example.
pipe(dataset_test['text'][0])

[{'label': 'LABEL_1', 'score': 0.9998888969421387}]

In [28]:
# Minimum LABEL_1 confidence for a held-out example to be marked '1';
# anything else (LABEL_0, or a less confident LABEL_1) is marked '0'.
CRITIC_MIN_SCORE = 0.95

critic_output=[]
for line in dataset_test['text']:
  # truncation=True mirrors how the training inputs were tokenized, so texts
  # longer than the model's maximum length are cut rather than fed through whole.
  result=pipe(line, truncation=True)[0]
  label=result['label']
  score=result['score']
  critic_output.append('1' if label=='LABEL_1' and score>CRITIC_MIN_SCORE else '0')

In [29]:
# Distribution of the thresholded predictions on the held-out split.
Counter(critic_output)

Counter({'1': 1018, '0': 232})

In [16]:
# Build the text-classification pipeline on CPU (`model.cpu()` moves the model
# in place and returns it).
pipe = pipeline("text-classification",model=model.cpu(),tokenizer=tokenizer)

# Read the tab-separated rows to be scored.
# newline='' lets the csv module do its own newline handling, as the csv
# documentation requires for fields that contain embedded line breaks.
with open('arxivgpt2_output_4000.tsv', newline='') as f:
  eval_data = list(csv.reader(f, delimiter="\t"))

def change_form_eval(list_pair):
  """Convert unlabeled rows into ``{'text'}`` records.

  The first two fields of each row are concatenated with a double space
  between them; any further fields are ignored.

  Returns a new list with one dict per input row, in input order.
  """
  return [{'text': row[0] + '  ' + row[1]} for row in list_pair]

eval_change=change_form_eval(eval_data)
dataset_eval = Dataset.from_list(eval_change)

# Minimum LABEL_1 score for an example to be marked '1'.
# NOTE(review): with a threshold of 0 every LABEL_1 prediction passes, i.e. this
# is a plain arg-max decision, whereas the held-out-split loop uses 0.95.
# Confirm which threshold is intended before comparing the two counts.
EVAL_MIN_SCORE = 0

eval_output=[]
for line in dataset_eval['text']:
  # truncation=True mirrors how the training inputs were tokenized, so texts
  # longer than the model's maximum length are cut rather than fed through whole.
  result=pipe(line, truncation=True)[0]
  label=result['label']
  score=result['score']
  eval_output.append('1' if label=='LABEL_1' and score>EVAL_MIN_SCORE else '0')

print(Counter(eval_output))

Counter({'1': 3627, '0': 373})
