In [None]:
!pip install datasets

In [None]:
!pip install evaluate

In [None]:
!pip install transformers

In [22]:
import torch
import numpy as np
import pickle
from tqdm.auto import tqdm, trange
import pandas as pd
import re
from datasets import Dataset
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

In [6]:
# Read the corpus from a CSV in the working directory. Later cells use its
# 'og' and 't' text columns.
csv_path = 'shakespearen.csv'
dataset = pd.read_csv(csv_path)

In [7]:
def preprocess_text(text):
  """Normalise one sentence before it is fed to the classifier.

  Inserts a space in front of each of the characters ``. , ? ! : ; - ) (``,
  deletes single and double quote characters, collapses runs of spaces into a
  single space and lower-cases the result.

  Args:
    text: the input string.

  Returns:
    The normalised string.
  """
  # A single translation table handles every per-character rewrite:
  # punctuation -> ' ' + punctuation, quote characters -> removed.
  char_map = {ord(mark): ' ' + mark for mark in '.,?!:;-)('}
  char_map[ord('"')] = None
  char_map[ord("'")] = None
  spaced = text.translate(char_map)
  return re.sub(' +', ' ', spaced).lower()

In [8]:
# Normalise both text columns with the shared preprocessing function.
for column in ('og', 't'):
  dataset[column] = dataset[column].apply(preprocess_text)

### Тренируем классификатор

In [10]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that later `.to(device)` calls do not fail on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Pre-trained RoBERTa encoder with a sequence-classification head on top.
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
# `model_max_length` is the tokenizer's length-limit setting; the previous
# `max_length` keyword is not a tokenizer constructor argument and had no
# effect. The tokenization cells still pass `max_length=256` explicitly.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', model_max_length=256)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [12]:
# Prepare the data for training: sentences from the 'og' column get label 1,
# sentences from the 't' column get label 0. The first `split_at` rows of each
# column go to the training pool, the remaining rows to the test set.
split_at = 41430

train_df = [{'text': sent, 'label': 1} for sent in dataset['og'][:split_at]]
train_df += [{'text': sent, 'label': 0} for sent in dataset['t'][:split_at]]

test_df = [{'text': sent, 'label': 1} for sent in dataset['og'][split_at:]]
test_df += [{'text': sent, 'label': 0} for sent in dataset['t'][split_at:]]

In [13]:
# Turn each list of {'text', 'label'} records into a DataFrame.
train_df, test_df = (pd.DataFrame(records) for records in (train_df, test_df))

In [14]:
# The import cell binds `Dataset` twice (from `datasets`, then from
# `torch.utils.data`); the later import wins and the torch class has no
# `from_pandas`. Import the Hugging Face class under an explicit alias so this
# cell works on a fresh kernel.
from datasets import Dataset as HFDataset

train_df = HFDataset.from_pandas(train_df)
test_df = HFDataset.from_pandas(test_df)

In [None]:
# Tokenize the training texts, padding/truncating every row to 256 tokens.
# `batched=True` hands the tokenizer lists of texts instead of one example per
# call; the resulting columns are the same because every row is padded to the
# same fixed length.
train_df = train_df.map(
    lambda batch: tokenizer(batch['text'], padding='max_length', truncation=True, max_length=256),
    batched=True,
)

In [16]:
# Tokenize the test texts, padding/truncating every row to 256 tokens.
# `batched=True` hands the tokenizer lists of texts instead of one example per
# call; the resulting columns are the same because every row is padded to the
# same fixed length.
test_df = test_df.map(
    lambda batch: tokenizer(batch['text'], padding='max_length', truncation=True, max_length=256),
    batched=True,
)

Map:   0%|          | 0/20714 [00:00<?, ? examples/s]

In [None]:
# Cast 'label' to a class-label feature (the later split stratifies on this
# column), then expose only the model inputs and the label as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
train_df = train_df.class_encode_column('label')
train_df.set_format(type='torch', columns=tensor_columns)

Stringifying the column:   0%|          | 0/82860 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/82860 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/20714 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/20714 [00:00<?, ? examples/s]

In [17]:
# Same treatment as the training data: cast 'label' to a class-label feature
# and expose only the model inputs and the label as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
test_df = test_df.class_encode_column('label')
test_df.set_format(type='torch', columns=tensor_columns)

Stringifying the column:   0%|          | 0/20714 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/20714 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the training pool for validation, keeping the label ratio
# equal in both parts. After this cell `train_df` holds two splits, accessed
# later as train_df['train'] and train_df['test'].
train_df = train_df.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)

In [None]:
# Metric bundle used during evaluation; currently accuracy only.
clf_metrics = evaluate.combine(["accuracy"])


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the arg-max class
        predictions against the reference labels.
    """
    raw_scores, gold = eval_pred
    return clf_metrics.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold,
    )

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
training_args = TrainingArguments(
    # Schedule.
    num_train_epochs=2,
    warmup_steps=10,
    # Batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint when
    # training ends.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Checkpoint location and retention.
    output_dir='results',
    save_total_limit=2,
)

In [None]:
# Train on the 'train' part of the split and validate on its 'test' part.
train_split = train_df['train']
validation_split = train_df['test']

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

In [None]:
# Run fine-tuning with the arguments configured above. The call is left as the
# cell's last expression so its return value is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0729,0.056972,0.989621
2,0.0408,0.033326,0.992517


TrainOutput(global_step=8286, training_loss=0.06916236394043852, metrics={'train_runtime': 6173.0347, 'train_samples_per_second': 21.477, 'train_steps_per_second': 1.342, 'total_flos': 1.744110563770368e+16, 'train_loss': 0.06916236394043852, 'epoch': 2.0})

## Evaluation

In [9]:
# Mount Google Drive at /content/drive; the fine-tuned checkpoint is loaded
# from a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Load the fine-tuned classifier from the checkpoint directory on Drive and
# move it to the selected device.
checkpoint_dir = '/content/drive/MyDrive/checkpoint-8286'
model_ft = RobertaForSequenceClassification.from_pretrained(checkpoint_dir)
model_ft = model_ft.to(device)

# Tokenizer for the same base model name.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Put the fine-tuned model into evaluation mode before running inference.
model_ft.eval()

In [21]:
# Run the fine-tuned model over the test set in batches of 64 and keep the raw
# model outputs. Gradients are disabled for the whole loop.
results = []
eval_loader = DataLoader(test_df, batch_size=64)
with torch.no_grad():
  for batch in eval_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    results.append(model_ft(input_ids=input_ids, attention_mask=attention_mask))

In [33]:
# Flatten the per-batch outputs into one list holding, for every example, its
# softmax-normalised class scores.
pred = [
  scores
  for res in results
  for scores in nn.functional.softmax(res.logits, dim=-1).tolist()
]

In [39]:
# Pick the predicted class per example: 0 only when the first score is
# strictly greater than the second, otherwise 1.
preds = [0 if el[0] > el[1] else 1 for el in pred]

In [40]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): pass the reference labels
# first and the predictions second. Accuracy itself is symmetric, but the
# correct order keeps this cell safe to adapt to asymmetric metrics.
acc = accuracy_score(test_df['label'], preds)
print(f'acc score: {acc}')

acc score: 0.9523510669112677
