In [1]:
pip install evaluate --quiet

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
# Read the politeness CSV and keep only the two extremes of the `score` column:
# rows above 0.98 become `polite`, rows below 0.05 become `nonpolite`;
# everything in between is left out of both subsets.
df_p = pd.read_csv('/kaggle/input/politenessdataset/politeness.csv')
polite = df_p.loc[df_p['score'] > 0.98]
nonpolite = df_p.loc[df_p['score'] < 0.05]

# One size per line, polite first, to eyeball the class balance.
print(len(polite), len(nonpolite), sep='\n')

89735
88324


In [5]:
# Flat list of texts: all polite rows first, then all non-polite rows.
un_df = pd.concat([polite['txt'], nonpolite['txt']]).tolist()

# Float targets aligned with `un_df`: 0.0 for polite, 1.0 for non-polite.
labels = [0.0 for _ in range(len(polite))] + [1.0 for _ in range(len(nonpolite))]

# Shuffled 67/33 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    un_df,
    labels,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [None]:
# Tokenizer and sequence-classification model both start from the
# 'roberta-base' checkpoint; the model is created with a single output
# (num_labels=1).
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='roberta-base')
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
    num_labels=1,
)

In [9]:
def prep(text, tokenizer=tokenizer):
    """Tokenize `text` into fixed-length PyTorch tensors.

    Args:
        text: Input passed straight to the tokenizer (the notebook passes
            lists of strings).
        tokenizer: Callable tokenizer; defaults to the module-level
            `tokenizer` that exists when this function is defined.

    Returns:
        The tokenizer's output, padded/truncated to a length of 128 and
        returned as PyTorch tensors (`return_tensors='pt'`).
    """
    encode_options = {
        'padding': 'max_length',
        'max_length': 128,
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **encode_options)

In [10]:
# Tokenize both splits. NOTE: this rebinds X_train / X_test from the raw
# inputs to the tokenizer output, so the cell is not idempotent and the raw
# inputs are no longer reachable under these names afterwards.
X_train, X_test = prep(X_train), prep(X_test)

In [11]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            container (tensor, array or nested list). Only `.items()` and
            integer indexing of the values are used.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # as_tensor converts lists/arrays but reuses an existing tensor
        # instead of copy-constructing it (which torch warns about).
        self.labels = torch.as_tensor(labels)

    def __getitem__(self, idx):
        """Return every encoding field at `idx` plus the label under 'labels'."""
        # The encodings may already be tensors (the notebook tokenizes with
        # return_tensors='pt'); wrapping them in torch.tensor(...) again would
        # emit a UserWarning per item and make a needless copy.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# Wrap the tokenized splits and their labels for the Trainer.
train_dataset = TextDataset(encodings=X_train, labels=y_train)
val_dataset = TextDataset(encodings=X_test, labels=y_test)


In [25]:
# Load the "accuracy" metric from the `evaluate` package (installed in the
# first cell); `compute_metrics` calls `metric.compute(...)` on this object.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [26]:
def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair `(logits, labels)`.

    Returns:
        Whatever `metric.compute` returns for the derived predictions and
        integer-cast references.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)

    if logits.ndim > 1 and logits.shape[-1] > 1:
        # One score per class: pick the best-scoring class.
        predictions = np.argmax(logits, axis=-1)
    else:
        # Single-output head (this notebook loads the model with
        # num_labels = 1 and trains on 0.0 / 1.0 targets): argmax over a
        # size-1 axis is always 0, so threshold the value at 0.5 instead.
        predictions = (logits.reshape(-1) >= 0.5).astype(int)

    # The labels are floats (0.0 / 1.0); compare them as integer class ids.
    references = np.asarray(labels).reshape(-1).astype(int)
    return metric.compute(predictions=predictions, references=references)

In [33]:
# Training configuration: 2 epochs, batch size 32 per device for both
# training and evaluation, evaluate and checkpoint once per epoch, and keep
# at most one checkpoint under ./politeness_clf_roberta.
training_args = TrainingArguments(
    output_dir="./politeness_clf_roberta",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
)

# Bundle model, data and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [12]:
# Log in to Weights & Biases, then start training.
import wandb
from kaggle_secrets import UserSecretsClient
# The API key is read from the Kaggle secret named 'wandb-key' instead of
# being hard-coded in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret('wandb-key') 
wandb.login(key=wandb_api)
# Run the training loop configured on `trainer`.
trainer.train()

## Eval

In [None]:
# Evaluation setup: base 'roberta-base' tokenizer plus the fine-tuned weights
# from the saved checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# Use the GPU when one is available; fall back to CPU instead of crashing on
# a hard-coded 'cuda' device.
model = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/input/politeness-roberta/politeness_clf_roberta/checkpoint-7458'
).to('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def prep(text, tokenizer=tokenizer, device=None):
    """Tokenize `text` and move the resulting tensors to the inference device.

    Args:
        text: Input passed straight to the tokenizer (the notebook passes
            lists of strings).
        tokenizer: Callable tokenizer; defaults to the module-level
            `tokenizer` that exists when this function is defined.
        device: Target device for the encoded batch. When None, the device
            of the module-level `model` is used so inputs and weights always
            live on the same device.

    Returns:
        The tokenizer's output, padded/truncated to a length of 128, as
        PyTorch tensors on `device`.
    """
    if device is None:
        # Follow the model rather than hard-coding 'cuda'.
        device = model.device
    encoded = tokenizer(text, padding='max_length', max_length=128, truncation=True, return_tensors='pt')
    return encoded.to(device)

In [12]:
# Run the model over the first 1000 entries of X_test in batches of 100 and
# collect the raw model outputs (one output object per batch).
# NOTE(review): X_test is sliced and handed to `prep`, so it has to be the
# original test inputs here; the training section rebinds X_test with
# `X_test = prep(X_test)`, so this cell appears to rely on a session where
# that cell was not executed - confirm.
preds = []
with torch.no_grad():  # inference only, no gradients needed
    for start in range(0, 1000, 100):
        batch = prep(X_test[start:start + 100])
        preds.append(model(**batch))

In [23]:
# Turn every batch of single-value logits into rounded integer predictions.
# Flattening each batch (instead of indexing a hard-coded 100 rows) keeps this
# correct for batches of any size and does one tensor-to-list transfer per
# batch rather than one `.item()` call per example.
res = []
for output in preds:
    res.extend(round(value) for value in output.logits.flatten().tolist())

In [29]:
from sklearn.metrics import accuracy_score  # ok, because the classes are balanced

# sklearn's signature is accuracy_score(y_true, y_pred); compare against
# exactly as many references as there are predictions instead of a
# hard-coded 1000.
acc = accuracy_score(y_test[:len(res)], res)
print(f'acc score: {acc}')

acc score: 0.999
