In [1]:
pip install evaluate --quiet

[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import evaluate

import pandas as pd
import numpy as np
from tqdm import tqdm

In [7]:
# Load the politeness corpus and keep only the rows flagged as useful.
df_p = pd.read_csv('/kaggle/input/politenessdataset/politeness.csv')
df_p = df_p.loc[df_p['is_useful'] == 1]

# Split by score: above 0.6 goes to `polite`, below 0.4 to `nonpolite`
# (rows scoring in between land in neither); each class keeps at most its
# first 60000 rows.
polite = df_p.loc[df_p['score'] > 0.6].iloc[:60000]
nonpolite = df_p.loc[df_p['score'] < 0.4].iloc[:60000]

# Optional sanity check of the class sizes:
# print(len(polite))
# print(len(nonpolite))

60000
60000


In [10]:
# Collect the raw texts: every polite row first, then every non-polite row.
un_df = pd.concat([polite, nonpolite])['txt'].tolist()

# One two-element float target per text, in the same order as `un_df`:
# [0.0, 1.0] for polite rows and [1.0, 0.0] for non-polite rows.
# NOTE(review): float one-hot targets presumably make the transformers
# classification head train with a multi-label (BCE) objective rather than
# cross-entropy — confirm this is intended.
labels = (
    [[0.0, 1.0] for _ in range(len(polite))]
    + [[1.0, 0.0] for _ in range(len(nonpolite))]
)

# Shuffled hold-out split with a fixed seed; 33% of the examples go to the test side.
X_train, X_test, y_train, y_test = train_test_split(
    un_df,
    labels,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

In [12]:
# Pretrained `roberta-base` tokenizer, plus the matching model wrapped for
# sequence classification with two output labels.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [13]:
def prep(text, tokenizer=tokenizer):
    """Tokenize `text` into fixed-width PyTorch tensors.

    Args:
        text: Input handed straight to the tokenizer (this notebook passes
            lists of texts).
        tokenizer: Callable tokenizer; defaults to the module-level
            `tokenizer` as it exists when this function is defined.

    Returns:
        The tokenizer's output for these settings: sequences padded or
        truncated to 128 tokens, requested as PyTorch tensors.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=128,
        padding='max_length',
        return_tensors='pt',
    )
    return encoded

In [14]:
# Tokenize both splits. The names are rebound from raw text lists to tokenizer
# output, so re-running this cell on its own would feed already-tokenized data
# back into `prep`; re-run the split cell first.
X_train, X_test = prep(X_train), prep(X_test)

In [15]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with per-example labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            container (a tensor or a nested list).
        labels: Per-example targets; converted to a single tensor up front.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = self._copy_as_tensor(labels)

    @staticmethod
    def _copy_as_tensor(value):
        """Return `value` as a tensor that does not share storage with it.

        Calling `torch.tensor` on an existing tensor emits a UserWarning that
        recommends `clone().detach()`, so tensors take that route; any other
        input is converted with `torch.tensor` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the target under 'labels'."""
        item = {
            key: self._copy_as_tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = self._copy_as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

# Wrap each tokenized split together with its targets; these are the datasets
# handed to the Trainer.
train_dataset = TextDataset(encodings=X_train, labels=y_train)
val_dataset = TextDataset(encodings=X_test, labels=y_test)


In [18]:
# Accuracy metric loaded through the `evaluate` library; `compute_metrics`
# calls `metric.compute` on it.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [22]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`. `logits` holds one score per class
            along its last axis. `labels` is either one target vector per
            example (as built in this notebook) or already a 1-D array of
            integer class ids.

    Returns:
        The result of `metric.compute` for predicted vs. reference class ids.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    references = np.asarray(labels)
    if references.ndim > 1:
        # Collapse every target vector to its class id in one vectorized call
        # instead of looping over the rows in Python.
        references = np.argmax(references, axis=-1)

    return metric.compute(predictions=predictions, references=references)

In [23]:
# Fine-tuning configuration: two epochs, batches of 32 per device for both
# training and evaluation, evaluate and checkpoint once per epoch, and keep at
# most one checkpoint on disk.
training_args = TrainingArguments(
    output_dir="./politeness_clf_roberta",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
)

# Tie the model, the configuration, both datasets and the accuracy callback together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
# NOTE(review): these imports live in the training cell rather than in the
# notebook's main import cell.
import wandb
from kaggle_secrets import UserSecretsClient
# Fetch the Weights & Biases API key from the Kaggle secrets client (stored
# under the name 'wandb-key') so the key itself never appears in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret('wandb-key') 
wandb.login(key=wandb_api)
# Run fine-tuning. As the last expression of the cell, the value returned by
# `train()` is what the cell displays.
trainer.train()

***** Running training *****
  Num examples = 80400
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 5026
  Number of trainable parameters = 124647170
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
  import sys
  


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2134,0.215993,0.929369
2,0.1646,0.174078,0.946263


***** Running Evaluation *****
  Num examples = 39600
  Batch size = 32
Saving model checkpoint to ./politeness_clf_roberta/checkpoint-2513
Configuration saved in ./politeness_clf_roberta/checkpoint-2513/config.json
Model weights saved in ./politeness_clf_roberta/checkpoint-2513/pytorch_model.bin
  import sys
  
***** Running Evaluation *****
  Num examples = 39600
  Batch size = 32
Saving model checkpoint to ./politeness_clf_roberta/checkpoint-5026
Configuration saved in ./politeness_clf_roberta/checkpoint-5026/config.json
Model weights saved in ./politeness_clf_roberta/checkpoint-5026/pytorch_model.bin
Deleting older checkpoint [politeness_clf_roberta/checkpoint-2513] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5026, training_loss=0.21800658259788358, metrics={'train_runtime': 2206.2378, 'train_samples_per_second': 72.884, 'train_steps_per_second': 2.278, 'total_flos': 1.0577064425472e+16, 'train_loss': 0.21800658259788358, 'epoch': 2.0})

In [26]:
# Write the tokenizer files and the trained model's config and weights into
# the same 'roberta_polit_clf' directory.
tokenizer.save_pretrained('roberta_polit_clf')
trainer.model.save_pretrained('roberta_polit_clf')

tokenizer config file saved in roberta_polit_clf/tokenizer_config.json
Special tokens file saved in roberta_polit_clf/special_tokens_map.json
Configuration saved in roberta_polit_clf/config.json
Model weights saved in roberta_polit_clf/pytorch_model.bin
