In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 915 kB/s eta 0:00:01     |██████████████████████████      | 215 kB 915 kB/s eta 0:00:01
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 8.5 MB/s eta 0:00:01
Installing collected packages: xxhash, datasets
Successfully installed datasets-1.11.0 xxhash-2.0.2


### Import libraries

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

import torch

from datasets.arrow_dataset import Dataset

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory, then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


### Seed everything for reproducibility

In [7]:
seed = 3


def _seed_all_rngs(value):
    """Seed the Python, NumPy and PyTorch (CPU and, when present, CUDA) random number generators."""
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(value)


# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

_seed_all_rngs(seed)

### Load training and prediction data

In [8]:
# Directory holding the competition CSV files
source_dir = '/kaggle/input/commonlitreadabilityprize/'

# Training rows: rename the score column to "labels" and drop the columns that are not used below
train_data = (
    pd.read_csv(source_dir + 'train.csv')
    .rename(columns = {"target": "labels"})
    .drop(columns = ["id", "url_legal", "license", "standard_error"])
)

# Rows to predict on; loaded unchanged because `id` is needed for the submission file
preds_data = pd.read_csv(source_dir + 'test.csv')

### Replace newline characters with spaces to remove unnecessary tokens

In [9]:
# Use the vectorised string accessor instead of a per-row lambda: it performs the same
# literal (non-regex) replacement but leaves a missing excerpt as NaN rather than raising.
train_data['excerpt'] = train_data['excerpt'].str.replace('\n', ' ', regex = False)
preds_data['excerpt'] = preds_data['excerpt'].str.replace('\n', ' ', regex = False)

### Split train.csv into training and validation sets to evaluate model training

In [10]:
# Hold out 20% of the labelled rows for validation; the fixed random_state makes the split repeatable.
train_set, eval_set = train_test_split(
    train_data,
    test_size = 0.2,
    random_state = 42,
)

### Convert pandas DataFrame to Dataset object

This is done so that the tokenized text can be stored in a `Dataset` object, which is the format passed to the Hugging Face `Trainer` below.

In [11]:
# Wrap both pandas splits in `Dataset` objects (training split first, then validation)
train_dataset, eval_dataset = (
    Dataset.from_pandas(df = split_frame)
    for split_frame in (train_set, eval_set)
)

### Load pretrained RoBERTa tokenizer

* Convert text into numerical form to pass into the transformer

In [12]:
# Fetch the tokenizer that matches the roberta-base checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
)

# Keep a local copy of the tokenizer files next to the model output;
# the call returns the paths of the files it wrote, which the cell displays.
tokenizer.save_pretrained(save_directory = './Commonlit-RoBERTa-Base/tokenizer')

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

('./Commonlit-RoBERTa-Base/tokenizer/tokenizer_config.json',
 './Commonlit-RoBERTa-Base/tokenizer/special_tokens_map.json',
 './Commonlit-RoBERTa-Base/tokenizer/vocab.json',
 './Commonlit-RoBERTa-Base/tokenizer/merges.txt',
 './Commonlit-RoBERTa-Base/tokenizer/added_tokens.json',
 './Commonlit-RoBERTa-Base/tokenizer/tokenizer.json')

In [13]:
# Display the loaded tokenizer's settings (vocabulary size, maximum length, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

### Map tokenized text excerpts to Dataset

In [14]:
# Every excerpt is padded or truncated to exactly this many tokens
MAX_LENGTH = 256


def tokenize_data(dataset):
    """Tokenize the "excerpt" entry of `dataset` to a fixed length of MAX_LENGTH tokens.

    Args:
        dataset: mapping with an "excerpt" entry; when called through
            `Dataset.map(..., batched = True)` this is a batch of rows.

    Returns:
        Whatever the module-level `tokenizer` returns for those excerpts
        (the notebook uses its input_ids and attention_mask).
    """
    tokenizer_options = dict(padding = "max_length", truncation = True, max_length = MAX_LENGTH)
    return tokenizer(dataset["excerpt"], **tokenizer_options)

def _tokenize_split(split):
    """Add the tokenizer outputs to `split` and drop the leftover pandas index column."""
    with_tokens = split.map(tokenize_data, batched = True)
    return with_tokens.remove_columns(["__index_level_0__"])


# Attach tokenized text (input_ids, attention_mask) to each split
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_eval_dataset = _tokenize_split(eval_dataset)

# Have both datasets hand back PyTorch tensors
tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
# Peek at the token ids: in the displayed output each row starts with id 0 and
# the shorter excerpts are right-padded with id 1.
tokenized_train_dataset['input_ids']

tensor([[    0,   133,   745,  ...,     1,     1,     1],
        [    0,   133,  1114,  ...,     1,     1,     1],
        [    0, 37818,     5,  ...,     1,     1,     1],
        ...,
        [    0,   100,  1017,  ...,  6063,  4864,     2],
        [    0,  1121,   209,  ...,     1,     1,     1],
        [    0, 39488,    16,  ...,     1,     1,     1]])

### Load RoBERTa-Base model

In [16]:
# Download the pretrained roberta-base weights from Hugging Face and put a
# sequence-classification head with a single output (num_labels = 1) on top;
# that one logit is read as the predicted score in the prediction cell.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = 1,
)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [17]:
%%capture

# Use the GPU when one is available, otherwise stay on the CPU, and move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

### Set up Trainer instance to train model

In [18]:
# Number of examples per device for both training and evaluation batches
batch_size = 16

training_args = TrainingArguments(
    output_dir="./Commonlit-RoBERTa-Base-CP", # Select model path for checkpoint
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    evaluation_strategy = 'epoch', # Evaluate on the validation set at end of each epoch
    save_strategy = "epoch", # Save checkpoint at end of each epoch
    # Track validation loss (lower is better) and reload the best checkpoint once training ends
    metric_for_best_model = 'eval_loss',
    greater_is_better = False,
    load_best_model_at_end = True,
    report_to = "none",
    # Reuse the notebook-level seed instead of repeating the literal, so the two cannot drift apart
    seed = seed)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_eval_dataset)

In [19]:
# Fine-tune with the settings in `training_args`: the model is evaluated on the validation
# set and checkpointed after every epoch (see the per-epoch loss table in the output).
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.408047
2,No log,0.544274
3,No log,0.444234
4,0.296400,0.277034
5,0.296400,0.333592


TrainOutput(global_step=710, training_loss=0.22970777431004483, metrics={'train_runtime': 438.708, 'train_samples_per_second': 1.618, 'total_flos': 0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 604647424, 'train_mem_gpu_alloc_delta': 1562336768, 'train_mem_cpu_peaked_delta': 4280320, 'train_mem_gpu_peaked_delta': 4541207552})

In [20]:
%%capture
# Switch the model to evaluation mode before predicting, so that training-only
# behaviour such as dropout is turned off and predictions are repeatable.
model.eval()

### Save fine-tuned model

In [21]:
# Write the trainer's current model to its own directory, separate from the per-epoch checkpoints
trainer.save_model(output_dir = './Commonlit-RoBERTa-Base')

### Make predictions

In [22]:
# Make sure the model is in evaluation mode even if this cell is run on its own
model.eval()

preds_targets = []

# Inference only: disable gradient tracking so no autograd graph is built for each
# forward pass (saves memory and time; the predicted values are unaffected).
with torch.no_grad():
    for excerpt in preds_data['excerpt']:
        # Tokenize exactly as during training: fixed length, padded and truncated
        token_seq = tokenizer(excerpt, padding = "max_length", max_length = MAX_LENGTH, truncation = True, return_tensors = "pt")
        token_seq = token_seq.to(device)
        pred = model(**token_seq) # Unpack token sequences tensor
        # One excerpt per call and one output per excerpt: take the single logit as a Python float
        preds_targets.append(pred.logits[0].item())

preds_targets

[-0.007161587942391634,
 -0.3461619019508362,
 -0.18076074123382568,
 -2.6938998699188232,
 -1.7281757593154907,
 -0.8599075675010681,
 0.5470359325408936]

In [23]:
# Pair every test id with its predicted score, in the same row order as `preds_data`
submission_df = pd.DataFrame(
    data = {
        'id' : preds_data['id'],
        'target': preds_targets,
    }
)

In [24]:
# Write the submission file to the working directory, without the DataFrame index column
submission_df.to_csv(path_or_buf = "submission.csv", index = False)