In [1]:
from transformers import * 
import numpy as np 
import pandas as pd
import torch 
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 
import time 
import datetime 
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
import re

PyTorch version 1.7.0+cu110 available.
TensorFlow version 2.5.0 available.


In [2]:
train = pd.read_csv('./data/train.csv') 
test = pd.read_csv('./data/test.csv') 
submission = pd.read_csv('./data/sample_submission.csv') 

train.shape, test.shape, submission.shape

((2834, 6), (7, 4), (7, 2))

In [3]:
train_texts = train['excerpt'].values 
train_targets = train['target'].values

In [4]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large") 

def roberta_tokenizer(sent, MAX_LEN):  
    encoded_dict = tokenizer.encode_plus(
        text = sent, 
        add_special_tokens = True, 
        pad_to_max_length = False, 
        return_attention_mask = True 
    )
    input_id = encoded_dict['input_ids'] 
    attention_mask = encoded_dict['attention_mask'] 
    if len(input_id) > 512: 
        input_id = input_id[:129] + input_id[-383:] 
        attention_mask = attention_maks[:129] + attention_mask[-383:] 
        print("Long Text!! Using Head+Tail Truncation") 
    elif len(input_id) <= 512: 
        input_id = input_id + [0]*(512 - len(input_id)) 
        attention_mask = attention_mask + [0]*(512-len(attention_mask)) 
        
    return input_id, attention_mask

In [5]:
BATCH_SIZE = 4
NUM_EPOCHS = 10 
VALID_SPLIT = 0.1 
MAX_LEN = 512 

In [6]:
def clean_text(s): 
    # find alphabets
    cleaned = re.sub("[^a-zA-Z]", " ", s)
    # convert to lower case
    cleaned = s.lower()
    return cleaned 

In [7]:
N = train_texts.shape[0] 

input_ids = np.zeros((N,MAX_LEN),dtype=int) 
attention_masks = np.zeros((N, MAX_LEN),dtype=int) 
targets = np.zeros((N), dtype=np.double) 

for i in tqdm(range(N), position = 0, leave=True): 
    try: 
        cur_str = train_texts[i] 
        cur_target = train_targets[i] 
        input_id, attention_mask = roberta_tokenizer(cur_str, MAX_LEN=MAX_LEN) 
        input_ids[i,] = input_id 
        attention_masks[i,] = attention_mask 
        targets[i,] = cur_target 
    except Exception as e: 
        print(e)
        print(cur_str) 
        pass

100%|██████████| 2834/2834 [00:02<00:00, 1126.69it/s]


In [8]:
input_ids = torch.tensor(input_ids, dtype=int)
attention_masks = torch.tensor(attention_masks, dtype=int) 
targets = torch.tensor(targets, dtype=torch.float32) 

train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_ids, targets, random_state = 42, test_size=VALID_SPLIT)
train_attention_mask, val_attention_mask, _, _ = train_test_split(attention_masks, targets, random_state = 42, test_size=VALID_SPLIT) 

train_inputs.shape, train_attention_mask.shape, train_targets.shape


(torch.Size([2550, 512]), torch.Size([2550, 512]), torch.Size([2550]))

In [9]:
val_inputs.shape, val_attention_mask.shape, val_targets.shape

(torch.Size([284, 512]), torch.Size([284, 512]), torch.Size([284]))

In [10]:
train_data = TensorDataset(train_inputs, train_attention_mask, train_targets) 
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 

validation_data = TensorDataset(val_inputs, val_attention_mask, val_targets)
validation_sampler = SequentialSampler(validation_data) 
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE) 

In [11]:
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=1) 
model.cuda() 
print() # avoid printing model structure 

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif




In [12]:
def format_time(elapsed):  
    elapsed_rounded = int(round(elapsed)) 
    return str(datetime.timedelta(seconds=elapsed_rounded)) 


device = torch.device("cuda") 


In [13]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps) 

model.zero_grad() 

best_val_loss = 1e9 

for epoch_i in range(0,epochs): 
    print("")
    print("======= Epoch {:} / {:} =======".format(epoch_i+1, epochs)) 
    print("Training ...")
    t0 = time.time() 
    total_loss = 0 
    model.train()
    for step, batch in enumerate(train_dataloader): 
        if step%20 == 0 and not step == 0: 
            elapsed = format_time(time.time()-t0)
            print('   Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) 
            print('   Current average loss = {}'.format(total_loss / step)) 
            
        batch = tuple(t.to(device) for t in batch) 
        
        b_input_ids, b_input_masks, b_target = batch 
        
        outputs = model(b_input_ids, 
                        token_type_ids = None, 
                        attention_mask = b_input_masks,
                        labels = b_target) 
        
        # using MSE loss 
        loss = outputs[0] 
        
        total_loss += loss.item() 
        loss.backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        optimizer.step() 
        scheduler.step() 
        model.zero_grad() 
        
    avg_train_loss = total_loss / len(train_dataloader) 
    
    print("Average training loss = {}".format(avg_train_loss)) 
    print("Training epoch took = {}".format(format_time(time.time() - t0)))
 
    ##### validation ##### 
    print("")
    print("Running Validation...") 
    
    t0 = time.time() 
    model.eval() 
    
    eval_loss = 0 
    nb_eval_steps, nb_eval_examples = 0,0 
    
    for batch in validation_dataloader: 
        batch = tuple(t.to(device) for t in batch) 
        b_input_ids, b_input_masks, b_target = batch 
        with torch.no_grad(): 
            outputs = model(b_input_ids, 
                            token_type_ids = None, 
                            attention_mask = b_input_masks, 
                            labels = b_target) 
        loss = outputs[0] 
        
        eval_loss += loss.item() 

    avg_val_loss = eval_loss / len(validation_dataloader)
    print("Average validation loss = {}".format(avg_val_loss)) 
    print("Validation took: {:}".format(format_time(time.time()-t0))) 
    
    if avg_val_loss < best_val_loss: 
        best_val_loss = avg_val_loss 
        torch.save(model.state_dict(), "RoBERTa_large_" + str(epoch_i+1)) 
    
print("")
print("Training Complete!")


Training ...
   Batch    20 of   638. Elapsed: 0:00:10.
   Current average loss = 1.6143383882939815
   Batch    40 of   638. Elapsed: 0:00:20.
   Current average loss = 1.2977006256580352
   Batch    60 of   638. Elapsed: 0:00:31.
   Current average loss = 1.2457288133601347
   Batch    80 of   638. Elapsed: 0:00:41.
   Current average loss = 1.2682549266144634
   Batch   100 of   638. Elapsed: 0:00:51.
   Current average loss = 1.179296936392784
   Batch   120 of   638. Elapsed: 0:01:01.
   Current average loss = 1.1275114925578236
   Batch   140 of   638. Elapsed: 0:01:11.
   Current average loss = 1.0678264158777893
   Batch   160 of   638. Elapsed: 0:01:21.
   Current average loss = 1.0097389032715
   Batch   180 of   638. Elapsed: 0:01:32.
   Current average loss = 0.966230737304108
   Batch   200 of   638. Elapsed: 0:01:42.
   Current average loss = 0.9048311510961503
   Batch   220 of   638. Elapsed: 0:01:52.
   Current average loss = 0.906846819403158
   Batch   240 of   638.

   Batch   540 of   638. Elapsed: 0:04:37.
   Current average loss = 0.21553837612823204
   Batch   560 of   638. Elapsed: 0:04:47.
   Current average loss = 0.21583792032407864
   Batch   580 of   638. Elapsed: 0:04:57.
   Current average loss = 0.21594577036027252
   Batch   600 of   638. Elapsed: 0:05:08.
   Current average loss = 0.21210878489228585
   Batch   620 of   638. Elapsed: 0:05:18.
   Current average loss = 0.21045429894039708
Average training loss = 0.20952675357274128
Training epoch took = 0:05:27

Running Validation...
Average validation loss = 0.24676763305676655
Validation took: 0:00:10

Training ...
   Batch    20 of   638. Elapsed: 0:00:11.
   Current average loss = 0.13915174249559642
   Batch    40 of   638. Elapsed: 0:00:21.
   Current average loss = 0.15996879409067333
   Batch    60 of   638. Elapsed: 0:00:31.
   Current average loss = 0.16275491500273348
   Batch    80 of   638. Elapsed: 0:00:41.
   Current average loss = 0.15937889369670302
   Batch   100 of

   Batch   400 of   638. Elapsed: 0:03:23.
   Current average loss = 0.07422086712787859
   Batch   420 of   638. Elapsed: 0:03:33.
   Current average loss = 0.07314619978424161
   Batch   440 of   638. Elapsed: 0:03:44.
   Current average loss = 0.07403147812318904
   Batch   460 of   638. Elapsed: 0:03:54.
   Current average loss = 0.07392048978793395
   Batch   480 of   638. Elapsed: 0:04:04.
   Current average loss = 0.07456916815814717
   Batch   500 of   638. Elapsed: 0:04:14.
   Current average loss = 0.07513615748193116
   Batch   520 of   638. Elapsed: 0:04:24.
   Current average loss = 0.07470798383807191
   Batch   540 of   638. Elapsed: 0:04:34.
   Current average loss = 0.07464414634138208
   Batch   560 of   638. Elapsed: 0:04:44.
   Current average loss = 0.07400515451611551
   Batch   580 of   638. Elapsed: 0:04:55.
   Current average loss = 0.07362427026483005
   Batch   600 of   638. Elapsed: 0:05:05.
   Current average loss = 0.0735316483831654
   Batch   620 of   63

   Batch   240 of   638. Elapsed: 0:02:02.
   Current average loss = 0.03676180217929262
   Batch   260 of   638. Elapsed: 0:02:13.
   Current average loss = 0.0367840432038065
   Batch   280 of   638. Elapsed: 0:02:23.
   Current average loss = 0.036729433295632975
   Batch   300 of   638. Elapsed: 0:02:33.
   Current average loss = 0.036359845565554376
   Batch   320 of   638. Elapsed: 0:02:43.
   Current average loss = 0.03621090354681655
   Batch   340 of   638. Elapsed: 0:02:53.
   Current average loss = 0.035956274449263755
   Batch   360 of   638. Elapsed: 0:03:04.
   Current average loss = 0.035781770514828774
   Batch   380 of   638. Elapsed: 0:03:14.
   Current average loss = 0.03528091569912105
   Batch   400 of   638. Elapsed: 0:03:24.
   Current average loss = 0.03509451700272621
   Batch   420 of   638. Elapsed: 0:03:34.
   Current average loss = 0.03510170080269399
   Batch   440 of   638. Elapsed: 0:03:44.
   Current average loss = 0.03484723217565757
   Batch   460 of 

In [14]:
best_val_loss

0.24199586408868642

# Make Prediction

In [13]:
# load best model 
test_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1) 
checkpoint = torch.load('RoBERTa_baseline_6') 
test_model.load_state_dict(checkpoint)
test_model.cuda() 

test_model.eval() # convert to evaluation mode. 
print() # avoid printing model structure 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie




In [19]:
test_texts = test['excerpt'].values 

predictions = [] 

for text in tqdm(test_texts, position=0,leave=True): 
    input_id, attention_mask = roberta_tokenizer(text, MAX_LEN=MAX_LEN) 
    input_id = torch.tensor(input_id, dtype=int) 
    attention_mask = torch.tensor(attention_mask, dtype=int) 
    
    input_id = torch.reshape(input_id, (-1,MAX_LEN)) 
    attention_mask = torch.reshape(attention_mask, (-1,MAX_LEN)) 
    
    input_id = input_id.to(device) 
    attention_mask = attention_mask.to(device) 
    
    with torch.no_grad(): 
            outputs = test_model(input_id, 
                                 token_type_ids=None, 
                                 attention_mask=attention_mask) 
    
    yhat = outputs[0].item() 
    predictions.append(yhat)
    

100%|██████████| 7/7 [00:00<00:00, 55.60it/s]


In [22]:
submission.iloc[:,1] = predictions 

submission

Unnamed: 0,id,target
0,c0f722661,0.372574
1,f0953f0a5,-0.225584
2,0df072751,-0.119132
3,04caf4e0c,-2.694268
4,0e63f8bea,-1.715674
5,12537fe78,-0.712704
6,965e592c0,0.168142
