In [1]:
from transformers import * 
import numpy as np 
import pandas as pd
import torch 
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 
import time 
import datetime 
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import re

PyTorch version 1.7.0+cu110 available.
TensorFlow version 2.5.0 available.


In [2]:
# Read the training set, the test set and the sample-submission template.
train, test, submission = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/sample_submission.csv'),
)

# Display the (rows, columns) of each frame as a quick sanity check.
tuple(frame.shape for frame in (train, test, submission))

((2834, 6), (7, 4), (7, 2))

In [21]:
# Character count of every excerpt, stored as a new column on the training frame.
train['length'] = train['excerpt'].map(len)

In [23]:
# Summary statistics of the per-excerpt character counts.
train['length'].describe()

count    2834.000000
mean      971.732886
std       117.257578
min       669.000000
25%       885.000000
50%       971.000000
75%      1058.000000
max      1341.000000
Name: length, dtype: float64

In [3]:
# Extract the raw excerpt texts and their target values from the training
# frame as NumPy arrays.
train_texts, train_targets = (
    train[column].values for column in ('excerpt', 'target')
)

In [4]:
# Tokenizer for the google/electra-large-discriminator checkpoint; used as a
# module-level global by electra_tokenizer() below.
tokenizer = ElectraTokenizer.from_pretrained("google/electra-large-discriminator") 

def electra_tokenizer(sent, MAX_LEN):
    """Encode one text into fixed-length input ids and an attention mask.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    included, no padding or truncation requested from the tokenizer itself)
    and then brought to exactly ``MAX_LEN`` entries here:

    * longer sequences keep the first ``MAX_LEN // 4 + 1`` ids and the last
      ``MAX_LEN - (MAX_LEN // 4 + 1)`` ids (129 + 383 for ``MAX_LEN == 512``),
      and a notice is printed;
    * shorter sequences are right-padded with 0 in both the ids and the mask.

    Args:
        sent: The text to encode.
        MAX_LEN: Length of the returned lists.

    Returns:
        A tuple ``(input_id, attention_mask)`` of two lists of ints, each of
        length ``MAX_LEN``.
    """
    encoded_dict = tokenizer.encode_plus(
        text = sent,
        add_special_tokens = True,
        return_attention_mask = True
    )
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    if len(input_id) > MAX_LEN:
        # Head+tail truncation: keep the start and the end of the sequence
        # and drop the middle. Slicing from an explicit start index (rather
        # than a negative one) stays correct when the tail length is 0.
        head_len = min(MAX_LEN, MAX_LEN // 4 + 1)
        tail_len = MAX_LEN - head_len
        tail_start = len(input_id) - tail_len
        input_id = input_id[:head_len] + input_id[tail_start:]
        attention_mask = attention_mask[:head_len] + attention_mask[tail_start:]
        print("Long Text!! Using Head+Tail Truncation")
    else:
        # Right-pad with 0 up to MAX_LEN; padded positions get mask 0.
        # NOTE(review): 0 is assumed to be the tokenizer's pad id - confirm.
        input_id = input_id + [0] * (MAX_LEN - len(input_id))
        attention_mask = attention_mask + [0] * (MAX_LEN - len(attention_mask))

    return input_id, attention_mask

In [6]:
# Run configuration.
MAX_LEN = 512        # number of token ids kept per encoded excerpt
VALID_SPLIT = 0.1    # fraction of rows held out for validation
NUM_EPOCHS = 10      # number of training epochs
BATCH_SIZE = 8       # examples per batch in the data loaders

In [7]:
# Encode every training excerpt into fixed-size id / mask arrays.
N = train_texts.shape[0]

input_ids = np.zeros((N, MAX_LEN), dtype=int)
attention_masks = np.zeros((N, MAX_LEN), dtype=int)
targets = np.zeros((N), dtype=np.double)

# Row indices whose excerpt could not be encoded. They are removed after the
# loop so that no all-zero row with a 0.0 target ends up in the training data.
failed_rows = []

for i in tqdm(range(N), position = 0, leave=True):
    # Bound before the try so the handler can always print it.
    cur_str = train_texts[i]
    try:
        input_id, attention_mask = electra_tokenizer(cur_str, MAX_LEN=MAX_LEN)
        input_ids[i,] = input_id
        attention_masks[i,] = attention_mask
        targets[i,] = train_targets[i]
    except Exception as e:
        # Best effort: report the offending excerpt and keep going.
        print(e)
        print(cur_str)
        failed_rows.append(i)

if failed_rows:
    keep = np.ones(N, dtype=bool)
    keep[failed_rows] = False
    input_ids = input_ids[keep]
    attention_masks = attention_masks[keep]
    targets = targets[keep]
    print("Dropped {} excerpt(s) that failed tokenization".format(len(failed_rows)))

100%|██████████| 2834/2834 [00:14<00:00, 201.30it/s]


In [8]:
# Convert the NumPy buffers to tensors: int64 ids and masks, float32 targets.
input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_masks = torch.tensor(attention_masks, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.float32)

# Split ids, masks and targets in ONE call so the three stay row-aligned by
# construction, instead of relying on two separate calls sharing a seed.
(train_inputs, val_inputs,
 train_attention_mask, val_attention_mask,
 train_targets, val_targets) = train_test_split(
    input_ids, attention_masks, targets,
    random_state = 84, test_size=VALID_SPLIT,
)

# Display the shapes of the training tensors.
train_inputs.shape, train_attention_mask.shape, train_targets.shape


(torch.Size([2550, 512]), torch.Size([2550, 512]), torch.Size([2550]))

In [9]:
# Shapes of the validation tensors produced by the train/validation split.
val_inputs.shape, val_attention_mask.shape, val_targets.shape

(torch.Size([284, 512]), torch.Size([284, 512]), torch.Size([284]))

In [10]:
# Bundle (ids, mask, target) triples into datasets.
train_data = TensorDataset(train_inputs, train_attention_mask, train_targets)
validation_data = TensorDataset(val_inputs, val_attention_mask, val_targets)

# Training examples are drawn in random order; validation examples in
# their stored order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# Both loaders use the same batch size.
train_dataloader, validation_dataloader = (
    DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    for dataset, sampler in (
        (train_data, train_sampler),
        (validation_data, validation_sampler),
    )
)

In [11]:
# Load the ELECTRA-large discriminator with a single-output head
# (num_labels=1) and move it to the GPU.
model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-large-discriminator",
    num_labels=1,
).cuda()
print() # avoid printing model structure

Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier




In [12]:
def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    The value is rounded to the nearest whole second before formatting,
    e.g. ``18.4`` becomes ``'0:00:18'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


# CUDA device that every batch is moved to before the forward pass.
device = torch.device("cuda") 


In [13]:
# Fine-tune the model: one training pass and one validation pass per epoch,
# saving a checkpoint whenever the average validation loss improves.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# Take the epoch count from the configuration cell instead of repeating the
# literal here, so the two cannot drift apart.
epochs = NUM_EPOCHS
total_steps = len(train_dataloader) * epochs
# Linear learning-rate schedule over all optimizer steps, with no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

model.zero_grad()

# Sentinel larger than any expected loss so the first epoch always checkpoints.
best_val_loss = 1e9

for epoch_i in range(0, epochs):
    print("")
    print("======= Epoch {:} / {:} =======".format(epoch_i+1, epochs))
    print("Training ...")
    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 20 batches (skipping step 0, which would
        # divide by zero in the running average).
        if step%20 == 0 and not step == 0:
            elapsed = format_time(time.time()-t0)
            print('   Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print('   Current average loss = {}'.format(total_loss / step))

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_masks, b_target = batch

        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_masks,
                        labels = b_target)

        # using MSE loss
        # The loss is the first element of the output because labels were passed.
        loss = outputs[0]

        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    print("Average training loss = {}".format(avg_train_loss))
    print("Training epoch took = {}".format(format_time(time.time() - t0)))

    ##### validation #####
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    eval_loss = 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_masks, b_target = batch
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids = None,
                            attention_mask = b_input_masks,
                            labels = b_target)
        loss = outputs[0]

        eval_loss += loss.item()

    avg_val_loss = eval_loss / len(validation_dataloader)
    print("Average validation loss = {}".format(avg_val_loss))
    print("Validation took: {:}".format(format_time(time.time()-t0)))

    # Keep the weights of every epoch that sets a new best validation loss;
    # the file name carries the 1-based epoch number.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "ELECTRA_large_" + str(epoch_i+1))

print("")
print("Training Complete!")


Training ...
   Batch    20 of   319. Elapsed: 0:00:18.
   Current average loss = 0.963998019695282
   Batch    40 of   319. Elapsed: 0:00:35.
   Current average loss = 0.8611662644892931
   Batch    60 of   319. Elapsed: 0:00:52.
   Current average loss = 0.774686798453331
   Batch    80 of   319. Elapsed: 0:01:10.
   Current average loss = 0.7445972943678498
   Batch   100 of   319. Elapsed: 0:01:27.
   Current average loss = 0.6895752669870854
   Batch   120 of   319. Elapsed: 0:01:45.
   Current average loss = 0.6539979668334126
   Batch   140 of   319. Elapsed: 0:02:02.
   Current average loss = 0.6126591837831906
   Batch   160 of   319. Elapsed: 0:02:20.
   Current average loss = 0.603658590791747
   Batch   180 of   319. Elapsed: 0:02:38.
   Current average loss = 0.5974167899125152
   Batch   200 of   319. Elapsed: 0:02:55.
   Current average loss = 0.5732560712657869
   Batch   220 of   319. Elapsed: 0:03:12.
   Current average loss = 0.5664583317935467
   Batch   240 of   3

   Batch   140 of   319. Elapsed: 0:02:02.
   Current average loss = 0.04029890782465892
   Batch   160 of   319. Elapsed: 0:02:20.
   Current average loss = 0.0403666461177636
   Batch   180 of   319. Elapsed: 0:02:37.
   Current average loss = 0.04124160390864644
   Batch   200 of   319. Elapsed: 0:02:55.
   Current average loss = 0.041473537236452106
   Batch   220 of   319. Elapsed: 0:03:12.
   Current average loss = 0.041926716458560394
   Batch   240 of   319. Elapsed: 0:03:29.
   Current average loss = 0.04161102710058913
   Batch   260 of   319. Elapsed: 0:03:47.
   Current average loss = 0.04303386415211627
   Batch   280 of   319. Elapsed: 0:04:04.
   Current average loss = 0.04310386467259377
   Batch   300 of   319. Elapsed: 0:04:22.
   Current average loss = 0.04314722948397199
Average training loss = 0.043240444870449625
Training epoch took = 0:04:38

Running Validation...
Average validation loss = 0.27644926961511374
Validation took: 0:00:10

Training ...
   Batch    20 

In [14]:
# Lowest average validation loss recorded by the training loop.
best_val_loss

0.27644926961511374