In [None]:
import numpy as np 
import pandas as pd 
import os, random, sys, time, re

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext import vocab, data

from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import StratifiedKFold, KFold

import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Input locations ---
DATA_PATH = "../input/commonlitreadabilityprize/"
EMB_PATH = "../input/embeddings-glove-crawl-torch-cached"
# Embedding file to load; 'glove.840B.300d.txt' is the alternative noted by the author.
EMB_FILENAME = 'crawl-300d-2M.vec'

# --- Training setup ---
N_FOLDS, EPOCHES, BATCH_SIZE = 5, 25, 256
# Run on the GPU whenever one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda'

In [None]:
SEED = 42


def _seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``."""
    # Exported for any process that reads PYTHONHASHSEED from the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    # Ask cuDNN to pick deterministic kernels.
    torch.backends.cudnn.deterministic = True


_seed_everything(SEED)

In [None]:
# One shared tokenizer instance, reused for every excerpt.
tknzr = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
)


def tokenizer(text):
    """Return the list of tokens produced by the shared ``tknzr`` for ``text``."""
    tokens = tknzr.tokenize(text)
    return tokens

In [None]:
def _read_indexed_csv(filename):
    """Read ``filename`` from DATA_PATH into a DataFrame indexed by the 'id' column."""
    return pd.read_csv(os.path.join(DATA_PATH, filename), index_col='id')


# test.csv is not read with pandas here; it is loaded through torchtext further down.
train_csv = _read_indexed_csv('train.csv')
subm = _read_indexed_csv('sample_submission.csv')

# Binary label (1 where target > 0), passed to the stratified CV splitter.
y = (train_csv['target'].to_numpy() > 0).astype(int)

In [None]:
# Describe how each CSV column is parsed:
#   txt_field - tokenized text mapped through a vocabulary
#   num_field - a single float value, no vocabulary
#   raw_field - passed through untouched
txt_field = data.Field(
    sequential=True,
    tokenize=tokenizer,
    include_lengths=False,
    use_vocab=True,
)
num_field = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
raw_field = data.RawField()

# Columns present in both train.csv and test.csv, in file order.
test_fields = [
    ('id', raw_field),
    ('url_legal', raw_field),
    ('license', raw_field),
    ('excerpt', txt_field),
]

# train.csv carries two extra numeric columns after the shared ones.
train_fields = test_fields + [
    ('target', num_field),
    ('standard_error', num_field),
]

In [None]:
def _load_split(filename, fields):
    """Parse a CSV file under DATA_PATH into a TabularDataset, skipping its header row."""
    return data.TabularDataset(
        path=os.path.join(DATA_PATH, filename),
        format='csv',
        fields=fields,
        skip_header=True,
    )


# Load both CSV files with their matching column layouts.
train_ds = _load_split('train.csv', train_fields)
test_ds = _load_split('test.csv', test_fields)

In [None]:
# Load the locally saved vectors (EMB_PATH doubles as the cache directory).
vec = vocab.Vectors(os.path.join(EMB_PATH, EMB_FILENAME), cache=EMB_PATH)
# Build the vocabulary from the train and test datasets and attach the vectors.
txt_field.build_vocab(train_ds, test_ds, vectors=vec, max_size=300000)

# Embedding matrix aligned with the vocabulary of the 'excerpt' field.
embs_vocab = train_ds.fields['excerpt'].vocab.vectors
vocab_size = embs_vocab.size()[0]
print('Embedding vocab size: ', vocab_size)

In [None]:
class BatchWrapper:
    """Wrap a batch iterator so it yields plain tuples instead of batch objects.

    Args:
        dataloader: iterable of batch objects exposing ``excerpt`` and either
            ``target`` (train/validation) or ``id`` (test) attributes.
        mode: ``'test'`` yields ``(batch.id, batch.excerpt)``; any other value
            yields ``(batch.excerpt, batch.target)``.
    """

    def __init__(self, dataloader, mode='train'):
        self.dataloader, self.mode = dataloader, mode

    def __iter__(self):
        if self.mode == 'test':
            for batch in self.dataloader:
                yield (batch.id, batch.excerpt)
        else:
            for batch in self.dataloader:
                yield (batch.excerpt, batch.target)

    def __len__(self):
        # Number of batches in the wrapped loader. The original read the
        # non-existent attribute ``self.dl`` and raised AttributeError.
        return len(self.dataloader)

def wrapper(ds, mode='train', **kwargs):
    """Create a ``BucketIterator`` over ``ds`` on DEVICE and wrap it in ``BatchWrapper``.

    Extra keyword arguments are forwarded to ``data.BucketIterator``.
    """
    return BatchWrapper(
        data.BucketIterator(ds, device=DEVICE, **kwargs),
        mode,
    )

# Shuffled, seeded stratified splitter shared by the cross-validation loop.
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


def splits_cv(dataset, cv=cv, y=y, batch_size=BATCH_SIZE):
    """
        Yield, for every fold of ``cv``, a tuple of wrapped loaders
        (train, validation) built from the matching subsets of ``dataset``.
        ``y`` holds the labels handed to ``cv.split``; empty subsets are skipped.
    """
    examples = dataset.examples
    for train_idx, valid_idx in cv.split(range(len(dataset)), y):
        subsets = (
            [examples[i] for i in train_idx],
            [examples[i] for i in valid_idx],
        )
        yield tuple(
            wrapper(data.Dataset(subset, dataset.fields), batch_size=batch_size)
            for subset in subsets
            if subset
        )

In [None]:
# Test loader: fixed order, single pass, yields (ids, excerpt) tuples.
test_loader = wrapper(
    test_ds,
    mode='test',
    batch_size=BATCH_SIZE,
    shuffle=False,
    repeat=False,
)

In [None]:
class PositionAwareAttention(nn.Module):
    """Additive attention over a sequence, conditioned on a hidden state.

    Args:
        input_size: number of features per time step of ``x`` (and the total
            number of features in ``h`` per sample).
        attn_size: width of the intermediate attention representation.
    """

    def __init__(self, input_size, attn_size):
        super().__init__()
        self.input_size  = input_size
        self.wx = nn.Conv1d(input_size, attn_size, 1, bias=True)  # from input to attention matrix
        self.wh = nn.Conv1d(input_size, attn_size, 1, bias=False) # from hidden to attention matrix
        self.wt = nn.Conv1d(attn_size, 1, 1, bias=True)           # from attention matrix to score

    def forward(self, x, h):
        """Return the attention-weighted sum of ``x`` over time.

        Args:
            x: tensor laid out as (seq, batch, input_size).
            h: hidden state laid out as (n, batch, hidden) with
               n * hidden == input_size.

        Returns:
            Tensor of shape (batch, input_size).
        """
        # (seq, batch, feat) -> (batch, feat, seq): channels-first layout for Conv1d.
        x = x.permute(1, 2, 0)
        wx = self.wx(x)
        # Flatten the per-sample hidden state into a single length-1 "sequence".
        wh = self.wh(h.permute(1, 0, 2).contiguous().view(-1, self.input_size, 1))
        score = self.wt(torch.tanh(wx + wh))
        score = F.softmax(score, dim=2)  # normalize over the time axis
        # Squeeze only the singleton score axis; a bare squeeze() would also
        # drop the batch axis when the batch holds a single sample.
        out = torch.bmm(score, x.permute(0, 2, 1)).squeeze(1)

        return out


class RecNN(nn.Module):
    """Recurrent regressor: frozen embeddings -> LSTM + GRU -> max-pool & attention -> scalar.

    Args:
        embs_vocab: 2-D tensor of pretrained embeddings (vocab_size, emb_dim).
        hidden_size: hidden size of both recurrent layers.
        layers: number of stacked recurrent layers.
        atten_features: width of the attention representation.
        dropout: dropout passed to the LSTM and GRU.
        bidirectional: whether the recurrent layers are bidirectional.
    """

    def __init__(self, embs_vocab, hidden_size, layers=1, atten_features = 24,
                 dropout=0., bidirectional=False):
        super().__init__()

        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.num_layers = layers
        self.emb_dim = embs_vocab.size(1)
        self.emb = nn.Embedding(embs_vocab.size(0), self.emb_dim)
        self.emb.weight.data.copy_(embs_vocab) # load pretrained vectors
        self.emb.weight.requires_grad = False # make embedding non trainable

        self.lstm = nn.LSTM(self.emb_dim, self.hidden_size,
                            num_layers=layers, bidirectional=bidirectional, dropout=dropout)

        self.gru = nn.GRU(self.emb_dim, self.hidden_size,
                            num_layers=layers, bidirectional=bidirectional, dropout=dropout)
        # Pointwise (kernel size 1) projection of the embeddings feeding the GRU.
        self.pregru = nn.Conv1d(self.emb_dim, self.emb_dim, 1, bias=True)
        self.atten = PositionAwareAttention(hidden_size*(bidirectional+1), atten_features)

        # Input is the concatenation of max-pooled and attention-pooled features.
        self.out = nn.Linear(2* hidden_size*(bidirectional+1), 32)
        self.last = nn.Linear(32, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: integer token ids laid out as (seq, batch).

        Returns:
            Tensor of shape (batch,), also when batch == 1.
        """
        embs = self.emb(x)

        lstm, (h1, c) = self.lstm(embs)
        # Conv1d wants (batch, channels, seq); permute there and back.
        gru = F.relu(self.pregru(embs.permute(1,2,0)), inplace=True).permute(2,0,1)

        # The GRU starts from the LSTM's final hidden state.
        gru, h2 = self.gru(gru, h1)
        lstm = lstm + gru

        x_max, _ = lstm.max(dim=0, keepdim=False)
        x_atten = self.atten(lstm, h1+h2)
        out = self.out(torch.cat([x_max, x_atten],dim = 1))
        # Drop only the trailing size-1 feature axis so a single-sample batch
        # still yields a 1-D tensor instead of a 0-d scalar.
        out = self.last(F.relu(out)).squeeze(-1)
        return out

In [None]:
### Table layout for the per-epoch training log
# Two-line column header printed once per fold.
header = (
    '\n'
    '             Train        Validation\n'
    'Epoch |  MSE  |  RMSE |  MSE  |  RMSE | Time, m\n'
)
# Row template: epoch number, four metric columns, elapsed minutes.
raw_line = ''.join((
    '{:6d}',
    '\u2502{:7.3f}' * 4,
    '\u2502{:6.2f}',
))

In [None]:
@torch.no_grad()
def validation_fn(model, loader, loss_fn):
    """Evaluate ``model`` on ``loader`` and return the mean of the per-batch losses.

    The model is switched to eval mode and left there. Every batch counts
    equally in the average, whatever its size.

    Args:
        model: module called as ``model(texts)``.
        loader: iterable of ``(texts, target)`` pairs.
        loss_fn: callable ``loss_fn(outputs, target)`` returning a scalar tensor.

    Returns:
        The arithmetic mean of ``loss.item()`` over all batches.
    """
    model.eval()
    batch_losses = [
        loss_fn(model(texts), target).item()
        for texts, target in loader
    ]
    return np.array(batch_losses).mean()

def oof_preds(train_ds, test_loader, embs_vocab,
              hidden_size=128, bidirectional=True, epochs = EPOCHES):
    """Train one ``RecNN`` per CV fold and accumulate averaged test predictions.

    For each (train, validation) loader pair from ``splits_cv`` a fresh model
    is trained with AdamW and MSE loss for ``epochs`` epochs, printing one
    table row per epoch (train/validation MSE and RMSE, elapsed minutes).
    After training, the fold's test predictions divided by ``N_FOLDS`` are
    added to the ``'target'`` column of the module-level ``subm`` frame.

    Note: ``subm`` is modified in place by addition, so running this function
    again without reloading ``subm`` stacks a second set of predictions on top.

    Args:
        train_ds: dataset split into folds by ``splits_cv``.
        test_loader: iterable yielding ``(ids, texts)`` batches.
        embs_vocab: pretrained embedding matrix passed to ``RecNN``.
        hidden_size: hidden size of the recurrent layers.
        bidirectional: whether the recurrent layers are bidirectional.
        epochs: number of training epochs per fold.

    Returns:
        None.
    """
    for loader, vloader in splits_cv(train_ds, cv):

        model = RecNN(embs_vocab, hidden_size,
                      dropout=0.1, bidirectional=bidirectional).to(DEVICE)

        optimizer = optim.AdamW(model.parameters(), 1e-3, betas=(0.75, 0.999), weight_decay=1e-1)
        loss_fn = torch.nn.MSELoss()

        print(header)
        for epoch in range(1, epochs+1):
            start_time = time.time()
            tloss = []
            # validation_fn leaves the model in eval mode, so re-enable
            # training mode at the start of every epoch.
            model.train()

            for texts, target in loader:
                optimizer.zero_grad()
                outputs = model(texts)
                loss = loss_fn(outputs, target)
                tloss.append(loss.item())
                loss.backward()
                optimizer.step()

            tloss = np.array(tloss).mean()
            vloss = validation_fn(model, vloader, loss_fn)
            tmetric = tloss**.5
            vmetric = vloss**.5
            elapsed_min = (time.time()-start_time)/60
            print(raw_line.format(epoch,tloss,tmetric,vloss,vmetric,elapsed_min))

        # Get prediction for test set.
        # Switch to eval mode explicitly rather than relying on the last
        # validation pass having done so (it never runs when epochs == 0).
        model.eval()
        ids, preds = [], []
        with torch.no_grad():
            for batch_ids, texts in test_loader:
                outputs = model(texts)
                ids += batch_ids
                # Flatten so np.concatenate always receives 1-D arrays,
                # including for a batch that holds a single sample.
                preds.append(outputs.reshape(-1).cpu().numpy())

        # Save prediction of test set: each fold contributes 1/N_FOLDS of the mean.
        preds = np.concatenate(preds)
        subm.loc[ids, 'target']  =  subm.loc[ids, 'target'].values + preds / N_FOLDS

In [None]:
oof_preds(train_ds, test_loader, embs_vocab, epochs = EPOCHES)

In [None]:
subm.to_csv('submission.csv')