In [None]:
import os
import re
import time
import string
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn

import seaborn as sns

from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


from collections import defaultdict

In [None]:
# Load the two pickled lookup tables shipped with the baseline model dataset.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point these paths at artifacts you produced or trust.
with open('../input/model-baseline/word_doc_freq', 'rb') as file:
    word_doc_freq = pickle.load(file)
with open('../input/model-baseline/vocab', 'rb') as file:
    vocab = pickle.load(file)

# Sanity check: report the size of each table.
for table in (word_doc_freq, vocab):
    print(len(table))

In [None]:

# Read the competition test split and preview the first rows.
test_csv_path = '../input/commonlitreadabilityprize/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

# Load GloVe 100-d vectors

In [None]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each non-empty line is "<word> <v1> <v2> ... <vN>", whitespace separated.
glove_path='../input/glove6b100dtxt/glove.6B.100d.txt'
glove_embeddings={}
# Pin the encoding so parsing does not depend on the platform's locale default.
with open(glove_path, encoding='utf-8') as file:
    for line in file:
        fields=line.split()
        if not fields:
            # Tolerate blank lines instead of failing on fields[0].
            continue
        word=fields[0]
        # np.float (an alias of the builtin float) was removed in NumPy 1.24;
        # np.float64 is the same dtype and works on old and new NumPy alike.
        v=np.array(fields[1:]).astype(np.float64)
        glove_embeddings[word]=v
print(len(glove_embeddings))

In [None]:
class Tokenizer:
    """Callable that converts raw text into a flat list of lowercase words.

    The text is first split by the spaCy ``English`` pipeline. Tokens that
    spaCy flags as number-like (``like_num``) or whose text is empty are
    dropped. Every remaining token is lowercased, stripped, has each
    ``string.punctuation`` character replaced by a space, and is then split
    on spaces, so a token such as ``don't`` yields ``['don', 't']``.

    Note: ``self.lemmatizer`` is created in ``__init__`` but is not used by
    ``__call__``.
    """

    # Translation table mapping every punctuation character to a single space.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.nlp = English()

    def __call__(self, doc):
        """Return the list of cleaned word pieces for the string ``doc``."""
        words = []
        for tok in self.nlp(doc):
            if tok.like_num or tok.text == '':
                continue
            cleaned = tok.lower_.strip().translate(self._PUNCT_TO_SPACE)
            # Splitting on a literal space can yield empty pieces; drop them.
            words.extend(piece for piece in cleaned.split(' ') if piece != '')
        return words

# Creating the Masked Word Document

In [None]:
def get_masked_doc(doc):
    """Replace infrequent words of a tokenized document with ``'<MASK>'``.

    Args:
        doc: iterable of word strings.

    Returns:
        A new list with the same length and order as ``doc``. A word is kept
        when its count in the module-level ``word_doc_freq`` table is at
        least 5; otherwise it is replaced by the string ``'<MASK>'``.

    Words missing from ``word_doc_freq`` are treated as having a count of 0
    and are therefore masked. The lookup uses ``.get`` on purpose: indexing
    with ``[]`` raises ``KeyError`` on a plain dict, and on a ``defaultdict``
    it silently inserts the unseen word, which would change what later
    membership tests on ``word_doc_freq`` (e.g. in ``get_rare_words``) see.
    """
    return [
        word if word_doc_freq.get(word, 0)>=5 else '<MASK>'
        for word in doc
    ]

def get_rare_words(doc):
    """Collect the distinct infrequent words of a tokenized document.

    A word qualifies when it is present in the module-level ``word_doc_freq``
    table with a count below 5; words absent from the table are ignored.

    Args:
        doc: iterable of word strings.

    Returns:
        A list of the qualifying words without duplicates (built from a set,
        so the order is not meaningful).
    """
    rare={
        word
        for word in doc
        if (word in word_doc_freq) and word_doc_freq[word]<5
    }
    return list(rare)


# Derive the three per-excerpt feature columns consumed by the Dataset class.
tokenizer = Tokenizer()

# Tokenized excerpt (list of lowercase word strings).
test_df['doc'] = test_df['excerpt'].apply(tokenizer)
# NOTE: masked_doc is computed before rare_words; keep this order, since both
# helpers read the shared word_doc_freq table.
test_df['masked_doc'] = test_df['doc'].apply(get_masked_doc)
test_df['rare_words'] = test_df['doc'].apply(get_rare_words)
test_df.head()

# Configuration

In [None]:
# Number of token ids fed to the model per excerpt (longer docs are truncated,
# shorter ones are zero-padded by Dataset.__getitem__).
MAX_SEQ_LEN = 100
# NOTE(review): BATCH_SIZE is not referenced by the inference cells below,
# which pass batch_size=200 to the DataLoader directly.
BATCH_SIZE = 128
# Size of the vocabulary table loaded from the baseline artifacts.
vocab_len = len(vocab)

In [None]:
# Statistics used to map the models' outputs back to the target scale
# (infer() computes target_std * y_hat + target_mean).
# The values are hard-coded here; they were originally derived as:
#target_mean=train_df.target.mean()
#target_std=train_df.target.std()

target_mean=-0.9625387984618096
target_std= 1.0382744351056232
# Labels corrected from the misspelled "Taget".
print("Target Mean:", target_mean)
print("Target Std:", target_std)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset turning one dataframe row into model input tensors.

    Each row must provide ``masked_doc`` (sequence of word strings) and
    ``rare_words`` (sequence of word strings); when ``phase == 'train'`` it
    must also provide ``normalized_target``.

    Relies on the module-level ``MAX_SEQ_LEN``, ``vocab`` and
    ``glove_embeddings``.
    """

    def __init__(self, df, phase):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(X, rare_embedds)`` or, in the train phase, ``(X, rare_embedds, y)``.

        X: long tensor of length MAX_SEQ_LEN holding vocab ids, zero-padded.
        rare_embedds: float32 tensor of length 100, the mean GloVe vector of
            the row's rare words found in ``glove_embeddings`` (all zeros
            when none are found).
        y: float32 scalar tensor built from ``row.normalized_target``.
        """
        row = self.df.iloc[idx]
        doc_words = row.masked_doc
        rare_words = row.rare_words

        # Average the GloVe vectors of the rare words that have an embedding.
        rare_embedds = torch.zeros(100, dtype=torch.float32)
        with_vector = [word for word in rare_words if word in glove_embeddings]
        for word in with_vector:
            rare_embedds += torch.tensor(glove_embeddings[word], dtype=torch.float32)
        if with_vector:
            rare_embedds /= len(with_vector)

        # Encode at most MAX_SEQ_LEN words; unfilled positions stay 0.
        X = torch.zeros(MAX_SEQ_LEN, dtype=torch.long)
        for pos, word in enumerate(doc_words):
            if pos >= MAX_SEQ_LEN:
                break
            # Words outside the vocabulary fall back to the '<MASK>' id.
            X[pos] = vocab[word] if word in vocab else vocab['<MASK>']

        if self.phase != 'train':
            return (X, rare_embedds)
        y = torch.tensor(row.normalized_target, dtype=torch.float32)
        return (X, rare_embedds, y)

# Model

In [None]:
class ProjectionHead(nn.Module):
    """Two-layer MLP head: Linear -> BatchNorm1d -> Dropout -> ReLU -> Linear.

    Args:
        in_features: size of the input feature vector.
        out_feat: size of the output vector.

    The hidden width is fixed at 512 and the dropout probability at 0.2.
    """
    def __init__(self, in_features,out_feat):
        super().__init__()
        hidden_units=512
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys of any checkpoint saved from this class.
        self.linear1=nn.Linear(in_features, hidden_units)
        self.bn=nn.BatchNorm1d(hidden_units)
        self.dropout=nn.Dropout(0.2)
        self.relu=nn.ReLU()
        self.linear2=nn.Linear(hidden_units, out_feat)
    def forward(self, x):
        """Map ``x`` of shape (batch, in_features) to (batch, out_feat)."""
        hidden=self.bn(self.linear1(x))
        # Dropout is applied before the ReLU, as in the original layer order.
        hidden=self.relu(self.dropout(hidden))
        return self.linear2(hidden)

    
class MaskModelEmbedding(nn.Module):
    """Encode a sequence of vocab ids with a 2-layer bidirectional GRU.

    Args:
        embedd_size: dimension of the learned token embeddings.
        hidden_size: GRU hidden size per direction.

    The embedding table has ``len(vocab)`` rows (module-level ``vocab``) and
    index 0 is the padding index.
    """
    def __init__(self, embedd_size, hidden_size):
        super().__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding( len(vocab),embedd_size,padding_idx=0)
        self.gru=nn.GRU(embedd_size, hidden_size, num_layers=2, dropout=0.4, bidirectional=True, batch_first=True)
    def forward(self, x):
        """Return a (batch, 2*hidden_size) tensor for ids ``x`` of shape (batch, seq).

        The result concatenates the final hidden states of the top GRU layer:
        first direction, then second direction.
        """
        embedded=self.embedding(x)
        n_batch=embedded.shape[0]
        _, final_states=self.gru(embedded)
        # h_n is laid out as (num_layers * num_directions, batch, hidden);
        # split out the layer axis and keep only the last layer (index 1).
        top_layer=final_states.view(2, 2, n_batch, self.hidden_size)[1]
        return torch.cat([top_layer[0], top_layer[1]], dim=1)
    
class RareWordEmbedding(nn.Module):
    """Project a rare-word embedding vector through a small MLP.

    Layer order: Linear -> BatchNorm1d -> Dropout(0.2) -> ReLU -> Linear.

    Args:
        glove_embedd_dim: size of the input vector.
        out_dim1: width of the hidden layer.
        out_dim2: size of the output vector.
    """
    def __init__(self, glove_embedd_dim, out_dim1, out_dim2):
        super().__init__()
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys of any checkpoint saved from this class.
        self.linear1=nn.Linear(glove_embedd_dim, out_dim1)
        self.bn=nn.BatchNorm1d(out_dim1)
        self.dropout=nn.Dropout(0.2)
        self.relu=nn.ReLU()
        self.linear2=nn.Linear(out_dim1, out_dim2)
    def forward(self, x):
        """Map ``x`` of shape (batch, glove_embedd_dim) to (batch, out_dim2)."""
        stages=(self.linear1, self.bn, self.dropout, self.relu, self.linear2)
        out=x
        for stage in stages:
            out=stage(out)
        return out


class Model(nn.Module):
    """Full regressor: sequence encoder + rare-word encoder + projection head.

    Args:
        mask_embedd_size: token embedding size for ``MaskModelEmbedding``.
        mask_hidden_size: GRU hidden size for ``MaskModelEmbedding``.
        glove_embedd_dim: input size for ``RareWordEmbedding``.
        rare_out_dim1: hidden width of ``RareWordEmbedding``.
        rare_out_dim2: output size of ``RareWordEmbedding``.

    Note: ``self.bn``, ``self.relu`` and ``self.dropout`` are registered but
    not used in ``forward``; they are kept so the module structure (and any
    saved checkpoint) stays unchanged.
    """
    def __init__(self, mask_embedd_size, mask_hidden_size, glove_embedd_dim, rare_out_dim1, rare_out_dim2):
        super().__init__()
        self.mask_embedding=MaskModelEmbedding(mask_embedd_size, mask_hidden_size)
        self.rare_embedding=RareWordEmbedding(glove_embedd_dim, rare_out_dim1, rare_out_dim2)

        # Width of the concatenated [sequence features, rare-word features].
        fused_dim=2*mask_hidden_size + rare_out_dim2
        self.bn=nn.BatchNorm1d(fused_dim)
        self.relu=nn.ReLU()
        self.dropout=nn.Dropout(0.2)
        self.proj_head=ProjectionHead(fused_dim, 1)
    def forward(self, x, rareEmbedding):
        """Return the projection head's output for token ids ``x`` and ``rareEmbedding``."""
        fused=torch.cat(
            (self.mask_embedding(x), self.rare_embedding(rareEmbedding)),
            dim=1,
        )
        return self.proj_head(fused)

In [None]:
# Load the five saved baseline models (model_1.pt ... model_5.pt).
# NOTE(review): torch.load unpickles the files, so they must come from a
# trusted source; the loaded objects are called like nn.Modules in infer(),
# which presumably requires the class definitions above to be in scope.
# map_location='cpu' keeps the weights on the CPU regardless of the device
# they were saved from: infer() feeds CPU tensors to the models, and without
# it a GPU-saved checkpoint fails to load on a CPU-only machine.
models=[
    torch.load(f'../input/model-baseline/model_{model_idx}.pt', map_location='cpu')
    for model_idx in range(1, 6)
]


In [None]:
def infer(dataloader):
    """Predict targets for every batch by averaging the models in ``models``.

    Args:
        dataloader: iterable yielding ``(X, rareEmbedd)`` batches.

    Returns:
        A flat list with one prediction per sample, in batch order. Each
        model's output is mapped to the target scale with
        ``target_std * y_hat + target_mean`` before the ensemble average.
    """
    # Put every model in evaluation mode once, up front.
    for model in models:
        model.eval()

    preds=[]
    with torch.no_grad():
        for X, rareEmbedd in dataloader:
            y=torch.zeros(X.shape[0])
            for model in models:
                y_hat=model(X, rareEmbedd).view(-1)
                y+=(target_std*y_hat) + target_mean
            y/=len(models)
            # Iterating the array keeps the per-element NumPy scalar type.
            preds.extend(y.numpy())
    return preds

In [None]:
# Wrap the test frame, batch it in its original row order (shuffle=False so
# predictions line up with test_df rows), and store the ensemble predictions.
infer_test_dataset = Dataset(test_df, 'test')
infer_test_dataloader = torch.utils.data.DataLoader(
    infer_test_dataset,
    batch_size=200,
    shuffle=False,
)
test_df['target'] = infer(infer_test_dataloader)

In [None]:
# Keep only the two columns written to the submission file; .copy() detaches
# the result from test_df.
submission_columns = ['id', 'target']
submission_df = test_df[submission_columns].copy()
submission_df.head()

In [None]:
# Write the predictions without the dataframe index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)