In [None]:
import os
import re
import time
import string
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn

import seaborn as sns

from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


from collections import defaultdict

Reference
1. https://www.kaggle.com/narendra/commonlit-baseline-by-masking-words-grus/output#Submission

In [None]:
# Load the train and test CSVs. Later cells read the `excerpt` column of both
# frames, the `target` column of the training frame, and the `id` column of the
# test frame.
train_df=pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df=pd.read_csv('../input/commonlitreadabilityprize/test.csv')


In [None]:
class Tokenizer:
    """Callable that turns a raw text into a flat list of lowercase word pieces.

    The text is first split by spaCy's blank English pipeline. Tokens that
    look like numbers (``like_num``) or have empty text are dropped. Each
    remaining token is lowercased and stripped, every character in
    ``string.punctuation`` is replaced by a space, and the result is split on
    single spaces with empty pieces discarded.

    ``self.lemmatizer`` is created in ``__init__`` but is not used by
    ``__call__``.
    """
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.nlp = English()

    def __call__(self, doc):
        """Return the list of cleaned word pieces extracted from ``doc``."""
        # One translation table maps every punctuation character to a space.
        punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        pieces = []
        for tok in self.nlp(doc):
            if not tok.like_num and tok.text != '':
                cleaned = tok.lower_.strip().translate(punct_to_space)
                # A token such as "well-known" yields several pieces here.
                pieces.extend(part for part in cleaned.split(' ') if part != '')
        return pieces

In [None]:
# Build the tokenizer once and reuse it for both splits.
tokenizer = Tokenizer()

# Store the token list of every excerpt in a new `doc` column.
for frame in (train_df, test_df):
    frame['doc'] = frame['excerpt'].apply(tokenizer)

In [None]:
# Document frequency: for each word, the number of training documents that
# contain it at least once. Iterating over set(doc) makes repeated occurrences
# inside a single document count only once.
word_doc_freq=defaultdict(int)
for doc in train_df.doc.values:
    for word in set(doc):
        word_doc_freq[word]+=1

print("Number Of Unique Words:", len(word_doc_freq))

In [None]:
def get_masked_doc(doc, doc_freq=None, min_doc_freq=5):
    """Replace rare words in a tokenized document with the '<MASK>' token.

    Args:
        doc: iterable of word strings.
        doc_freq: mapping from word to document frequency. Defaults to the
            module-level ``word_doc_freq`` table.
        min_doc_freq: a word is kept only if its document frequency is at
            least this value; words missing from ``doc_freq`` count as 0.

    Returns:
        A new list with one entry per input word: the word itself when it is
        frequent enough, otherwise '<MASK>'.
    """
    if doc_freq is None:
        doc_freq = word_doc_freq
    # .get() is used instead of indexing so that looking up an unseen word
    # does not insert a zero entry when doc_freq is a defaultdict.
    return [
        word if doc_freq.get(word, 0) >= min_doc_freq else '<MASK>'
        for word in doc
    ]

In [None]:
# Apply rare-word masking to both splits and keep the result in `masked_doc`.
for frame in (train_df, test_df):
    frame['masked_doc'] = frame['doc'].apply(get_masked_doc)

# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Word -> integer id. Id 0 is reserved for the padding token; every other word
# receives the next free id in order of first appearance in the training docs.
vocab = {'<PAD>': 0}
for masked_doc in train_df.masked_doc:
    for word in masked_doc:
        # len(vocab) is evaluated before insertion, so it is the next free id.
        vocab.setdefault(word, len(vocab))
# Next unused id, which always equals the vocabulary size.
vocab_id = len(vocab)
print("Number Of words in Vocabulary:", len(vocab))

# Model

In [None]:
class ProjectionHead(nn.Module):
    """Two-layer head: Linear -> BatchNorm1d -> Dropout -> ReLU -> Linear.

    Args:
        in_features: size of each input feature vector.
        out_feat: size of each output vector.

    The hidden width is fixed at 256 and the dropout probability at 0.2.
    """
    def __init__(self, in_features,out_feat):
        super().__init__()
        width = 256
        self.linear1 = nn.Linear(in_features, width)
        self.bn = nn.BatchNorm1d(width)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(width, out_feat)

    def forward(self, x):
        """Map ``x`` of shape (batch, in_features) to (batch, out_feat)."""
        hidden = self.bn(self.linear1(x))
        # Dropout is applied before the ReLU, in that order.
        hidden = self.relu(self.dropout(hidden))
        return self.linear2(hidden)
    
class Model(nn.Module):
    """Embedding + two-layer bidirectional GRU + projection head regressor.

    Args:
        embedd_size: dimensionality of the word embeddings.
        hidden_size: hidden size of the GRU (per direction).

    The embedding table is sized from the module-level ``vocab`` at
    construction time, with index 0 as the padding index.
    """
    def __init__(self, embedd_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(len(vocab), embedd_size, padding_idx=0)
        self.gru = nn.GRU(
            embedd_size,
            hidden_size,
            num_layers=2,
            dropout=0.4,
            bidirectional=True,
            batch_first=True,
        )
        self.bn = nn.BatchNorm1d(2 * hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.proj_head = ProjectionHead(2 * hidden_size, 1)

    def forward(self, x):
        """Return one prediction per row of ``x`` (a batch of token-id rows)."""
        embedded = self.embedding(x)
        n_rows = embedded.shape[0]
        _, final_states = self.gru(embedded)

        # Regroup the final hidden states as (layer, direction, batch, hidden)
        # and keep only the last layer; the 2, 2 match num_layers=2 and the
        # two directions configured in __init__.
        top_layer = final_states.view(2, 2, n_rows, self.hidden_size)[1]
        # Concatenate both directions of the last layer -> (batch, 2*hidden).
        features = torch.cat([top_layer[0], top_layer[1]], dim=1)
        features = self.dropout(self.relu(self.bn(features)))

        return self.proj_head(features)

# Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Turns rows of a frame with a ``masked_doc`` column into fixed-length id tensors.

    Args:
        df: frame whose rows provide ``masked_doc`` (and ``normalized_target``
            when ``phase`` is 'train').
        phase: 'train' makes ``__getitem__`` also return the target; any other
            value returns inputs only.

    Relies on the module-level ``MAX_SEQ_LEN`` and ``vocab``.
    """
    def __init__(self, df, phase):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(X, mask)`` or, in the 'train' phase, ``(X, mask, y)``."""
        record = self.df.iloc[idx]
        words = record.masked_doc

        # True on positions that hold a real token, False on padding.
        mask = torch.arange(MAX_SEQ_LEN) < len(words)

        # Only the first MAX_SEQ_LEN words are kept; words missing from the
        # vocabulary share the '<MASK>' id.
        ids = [
            vocab[w] if w in vocab else vocab['<MASK>']
            for w in words[:MAX_SEQ_LEN]
        ]
        # Remaining positions stay 0, the '<PAD>' id.
        X = torch.zeros(MAX_SEQ_LEN, dtype=torch.long)
        X[:len(ids)] = torch.tensor(ids, dtype=torch.long)

        if self.phase != 'train':
            return (X, mask)
        y = torch.tensor(record.normalized_target, dtype=torch.float32)
        return (X, mask, y)

# Configuration

In [None]:
# Fixed sequence length: Dataset pads or truncates every document to this many token ids.
MAX_SEQ_LEN=180
# NOTE(review): not referenced by the cells visible here; the inference
# DataLoader passes its own batch_size — confirm whether this is still needed.
BATCH_SIZE=128
# Number of entries in the vocabulary, including the '<PAD>' entry.
vocab_len=len(vocab)

In [None]:
def infer(model, dataloader, mean=None, std=None):
    """Run ``model`` over ``dataloader`` and return de-normalized predictions.

    Args:
        model: module called as ``model(X)``; its output is flattened to 1-D.
        dataloader: iterable yielding ``(X, mask)`` batches. ``mask`` is
            unpacked but not passed to the model.
        mean: value added to the scaled outputs. Defaults to the module-level
            ``target_mean``.
        std: factor the raw outputs are multiplied by. Defaults to the
            module-level ``target_std``.

    Returns:
        A flat list with one prediction per input row, in dataloader order.
    """
    if mean is None:
        mean = target_mean
    if std is None:
        std = target_std

    # Feed the inputs on whatever device the model's parameters live on, so
    # the function works for both CPU and CUDA models. Parameter-free models
    # receive the batches unchanged.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    preds = []
    model.eval()
    with torch.no_grad():
        for (X, mask) in dataloader:
            if device is not None:
                X = X.to(device)
            y_hat = model(X).view(-1)
            # Undo the target normalization.
            y_hat = (std * y_hat) + mean
            # .cpu() is required before .numpy() when the model runs on CUDA.
            preds += list(y_hat.cpu().numpy())
    return preds

In [None]:
# Wrap the test frame for inference; with phase 'test' each item is (X, mask)
# without a target.
infer_test_dataset=Dataset(test_df, 'test')
# shuffle=False keeps the predictions in the same order as the rows of test_df.
infer_test_dataloader=torch.utils.data.DataLoader(infer_test_dataset, batch_size=200, shuffle=False)

# Mean and standard deviation of the raw training target; infer() uses them to
# map model outputs back to the target scale.
target_mean=train_df.target.mean()
target_std=train_df.target.std()

print("Target Mean:", target_mean)
print("Target Std:", target_std)

In [None]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints that come from a trusted source.
# The batches produced by the DataLoader are CPU tensors, so the model is
# mapped onto the CPU regardless of the device it was saved from.
model=torch.load('../input/commonlitmodel1/model.pt', map_location='cpu')
test_df['target'] = infer(model, infer_test_dataloader)

In [None]:
# Keep only the two columns written to the submission, as an independent copy.
submission_df = test_df.loc[:, ['id', 'target']].copy()
submission_df.head()

In [None]:
# Write the predictions to submission.csv; index=False leaves the DataFrame
# index out of the file.
submission_df.to_csv('submission.csv', index=False)