In [None]:
import os
import re
import time
import string
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn

import seaborn as sns

from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


from collections import defaultdict

# Load GloVe 100-d vectors

In [None]:
glove_path='../input/glove6b100dtxt/glove.6B.100d.txt'


def load_glove_embeddings(path):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-empty line is split on whitespace; the first field is used as
    the word and the remaining fields are converted to floats.

    Args:
        path: Location of the embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D ``float64`` NumPy array.
    """
    embeddings = {}
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(path, encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # A blank line has no word to index; skip it instead of
                # failing on fields[0].
                continue
            # np.float (an alias of the builtin float) was removed in
            # NumPy 1.24; np.float64 is the same dtype.
            embeddings[fields[0]] = np.array(fields[1:]).astype(np.float64)
    return embeddings


glove_embeddings = load_glove_embeddings(glove_path)
print(len(glove_embeddings))

In [None]:
class Tokenizer:
    """Callable that turns raw text into lower-cased, punctuation-free words."""

    # Translation table that maps every character of string.punctuation to a
    # single space.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self):
        # The lemmatizer is constructed here but is not used by __call__.
        self.lemmatizer = WordNetLemmatizer()
        # spaCy English pipeline object; __call__ iterates over the tokens it
        # produces for a document.
        self.nlp = English()

    def __call__(self, doc):
        """Tokenise ``doc`` and return a flat list of cleaned words.

        Tokens flagged ``like_num`` and tokens with empty text are dropped.
        Each remaining token is lower-cased and stripped, its punctuation is
        replaced by spaces, and the result is split on spaces, so one token
        can contribute several words (or none).
        """
        words = []
        for tok in self.nlp(doc):
            if tok.like_num or tok.text == '':
                continue
            cleaned = tok.lower_.strip().translate(self._PUNCT_TO_SPACE)
            words.extend(piece for piece in cleaned.split(' ') if piece)
        return words

Let us consider all the words that appear in at least 5 documents.

# Configuration

In [None]:
# Number of token positions kept per document: Dataset.__getitem__ allocates
# this many rows and ignores tokens at or beyond this index.
MAX_SEQ_LEN = 150

# NOTE(review): not referenced by the cells shown in this notebook (the
# inference DataLoader passes its own batch_size=200) -- confirm whether it is
# still needed.
BATCH_SIZE = 128

In [None]:
# Hard-coded statistics of the training target; per the original inline notes
# they were obtained as train_df.target.mean() and train_df.target.std().
# infer() uses them to map model outputs back to the target scale.
target_mean = -0.9625387984618096
target_std = 1.0382744351056232

print("Target Mean:", target_mean)
print("Target Std:", target_std)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Serves each document as a fixed-size matrix of word vectors.

    Args:
        df: DataFrame with a ``doc`` column holding each document's tokens
            and, when ``phase == 'train'``, a ``normalized_target`` column.
        phase: ``'train'`` makes every item an ``(X, y)`` pair; any other
            value makes every item just ``X``.
        embeddings: Optional ``{word: vector}`` mapping. When omitted, the
            notebook-level ``glove_embeddings`` is looked up at item access.
        max_seq_len: Optional number of token positions per document. When
            omitted, the notebook-level ``MAX_SEQ_LEN`` is looked up at item
            access.
        embed_dim: Length of each word vector (defaults to 100, the width
            previously hard-coded here).
    """

    def __init__(self, df, phase, embeddings=None, max_seq_len=None, embed_dim=100):
        self.df = df
        self.phase = phase
        self.embeddings = embeddings
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim

    def __getitem__(self, idx):
        """Return ``X`` (or ``(X, y)`` in the train phase) for row ``idx``.

        ``X`` is a ``float32`` tensor of shape ``(max_seq_len, embed_dim)``.
        Row ``i`` holds the vector of the i-th token; tokens missing from the
        embeddings and positions past the end of the document stay zero.
        """
        row = self.df.iloc[idx]

        # Resolve the notebook-level defaults lazily so that, as before, the
        # globals only need to exist when an item is actually requested.
        embeddings = glove_embeddings if self.embeddings is None else self.embeddings
        max_seq_len = MAX_SEQ_LEN if self.max_seq_len is None else self.max_seq_len

        X = torch.zeros((max_seq_len, self.embed_dim), dtype=torch.float32)
        # zip with range() stops after max_seq_len tokens (truncation).
        for i, word in zip(range(max_seq_len), row.doc):
            if word in embeddings:
                X[i] = torch.tensor(embeddings[word], dtype=torch.float32)

        if self.phase == 'train':
            y = torch.tensor(row.normalized_target, dtype=torch.float32)
            return (X, y)
        return X

    def __len__(self):
        """Number of documents (rows of ``df``)."""
        return len(self.df)

# Model

In [None]:
class ProjectionHead(nn.Module):
    """Two-layer head: Linear -> BatchNorm1d -> Dropout -> ReLU -> Linear.

    Args:
        in_features: Width of the incoming feature vectors.
        out_feat: Width of the output vectors.
    """

    def __init__(self, in_features,out_feat):
        super().__init__()
        hidden_width = 512
        # Sub-module attribute names are kept as-is: they are the names under
        # which the layers are registered on the module.
        self.linear1 = nn.Linear(in_features, hidden_width)
        self.bn = nn.BatchNorm1d(hidden_width)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_width, out_feat)

    def forward(self, x):
        """Apply the head to ``x`` and return the projected features."""
        hidden = self.relu(self.dropout(self.bn(self.linear1(x))))
        return self.linear2(hidden)

class Model(nn.Module):
    """Bidirectional two-layer GRU encoder followed by a ProjectionHead.

    Args:
        embedd_size: Size of each input feature vector fed to the GRU.
        hidden_size: GRU hidden size per direction.
    """

    def __init__(self, embedd_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            embedd_size,
            hidden_size,
            num_layers=2,
            dropout=0.2,
            bidirectional=True,
            batch_first=True,
        )
        self.bn = nn.BatchNorm1d(2 * hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.proj_head = ProjectionHead(2 * hidden_size, 1)

    def forward(self, x):
        """Encode a batch ``x`` (batch first) and return the head's output."""
        n_batch = x.shape[0]
        _, final_hidden = self.gru(x)

        # Split the stacked final states into (layer, direction, batch, hidden)
        # and keep only the last of the two layers.
        last_layer = final_hidden.view(2, 2, n_batch, self.hidden_size)[1]
        # Put the two directions side by side: (batch, 2 * hidden).
        features = torch.cat([last_layer[0], last_layer[1]], dim=1)

        features = self.dropout(self.relu(self.bn(features)))
        return self.proj_head(features)

# Load Models

In [None]:
# Load the five saved models (model_1.pt ... model_5.pt).
#
# SECURITY: torch.load unpickles the file contents, which can execute
# arbitrary code -- only load checkpoints from a trusted source.
#
# map_location='cpu' keeps every model on the CPU, matching infer(), which
# feeds the models CPU batches and accumulates results in a CPU tensor. It
# also lets checkpoints saved on a GPU load on a CPU-only machine.
#
# NOTE(review): newer PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules; pass weights_only=False there if these
# files hold whole models -- confirm against the installed version.
models=[
    torch.load(f'../input/commonlit-glove-model/model_{model_idx}.pt', map_location='cpu')
    for model_idx in range(1, 6)
]

In [None]:
def infer(models, dataloader, mean=None, std=None):
    """Average the de-normalised predictions of several models.

    Each model output ``y`` is mapped back to the target scale as
    ``std * y + mean``; the per-model results are then averaged.

    Args:
        models: Non-empty sequence of modules. Each one is switched to eval
            mode and called on every batch.
        dataloader: Iterable yielding input batches ``X`` whose first
            dimension is the batch size.
        mean: Offset added after scaling. When omitted, the notebook-level
            ``target_mean`` is used.
        std: Scale applied to the raw model output. When omitted, the
            notebook-level ``target_std`` is used.

    Returns:
        list with one averaged prediction per input row, in the order the
        dataloader yields them.

    Raises:
        ValueError: If ``models`` is empty (the average would be undefined).
    """
    if len(models) == 0:
        raise ValueError("infer() needs at least one model")

    # Fall back to the notebook-level statistics only when not given.
    mean = target_mean if mean is None else mean
    std = target_std if std is None else std

    # Switching to eval mode does not depend on the batch, so do it once.
    for model in models:
        model.eval()

    preds = []
    with torch.no_grad():
        for X in dataloader:
            y_hat = torch.zeros(X.shape[0])
            for model in models:
                # view(-1) flattens the model output to one value per row.
                y_hat += std * model(X).view(-1) + mean
            preds.extend(y_hat.numpy() / len(models))
    return preds

# Submission

In [None]:
tokenizer = Tokenizer()

# Read the test set and attach a `doc` column holding each excerpt's tokens.
test_df = (
    pd.read_csv('../input/commonlitreadabilityprize/test.csv')
    .assign(doc=lambda frame: frame['excerpt'].apply(tokenizer))
)
test_df.head()

In [None]:
# Wrap the tokenised test set ('test' phase -> items are inputs only) and
# iterate it without shuffling so predictions stay aligned with test_df rows.
infer_test_dataset = Dataset(df=test_df, phase='test')
infer_test_dataloader = torch.utils.data.DataLoader(
    dataset=infer_test_dataset,
    batch_size=200,
    shuffle=False,
)
test_df['target'] = infer(models=models, dataloader=infer_test_dataloader)


In [None]:
# Keep only the two columns written to the submission file; .copy() detaches
# the result from test_df.
submission_columns = ['id', 'target']
submission_df = test_df.loc[:, submission_columns].copy()
submission_df.head()

In [None]:
# Write the predictions to submission.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
submission_df.to_csv('submission.csv', index=False)