In [None]:
# Install textstat from local wheel files under ../input (no index lookup needed for these two wheels).
# Pyphen is installed first — presumably because textstat depends on it; confirm against the wheel metadata.
!pip install "../input/textstat/Pyphen-0.10.0-py3-none-any.whl"
!pip install "../input/textstat/textstat-0.7.0-py3-none-any.whl"

In [None]:
import os
import re
import time
import string
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import textstat

import torch
import torch.nn as nn

import seaborn as sns

from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.preprocessing import StandardScaler

from collections import defaultdict

In [None]:
# Blank English pipeline shared by every feature helper below (they all call the module-level `nlp`).
nlp=English()
stop_words=nlp.Defaults.stop_words

# The sentencizer is added so that `doc.sents` is available for sentence counting.
# NOTE(review): `add_pipe(create_pipe(...))` is the spaCy v2 calling convention; spaCy v3 expects
# `nlp.add_pipe('sentencizer')` — confirm which spaCy version this notebook runs against.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
print(nlp.pipe_names)

In [None]:
# Load the training split; later cells read its `excerpt` column to fit the feature scaler.
train_df=pd.read_csv('../input/commonlitreadabilityprize/train.csv')


train_df.head()

In [None]:
# Names of the engineered columns that are standardised by `scaler` and fed to the
# tabular branch of the model (see Dataset.__getitem__). The order of this list fixes the
# order of the tabular input vector. The spelling 'word_redundancy_witout_stopwords'
# matches the column name created in get_hand_engineered_feautures.
train_feature_columns=[
    'sentence_count', 'word_count', 'distinct_word_count', 'words_per_sentence',
    'stopword_count', 'word_count_without_stopword', 'distinct_word_count_without_stopword',
    'title_words_count', 'distinct_title_words_count','title_word_per_sentence',
    'word_redundancy', 'stopword_redundancy', 'word_redundancy_witout_stopwords','words_per_punctuation',
    '0syllable_no_stop', '0syllable_no_stop_proportion','1syllable_no_stop', '1syllable_no_stop_proportion', 
    '2syllable_no_stop', '2syllable_no_stop_proportion','3syllable_no_stop','3syllable_no_stop_proportion', 
    '4syllable_no_stop','4syllable_no_stop_proportion','>=5syllable_no_stop', '>=5syllable_no_stop_proportion'
]

# Helper Functions

In [None]:
def get_sentence_count(excerpt):
    """Return how many sentences the module-level `nlp` pipeline finds in `excerpt`."""
    sentences = nlp(excerpt).sents
    return sum(1 for _ in sentences)

def get_word_count(excerpt):
    """Count the tokens of `excerpt` that are neither number-like nor punctuation."""
    return sum(
        1
        for tok in nlp(excerpt)
        if not (tok.like_num or tok.is_punct)
    )
    
def get_unique_word_count(excerpt):
    """Count distinct lower-cased tokens of `excerpt`, ignoring number-like and punctuation tokens."""
    vocabulary = {
        tok.lower_
        for tok in nlp(excerpt)
        if not (tok.like_num or tok.is_punct)
    }
    return len(vocabulary)

def get_word_count_without_stopword(excerpt):
    """Count tokens of `excerpt` that are not number-like, not punctuation and not stop words."""
    content_tokens = [
        tok
        for tok in nlp(excerpt)
        if not (tok.like_num or tok.is_punct or tok.is_stop)
    ]
    return len(content_tokens)

def get_distinct_word_count_without_stopword(excerpt):
    """Count distinct lower-cased tokens of `excerpt` after dropping numbers, punctuation and stop words."""
    content_vocabulary = {
        tok.lower_
        for tok in nlp(excerpt)
        if not (tok.like_num or tok.is_punct or tok.is_stop)
    }
    return len(content_vocabulary)


def get_stopword_count(excerpt):
    """Return the number of tokens in `excerpt` that spaCy flags as stop words."""
    return sum(1 for tok in nlp(excerpt) if tok.is_stop)

def get_unique_stopword_count(excerpt):
    """Return the number of distinct stop words in `excerpt`.

    Stop words are compared by their lower-cased text, the same key that
    get_unique_word_count uses, so "The" and "the" count once.

    The set must hold strings rather than Token objects: spaCy tokens hash by
    (doc, position), so a set of tokens never deduplicates and its size is
    simply the total stop-word count.
    """
    word_set=set()
    for word in nlp(excerpt):
        if word.is_stop:
            word_set.add(word.lower_)
    return len(word_set)
    
def get_punctuation_count(excerpt):
    """Return the number of punctuation tokens in `excerpt`."""
    punctuation_tokens = [tok for tok in nlp(excerpt) if tok.is_punct]
    return len(punctuation_tokens)

def get_title_word_count(excerpt):
    """Return the number of tokens in `excerpt` whose `is_title` flag is set."""
    return sum(1 for tok in nlp(excerpt) if tok.is_title)

def get_unique_title_word_count(excerpt):
    """Return the number of distinct token texts in `excerpt` whose `is_title` flag is set."""
    title_texts = {tok.text for tok in nlp(excerpt) if tok.is_title}
    return len(title_texts)

def get_capital_word_count(excerpt):
    """Return the number of tokens in `excerpt` whose `is_upper` flag is set."""
    upper_tokens = [tok for tok in nlp(excerpt) if tok.is_upper]
    return len(upper_tokens)

def get_unique_capital_word_count(excerpt):
    """Return the number of distinct all-uppercase tokens in `excerpt`.

    Tokens are compared by their text, mirroring get_unique_title_word_count.
    Storing the Token objects themselves would not deduplicate anything,
    because spaCy tokens hash by (doc, position) rather than by content.
    """
    words=set()
    for word in nlp(excerpt):
        if word.is_upper:
            words.add(word.text)
    return len(words)

def get_syllable_counts(excerpt):
    """Build a histogram {syllable count: number of tokens} for `excerpt`.

    Number-like and punctuation tokens are left out. The result is a
    defaultdict(int), so looking up an absent syllable count yields 0.
    """
    histogram = defaultdict(int)
    countable = (
        tok for tok in nlp(excerpt)
        if not tok.like_num and not tok.is_punct
    )
    for tok in countable:
        n_syllables = textstat.syllable_count(tok.text)
        histogram[n_syllables] += 1
    return histogram

def get_syllable_count_without_stop(excerpt):
    """Build a histogram {syllable count: number of tokens} over the non-stop-word tokens of `excerpt`.

    Number-like tokens, punctuation and stop words are left out. The result
    is a defaultdict(int), so looking up an absent syllable count yields 0.
    """
    histogram = defaultdict(int)
    countable = (
        tok for tok in nlp(excerpt)
        if not tok.like_num and not tok.is_punct and not tok.is_stop
    )
    for tok in countable:
        n_syllables = textstat.syllable_count(tok.text)
        histogram[n_syllables] += 1
    return histogram

In [None]:
def get_hand_engineered_feautures(df):
    """Add the hand-engineered readability columns to `df` and return it.

    `df` must have an `excerpt` column. The columns are written onto the
    frame that is passed in (no copy is made), in this order: raw counts,
    redundancy / proportion ratios, the two syllable histograms, and then
    per-bucket syllable counts and proportions (0-4 and >=5 syllables), first
    over all words and then over non-stop-words.
    """
    text = df.excerpt

    # Raw per-excerpt counts: (column name, helper applied to each excerpt).
    count_helpers = (
        ('sentence_count', get_sentence_count),
        ('word_count', get_word_count),
        ('distinct_word_count', get_unique_word_count),
        ('stopword_count', get_stopword_count),
        ('distinct_stopword_count', get_unique_stopword_count),
        ('word_count_without_stopword', get_word_count_without_stopword),
        ('distinct_word_count_without_stopword', get_distinct_word_count_without_stopword),
        ('punctuation_counts', get_punctuation_count),
        ('title_words_count', get_title_word_count),
        ('distinct_title_words_count', get_unique_title_word_count),
        ('capital_word_count', get_capital_word_count),
        ('distinct_capital_word_count', get_unique_capital_word_count),
    )
    for column, helper in count_helpers:
        df[column] = text.apply(helper)

    # Ratios derived from the counts above.
    df['word_redundancy'] = 1 - df['distinct_word_count'] / df['word_count']
    df['stopword_redundancy'] = 1 - df['stopword_count'] / df['word_count']
    df['word_redundancy_witout_stopwords'] = (
        1 - df['distinct_word_count_without_stopword'] / df['word_count_without_stopword']
    )

    df['title_word_proportion'] = df['title_words_count'] / df['word_count']
    df['title_word_per_sentence'] = df['title_words_count'] / df['sentence_count']

    df['capital_word_proportaion'] = df['capital_word_count'] / df['word_count']

    df['words_per_punctuation'] = df['word_count'] / df['punctuation_counts']
    df['words_per_sentence'] = df['word_count'] / df['sentence_count']

    # Syllable histograms (defaultdict per row), with and without stop words.
    df['syllable_freq'] = text.apply(get_syllable_counts)
    df['syllable_without_stop_freq'] = text.apply(get_syllable_count_without_stop)

    # (histogram column, column-name suffix, denominator column)
    histogram_specs = (
        ('syllable_freq', '', 'word_count'),
        ('syllable_without_stop_freq', '_no_stop', 'word_count_without_stopword'),
    )
    for freq_col, suffix, total_col in histogram_specs:
        # Exact buckets for 0..4 syllables.
        for k in range(5):
            bucket = f'{k}syllable{suffix}'
            df[bucket] = df[freq_col].apply(lambda freq, k=k: freq[k])
            df[f'{bucket}_proportion'] = df[bucket] / df[total_col]

        # Everything that is left over after the 0..4 buckets counts as ">=5".
        bucket = f'>=5syllable{suffix}'
        df[bucket] = df[freq_col].apply(
            lambda freq: sum(freq.values()) - freq[0] - freq[1] - freq[2] - freq[3] - freq[4]
        )
        df[f'{bucket}_proportion'] = df[bucket] / df[total_col]

    return df

In [None]:
%%time
# Compute the engineered columns for the training split; the function writes them onto the
# frame it receives and returns that same frame, so train_df now carries the new columns.
train_df=get_hand_engineered_feautures(train_df)
train_df.head()

In [None]:
# Fit the standardiser on the training features. The fitted `scaler` is a module-level
# object that Dataset.__getitem__ reuses to transform each row at inference time.
scaler=StandardScaler()
X_train=scaler.fit_transform(train_df[train_feature_columns])
print(X_train.shape)

# Load GloVe 100-d vectors

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is a word followed by its
# space-separated vector components.
glove_path='../input/glove6b100dtxt/glove.6B.100d.txt'
glove_embeddings={}
# Read as UTF-8 explicitly so parsing does not depend on the platform's default encoding.
with open(glove_path, encoding='utf-8') as file:
    for line in file:
        fields=line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word=fields[0]
        # np.float64 is the same 64-bit type the removed `np.float` alias referred to
        # (the alias was deprecated in NumPy 1.20 and removed in 1.24).
        glove_embeddings[word]=np.asarray(fields[1:], dtype=np.float64)
print(len(glove_embeddings))

# Tokenizer

In [None]:
class Tokenizer:
    """Callable that turns a raw document string into a flat list of lower-cased word pieces.

    Number-like tokens are dropped, every `string.punctuation` character is
    treated as a separator, and empty pieces are discarded.
    """

    def __init__(self):
        # The lemmatizer is created here but not used by __call__.
        self.lemmatizer=WordNetLemmatizer()
        self.nlp=English()

    def __call__(self, doc):
        """Tokenise `doc` with `self.nlp` and return the cleaned word pieces as a list of str."""
        # Map each punctuation character to a single space in one pass.
        punct_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})

        pieces = []
        for tok in self.nlp(doc):
            if tok.like_num or not tok.text:
                continue
            cleaned = tok.lower_.strip().translate(punct_to_space)
            # Split on the literal space only, then drop the empty fragments it leaves behind.
            pieces.extend(part for part in cleaned.split(' ') if part)
        return pieces

In [None]:
tokenizer=Tokenizer()
train_df['doc']=train_df.excerpt.apply(tokenizer)
train_df.head()

In [None]:
# Hard-coded target statistics; the trailing comments show the train_df expressions they
# stand in for. infer() uses them to map model outputs back to the original target scale.
target_mean=-0.9625387984618096#train_df.target.mean()
target_std=1.0382744351056232#train_df.target.std()

print("Taget Mean:", target_mean)
print("Taget Std:", target_std)

# Sequence Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding a GloVe sequence tensor and a scaled tabular vector per row.

    Relies on the module-level `glove_embeddings`, `scaler` and
    `train_feature_columns`. Each row of `df` must provide a `doc` token list
    and the engineered feature columns; in the 'train' / 'val' phases it must
    also provide `normalized_target`.
    """

    def __init__(self, df, max_seq_len, phase):
        self.df=df
        self.MAX_SEQ_LEN=max_seq_len
        self.phase=phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return (X1, X2, y) for 'train'/'val', otherwise (X1, X2).

        X1 is a (MAX_SEQ_LEN, 100) float32 tensor of word vectors, zero-filled
        for out-of-vocabulary words and for positions past the end of the doc.
        X2 is the flattened, standardised feature vector.
        """
        record = self.df.iloc[idx]

        # Sequence branch: only the first MAX_SEQ_LEN tokens are looked up.
        X1 = torch.zeros((self.MAX_SEQ_LEN, 100), dtype=torch.float32)
        for pos, word in enumerate(record.doc[:self.MAX_SEQ_LEN]):
            vector = glove_embeddings.get(word)
            if vector is not None:
                X1[pos] = torch.tensor(vector)

        # Tabular branch: the scaler expects a 2-D array, hence the single-row reshape.
        features = record[train_feature_columns].values.reshape(1, -1)
        X2 = torch.tensor(scaler.transform(features), dtype=torch.float32).view(-1)

        if self.phase in ('train', 'val'):
            y = torch.tensor(record.normalized_target, dtype=torch.float32)
            return (X1, X2, y)
        return (X1, X2)

# Sequence Model

In [None]:
class SequenceEmbedding(nn.Module):
    """Encode a batch of embedded sequences into a fixed-size vector.

    A 2-layer bidirectional GRU reads the sequence; the final hidden states of
    the top layer (both directions) are concatenated, passed through
    BatchNorm -> ReLU -> Dropout and projected to `projection_size`.
    """

    def __init__(self, embedd_size, hidden_size, projection_size):
        super().__init__()
        self.hidden_size=hidden_size
        self.gru=nn.GRU(
            embedd_size,
            hidden_size,
            num_layers=2,
            dropout=0.2,
            bidirectional=True,
            batch_first=True,
        )
        self.bn=nn.BatchNorm1d(2*hidden_size)
        self.relu=nn.ReLU()
        self.dropout=nn.Dropout(0.2)
        self.projection_layer=nn.Linear(2*hidden_size, projection_size)

    def forward(self, x):
        """Map x of shape (batch, seq_len, embedd_size) to (batch, projection_size)."""
        n_samples = x.shape[0]
        _, hidden = self.gru(x)

        # Regroup the final hidden states as (layer, direction, batch, hidden) and keep the top layer.
        top_layer = hidden.view(2, 2, n_samples, self.hidden_size)[1]
        # Concatenate the two directions along the feature axis -> (batch, 2 * hidden).
        features = torch.cat([top_layer[0], top_layer[1]], dim=1)

        features = self.dropout(self.relu(self.bn(features)))
        return self.projection_layer(features)

class SequenceModel(nn.Module):
    """Regression head on top of SequenceEmbedding: BatchNorm -> Dropout -> ReLU -> Linear(1)."""

    def __init__(self, embedd_size, hidden_size, projection_size):
        super().__init__()
        self.sequence_embedding=SequenceEmbedding(embedd_size, hidden_size, projection_size)
        self.bn=nn.BatchNorm1d(projection_size)
        self.dropout=nn.Dropout(0.2)
        self.relu=nn.ReLU()
        self.out_layer=nn.Linear(projection_size, 1)

    def forward(self, x):
        """Return one prediction per sample, shape (batch, 1)."""
        embedded = self.sequence_embedding(x)
        activated = self.relu(self.dropout(self.bn(embedded)))
        return self.out_layer(activated)

# Tabular Model

In [None]:
class TabularEmbedding(nn.Module):
    """Embed a tabular feature vector: Linear(in_feat, 128) -> BatchNorm -> Dropout -> ReLU -> Linear(128, 64)."""

    def __init__(self, in_feat):
        super().__init__()
        self.linear1=nn.Linear(in_feat, 128)
        self.bn1=nn.BatchNorm1d(128)
        self.dropout1=nn.Dropout(0.4)
        self.relu1=nn.ReLU()
        self.linear2=nn.Linear(128, 64)

    def forward(self, x):
        """Map x of shape (batch, in_feat) to an embedding of shape (batch, 64)."""
        hidden = self.relu1(self.dropout1(self.bn1(self.linear1(x))))
        return self.linear2(hidden)

class TabularModel(nn.Module):
    """Regression head on top of TabularEmbedding: BatchNorm -> Dropout -> ReLU -> Linear(64, 1)."""

    def __init__(self, in_feat):
        super().__init__()
        self.tabular_embedding=TabularEmbedding(in_feat)
        self.bn=nn.BatchNorm1d(64)
        self.dropout=nn.Dropout(0.5)
        self.relu=nn.ReLU()
        self.out=nn.Linear(64, 1)

    def forward(self, x):
        """Return one prediction per sample, shape (batch, 1)."""
        embedded = self.tabular_embedding(x)
        activated = self.relu(self.dropout(self.bn(embedded)))
        return self.out(activated)

# Model

In [None]:
class Model(nn.Module):
    """Fusion head over a frozen sequence embedding and a frozen tabular embedding.

    The two embedding modules are always run in eval mode and without
    gradient tracking; only the fusion layers defined here see gradients.
    """

    def __init__(self, sequence_embedding,tabular_embedding, seq_embedd_size, tab_embedd_size):
        super().__init__()
        self.sequence_embedding=sequence_embedding
        self.tabular_embedding=tabular_embedding

        fused_size = seq_embedd_size + tab_embedd_size

        self.bn1=nn.BatchNorm1d(fused_size)
        self.dropout1=nn.Dropout(0.5)
        self.relu1=nn.ReLU()
        self.linear1=nn.Linear(fused_size, 784)

        self.bn2=nn.BatchNorm1d(784)
        self.dropout2=nn.Dropout(0.5)
        self.relu2=nn.ReLU()
        self.linear2=nn.Linear(784, 1)

    def forward(self, x1, x2):
        """Return predictions of shape (batch, 1) from sequence input x1 and tabular input x2."""
        # Force the embedding branches into eval mode on every call, even if
        # the parent module was switched to train().
        for branch in (self.sequence_embedding, self.tabular_embedding):
            branch.eval()

        with torch.no_grad():
            h1 = self.sequence_embedding(x1)
            h2 = self.tabular_embedding(x2)
        fused = torch.cat([h1, h2], dim=1)

        fused = self.linear1(self.relu1(self.dropout1(self.bn1(fused))))
        fused = self.relu2(self.dropout2(self.bn2(fused)))

        return self.linear2(fused)

In [None]:
def infer(models, dataloader):
    """Return ensemble-averaged predictions for every sample yielded by `dataloader`.

    Each batch must unpack into (X1, X2). Every model's output is mapped back
    to the original target scale with the module-level `target_std` and
    `target_mean`, and the results are averaged over `models`. Predictions are
    returned as one flat list in dataloader order.
    """
    preds = []
    for X1, X2 in dataloader:
        batch_total = torch.zeros(X1.shape[0])
        for model in models:
            model.eval()
            with torch.no_grad():
                scaled = model(X1, X2).view(-1)
                batch_total += (target_std*scaled) + target_mean
        # Divide the summed predictions by the ensemble size to get the mean.
        preds.extend(batch_total.numpy()/len(models))
    return preds

In [None]:
# Load the five saved fold models.
# map_location='cpu' is used because infer() feeds CPU tensors to the models and never moves
# them to another device; without it, checkpoints saved from a GPU would either fail to load
# on a CPU-only machine or land on the GPU and hit a device mismatch during inference.
# NOTE(review): these files are unpickled as whole model objects (torch.load runs pickle),
# which can execute arbitrary code — only load checkpoints from a trusted source.
models=[
    torch.load(f'../input/gru-tabular-data/model_{fold}.pt', map_location='cpu')
    for fold in range(5)
]
models[0]

# Submission

In [None]:
# Number of token positions kept per document when building the sequence tensor.
MAX_SEQ_LEN=150
# NOTE(review): BATCH_SIZE is not referenced by the inference cell below, which passes batch_size=200.
BATCH_SIZE=128

In [None]:
# Prepare the test split the same way as the training split: engineered feature columns
# plus the tokenised `doc` column that Dataset reads.
test_df=pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_df=get_hand_engineered_feautures(test_df)
test_df['doc']=test_df.excerpt.apply(tokenizer)

test_df.head()

In [None]:
# 'test' phase: the dataset yields (X1, X2) only, which is the batch shape infer() unpacks.
infer_test_dataset=Dataset(test_df, MAX_SEQ_LEN, 'test')
# shuffle=False keeps the predictions in the same order as the rows of test_df,
# so the list returned by infer() can be assigned directly as a column.
infer_test_dataloader=torch.utils.data.DataLoader(infer_test_dataset, batch_size=200, shuffle=False)
test_df['target'] = infer(models, infer_test_dataloader)


In [None]:
# Keep only the id and predicted target columns; .copy() detaches the result from test_df.
submission_df=test_df[['id', 'target']].copy()
submission_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)