In [None]:
import os
import string
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import seaborn as sns

from nltk.stem import WordNetLemmatizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS


from collections import defaultdict

In [None]:
# Single WordNet lemmatizer instance reused by every tokenization step in this notebook.
lemmatizer=WordNetLemmatizer()

In [None]:
# Pipeline created directly from the English language class.
nlp=English()
# Default English stop-word set (not referenced again in the visible cells;
# filtering below relies on token.is_stop instead).
stop_words=nlp.Defaults.stop_words

# Register a sentencizer component on the pipeline and show the resulting pipe names.
# NOTE(review): create_pipe(...) passed to add_pipe(...) is the spaCy v2 calling
# convention; spaCy v3 expects nlp.add_pipe('sentencizer') — confirm the installed version.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
print(nlp.pipe_names)

In [None]:
# Load the training set and keep only the columns used in this analysis.
train_columns = ['id', 'excerpt', 'target']
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')[train_columns]
train.head()

In [None]:
print("Number Of Records:", train.id.nunique())

In [None]:
train.target.describe()

In [None]:
plt.plot(train.target.sort_values().values)

In [None]:

# displot is a figure-level function: it creates its own figure, so the size has
# to be requested through height/aspect (5 in tall, 2:1 -> 10x5 in). Calling
# plt.figure() afterwards would only open a second, empty figure.
target_grid = sns.displot(data=train, x='target', bins=100, height=5, aspect=2)
target_grid.ax.set_title("Reading Ease")
target_grid.ax.set_xticks(np.arange(-4, 2, 0.5))
plt.show()

In [None]:
sns.boxplot(data=train, x='target')

The target is approximately normally distributed, with values ranging roughly from -3.5 to 1.5.

50% of the passages have a reading-ease score of at most -0.9.

In [None]:
class Vocab:
    """Frequency-counted vocabulary built from a collection of passages.

    Attributes (populated by ``build``):
        word2id:    lemma -> integer id, in order of first appearance.
        id2word:    integer id -> lemma (inverse of ``word2id``).
        vocab_freq: lemma -> number of occurrences across all passages.
        vocab_:     lemmas listed in id order.
    """

    def __init__(self, passages):
        self.passages = passages
        self.word2id = {}
        self.id2word = {}
        self.vocab_freq = {}
        self.vocab_ = []

    def build(self):
        """Count lemma occurrences over all passages, then assign ids.

        Stop words, punctuation, number-like tokens and tokens of at most one
        character are skipped; the remaining tokens are lower-cased and
        lemmatized before being counted.
        """
        counts = self.vocab_freq
        for passage in self.passages:
            for token in nlp(passage):
                too_short = len(token.text.strip()) <= 1
                if token.is_stop or token.is_punct or token.like_num or too_short:
                    continue
                lemma = lemmatizer.lemmatize(token.lower_.strip())
                counts[lemma] = counts.get(lemma, 0) + 1

        for position, lemma in enumerate(counts):
            self.word2id[lemma] = position
            self.id2word[position] = lemma
            self.vocab_.append(lemma)

    def __len__(self):
        """Number of distinct lemmas registered by ``build``."""
        return len(self.vocab_)

In [None]:
%%time
vocab=Vocab(train.excerpt.values)
vocab.build()

In [None]:
print("Vocab Size:", len(vocab))

In [None]:
# One row per vocabulary entry: the lemma and its corpus-wide count.
word_freq_pairs = list(vocab.vocab_freq.items())
df = pd.DataFrame(word_freq_pairs, columns=["word", "freq"])
df.head()

In [None]:
df.freq.describe()

In [None]:
plt.plot(df.freq.sort_values().values)

In [None]:
plt.plot(np.log(df.freq).sort_values().values)

In [None]:
# Cumulative frequency of the words occurring more than 5 times.
# Sort first, then accumulate: taking the cumsum in dictionary order and sorting
# afterwards is a no-op sort over an arbitrarily ordered running total.
frequent_word_freqs = df[df.freq > 5].freq
plt.plot(frequent_word_freqs.sort_values().cumsum().values)

1. The word frequencies follow a long-tail distribution.

2. The sorted-frequency curve stays almost flat (sparse, low-frequency words) up to roughly the 14,000th word, rises gradually until about the 25,000th word, and then climbs sharply.

3. 75% of the words have a frequency of at most 5.

In [None]:
def get_frequency_distribution(passage):
    """Histogram of corpus-level word frequencies for a single passage.

    Tokens that are stop words, punctuation, number-like or at most one
    character long are skipped. Every remaining token is lower-cased,
    lemmatized and looked up in ``vocab.vocab_freq``; lemmas that are not in
    the vocabulary are ignored.

    Returns a ``defaultdict(int)`` mapping a corpus frequency to the number of
    kept tokens in ``passage`` whose lemma has that frequency.
    """
    histogram = defaultdict(int)
    for token in nlp(passage):
        too_short = len(token.text.strip()) <= 1
        if token.is_stop or token.is_punct or token.like_num or too_short:
            continue
        lemma = lemmatizer.lemmatize(token.lower_.strip())
        corpus_freq = vocab.vocab_freq.get(lemma)
        if corpus_freq is not None:
            histogram[corpus_freq] += 1
    return histogram

In [None]:
# Per-excerpt histogram: corpus frequency -> number of tokens having that frequency.
train['freq_dist'] = train.excerpt.apply(get_frequency_distribution)
train.head()

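Candidate frequency bins:

bin1 --> [0, 20]

bin2 --> [21, 50]

bin3 --> [51, 100]

bin4 --> [101, 500]

bin5 --> [> 500]

Note: the code below merges bin3, bin4 and bin5, so only three bins are actually used: [0, 20], [21, 50] and [> 50].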

In [None]:
def get_bin_num(freq):
    """Return the frequency-bin index for a corpus frequency.

    0 for freq <= 20, 1 for 20 < freq <= 50, 2 for anything larger.
    """
    upper_bounds = (20, 50)
    for bin_index, upper in enumerate(upper_bounds):
        if freq <= upper:
            return bin_index
    return len(upper_bounds)
# Tag every vocabulary word with its frequency-bin index (0, 1 or 2).
df['bin_num']=df.freq.apply(get_bin_num)
df.head()

In [None]:
def bin_distribution(freq_dist):
    """Share of a passage's tokens falling into each corpus-frequency bin.

    Parameters
    ----------
    freq_dist : mapping
        Corpus frequency -> number of tokens in the passage with that
        frequency (as produced by ``get_frequency_distribution``).

    Returns
    -------
    numpy.ndarray of shape (3,)
        Proportions, rounded to 2 decimals, of tokens whose corpus frequency
        is <= 20, in (20, 50], and > 50 respectively. All zeros when
        ``freq_dist`` is empty.
    """
    bins = np.zeros(3)
    for corpus_freq, token_count in freq_dist.items():
        if corpus_freq <= 20:
            bins[0] += token_count
        elif corpus_freq <= 50:
            bins[1] += token_count
        else:
            bins[2] += token_count

    total_cnt = bins.sum()
    if total_cnt == 0:
        # A passage with no in-vocabulary tokens would otherwise yield 0/0 = NaN,
        # which then propagates into the model input and the predictions.
        return bins
    return np.round(bins / total_cnt, 2)

In [None]:
# Compute the 3-bin share vector per excerpt, then expose each share as its own column.
train['bin_distribution'] = train.freq_dist.apply(bin_distribution)
for position, column in enumerate(('bin1', 'bin2', 'bin3')):
    train[column] = train.bin_distribution.apply(lambda shares, i=position: shares[i])

train.head()

In [None]:
sns.countplot(data=df, x='bin_num')


In [None]:
# Side-by-side histograms of the three bin shares on a common y-axis.
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(10, 4))

panels = zip(ax, ('bin1', 'bin2', 'bin3'), ('Bin1', 'Bin2', 'Bin3'))
for axis, column, title in panels:
    sns.histplot(train, x=column, ax=axis)
    axis.set_title(title)

plt.show()

In [None]:
# Stacked boxplots of the three bin shares on a common x-axis.
fig, ax = plt.subplots(3, 1, sharex=True, figsize=(10, 7))

for axis, column in zip(ax, ('bin1', 'bin2', 'bin3')):
    sns.boxplot(data=train, x=column, ax=axis)

plt.show()

The distributions of the three bins differ from one another.

Do these proportions contribute to reading ease?

Hypotheses:
1. A high proportion of rare words makes a passage harder to read, and vice versa.
2. A high proportion of common words makes a passage easier to read, and vice versa.

In [None]:
train.head()

In [None]:
# Top row: 2-D histogram of each bin share against the target.
# Bottom row: line plot of the target as a function of the same bin share.
_, ax = plt.subplots(2, 3, figsize=(15, 6))

for col_idx, column in enumerate(('bin1', 'bin2', 'bin3')):
    sns.histplot(data=train, x=column, y='target', ax=ax[0][col_idx])
    sns.lineplot(data=train, x=column, y='target', ax=ax[1][col_idx])


As expected, the line graphs show that:
 1. ease of reading decreases as the proportion of rare words in the text increases
 2. ease of reading increases as the proportion of more common words in the text increases.

# Analysis of the topics with Topic Modelling LDA

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
!pip install pyldavis

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import TfidfModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [None]:
# One list of lemmas per excerpt, using the same token filter as the vocabulary:
# stop words, punctuation, number-like tokens and tokens of at most one character are dropped.
docs = [
    [
        lemmatizer.lemmatize(token.lower_.strip())
        for token in nlp(passage)
        if not (
            token.is_stop
            or token.is_punct
            or token.like_num
            or len(token.text.strip()) <= 1
        )
    ]
    for passage in train.excerpt.values
]

In [None]:
# Build the gensim token dictionary from the lemmatized documents and convert
# each document to its bag-of-words representation.
dictionary=Dictionary(docs)
corpus=[dictionary.doc2bow(doc) for doc in docs]
# TF-IDF re-weighting is currently disabled, so the bag-of-words corpus is used as is.
#tfidfModel=TfidfModel(corpus)
#corpus=tfidfModel[corpus]
print("Number Of Unique Tokens:", len(dictionary))
print("Number Of Documents", len(corpus))

In [None]:
# Invert the dictionary's token -> id mapping.
id2word = {idx: word for word, idx in dictionary.token2id.items()}

In [None]:
#ldaModel=LdaModel(corpus, num_topics=30,passes=30,iterations=500,eval_every=1,id2word=id2word)
#gensimvis.prepare(ldaModel, corpus, dictionary)

# Modeling

In [None]:
import torch
import torch.nn as nn

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Row-wise view over a DataFrame holding a ``bin_distribution`` column.

    In the ``'train'`` phase each item is ``(features, target)``; in any other
    phase only ``features`` is returned. Both are float32 tensors.
    """

    def __init__(self, df, phase):
        self.df = df
        self.phase = phase

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        features = torch.tensor(record.bin_distribution, dtype=torch.float32)
        if self.phase != 'train':
            return features
        label = torch.tensor(record.target, dtype=torch.float32)
        return features, label

In [None]:
class Model(nn.Module):
    """Single fully connected layer mapping ``in_features`` to ``out_features``."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
# Training items are (bin_distribution, target) pairs; batches hold up to 1000
# rows and are reshuffled on every pass.
train_dataset=Dataset(train, phase='train')
train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=1000, shuffle=True)


In [None]:
def train_epoch(model, optimizer, mse_loss, dataloader=None):
    """Run one optimization pass over the training batches.

    Parameters
    ----------
    model : torch.nn.Module
        Model producing one prediction per input row.
    optimizer : torch.optim.Optimizer
        Optimizer bound to ``model``'s parameters.
    mse_loss : callable
        Loss taking ``(prediction, target)`` and returning a scalar tensor.
    dataloader : iterable of (X, y) batches, optional
        Defaults to the notebook-level ``train_dataloader``.

    Returns
    -------
    float
        Mean of the per-batch losses.
    """
    if dataloader is None:
        dataloader = train_dataloader

    model.train()
    epoch_loss = 0.0
    for X, y in dataloader:
        # The linear head returns (batch, 1) while the targets are (batch,).
        # Without dropping the trailing axis, the loss broadcasts both to
        # (batch, batch) and compares every prediction with every target.
        y_hat = model(X).squeeze(-1)
        # Predictions are clamped to [-3.6, 1.6] before the loss is computed.
        y_hat = torch.clip(y_hat, -3.6, 1.6)
        loss = mse_loss(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss /= len(dataloader)
    return epoch_loss

In [None]:
# Train 7 linear models from separate initialisations, 120 epochs each, and
# keep all of them for ensembling; the loss is reported every 10th epoch.
models = []
for model_number in range(1, 8):
    model = Model(3, 1)
    mse_loss = torch.nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
    loss = 0.0
    for epoch in range(1, 121):
        loss = train_epoch(model, optimizer, mse_loss)
        if epoch % 10 == 0:
            print("Epoch:{} | Loss:{:.3f}".format(epoch, loss))
    print("Loss At the End of the Model Iteration {} is :{:.3f}".format(model_number, loss))
    models.append(model)

In [None]:
test_df=pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_df.head()

In [None]:
# Featurize the test excerpts with the same functions used for the training set;
# the corpus frequencies are looked up in the vocabulary built from the training excerpts.
test_df['freq_dist'] = test_df.excerpt.apply(get_frequency_distribution)
test_df['bin_distribution']=test_df.freq_dist.apply(bin_distribution)


test_df.head()

In [None]:
test_dataset = Dataset(test_df, phase='test')
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=1000)

# Put every ensemble member in evaluation mode once, up front.
for model in models:
    model.eval()

# Average the flattened predictions of all models, batch by batch.
preds = []
with torch.no_grad():
    for X in test_dataloader:
        ensemble_sum = torch.zeros(X.shape[0])
        for model in models:
            ensemble_sum += model(X).view(-1)
        ensemble_sum /= len(models)
        preds.extend(ensemble_sum.numpy())


In [None]:
# Attach the ensemble-averaged predictions and write the id/target submission file.
test_df['target']=preds
test_df[['id', 'target']].to_csv('submission.csv', index=False)

In [None]:
test_df.head()