# <center> CommonLit Readability Prize </center>
 


* Goal
  * Predict the complexity of reading passages for grade 3-12 classroom use.
  
  
* About data - 
 >  * id - unique ID for excerpt
 >  * url_legal - URL of source - this is blank in the test set.
 >  * license - license of source material - this is blank in the test set.
 >  * excerpt - text to predict reading ease of
 >  * target - reading ease
 >  * standard_error - measure of spread of scores among multiple raters for each excerpt. Not included for test data.
 
 
* Special Notes - 
 * url_legal, license and standard_error are not available for test data.

### **Imports**

In [None]:
import numpy as np 
import pandas as pd
import nltk
import re
import seaborn as sns
import matplotlib.pyplot as plt

from toolz import compose
from itertools import combinations

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Reading Data

In [None]:
## load the competition train and test sets (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlitreadabilityprize/train.csv',
        '/kaggle/input/commonlitreadabilityprize/test.csv',
    )
)

## preview the first rows of the training set
train_data.head()

### EDA

In [None]:
## side-by-side histograms (with KDE) of the target and of its standard error
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 10))
sns.histplot(train_data['target'], ax=axes[0], kde=True)
sns.histplot(train_data['standard_error'], ax=axes[1], kde=True)

## each title goes on the axis that actually shows that column
axes[0].set(title='Target distribution')
axes[1].set(title='Standard error distribution')

In [None]:
## remove the columns that are not used for modelling (done in place)
unused_columns = ['url_legal', 'license', 'id']
train_data.drop(columns=unused_columns, inplace=True)

In [None]:
## raw excerpt text is the only feature; `target` is the regression label
X, y = train_data['excerpt'], train_data['target']

## hold out part of the labelled data for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


#### Addition of Skip-Grams

* Instead of using normal (contiguous) n-grams, I will subclass sklearn's `CountVectorizer` so that it produces skip-grams. Two parameters are required - 
  * **ngram_range** - a tuple `(min_number, max_number)` giving the range of gram sizes to generate.
    * min_number - minimum number of words per gram (1 - unigram, 2 - bigram, etc.)
    * max_number - maximum number of words per gram (1 - unigram, 2 - bigram, etc.)
    
  <br>
    
  * **k** - maximum total number of words that may be skipped inside a single gram
  
  * Ex - if sentence is **The forecast says there will be rain tomorrow.**, **k = 2** and **ngram_range = (1,3)**. Then, tokens would be - 
  
  ['be',
 'be rain',
 'be rain tomorrow',
 'be tomorrow',
 'forecast',
 'forecast says',
 'forecast says be',
 'forecast says there',
 'forecast says will',
 'forecast there',
 'forecast there be',
 'forecast there will',
 'forecast will',
 'forecast will be',
 'rain',
 'rain tomorrow',
 'says',
 'says be',
 'says be rain',
 'says there',
 'says there be',
 'says there rain',
 'says there will',
 'says will',
 'says will be',
 'says will rain',
 'the',
 'the forecast',
 'the forecast says',
 'the forecast there',
 'the forecast will',
 'the says',
 'the says there',
 'the says will',
 'the there',
 'the there will',
 'there',
 'there be',
 'there be rain',
 'there be tomorrow',
 'there rain',
 'there rain tomorrow',
 'there will',
 'there will be',
 'there will rain',
 'there will tomorrow',
 'tomorrow',
 'will',
 'will be',
 'will be rain',
 'will be tomorrow',
 'will rain',
 'will rain tomorrow',
 'will tomorrow']

In [None]:
### code to create skipgrams
class SkipGramVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer emits k-skip-n-grams, sentence by sentence.

    Documents are split into sentences with ``nltk.sent_tokenize`` and grams
    are built inside each sentence only, so no gram spans a sentence boundary.

    Parameters
    ----------
    k : int, default=1
        Extra look-ahead added to the window following the first word of a
        gram; at most ``k`` words in total are skipped within one gram.
    **kwds
        Forwarded unchanged to ``CountVectorizer.__init__``.
    """

    def __init__(self, k=1, **kwds):
        super().__init__(**kwds)
        self.k = k

    def build_sent_analyser(self, preprocess, stop_words, tokenize):
        """Return a callable turning one sentence into its list of skip-grams.

        The sentence is decoded, preprocessed and tokenized (in that order)
        before the skip-grams are generated.
        """
        def analyse_sentence(sent):
            sentence_tokens = tokenize(preprocess(self.decode(sent)))
            return self._word_skip_grams(sentence_tokens, stop_words)

        return analyse_sentence

    def build_analyzer(self):
        """Return a callable turning a whole document into its skip-grams."""
        sent_analyse = self.build_sent_analyser(
            self.build_preprocessor(),
            self.get_stop_words(),
            self.build_tokenizer(),
        )

        def analyse_document(doc):
            return self._sent_skip_grams(doc, sent_analyse)

        return analyse_document

    def _sent_skip_grams(self, doc, sent_analyze):
        """Concatenate the skip-grams of every sentence found in ``doc``."""
        return [
            gram
            for sentence in nltk.sent_tokenize(doc)
            for gram in sent_analyze(sentence)
        ]

    def _word_skip_grams(self, tokens, stop_words=None):
        """Build the k-skip-n-grams of one tokenized sentence.

        Stop words (if given) are removed first. Gram sizes follow
        ``self.ngram_range``; each gram is the space-joined first word plus
        ``n - 1`` words picked, in order, from the ``n - 1 + k`` words that
        follow it.
        """
        if stop_words is not None:
            tokens = [word for word in tokens if word not in stop_words]

        smallest_n, largest_n = self.ngram_range
        if largest_n == 1:
            # unigrams only: the (filtered) tokens are already the answer
            return tokens

        words = tokens
        n_words = len(words)
        if smallest_n == 1:
            # unigrams are the tokens themselves; start combining at bigrams
            grams = list(words)
            smallest_n = 2
        else:
            grams = []

        lookahead = self.k
        # gram size can never exceed the number of words in the sentence
        for size in range(smallest_n, min(largest_n, n_words) + 1):
            for start in range(n_words - size + 1):
                first_word = (words[start],)
                window = words[start + 1:start + size + lookahead]
                grams.extend(
                    " ".join(first_word + rest)
                    for rest in combinations(window, size - 1)
                )
        return grams

#### Preprocessing 
* I am writing a tokenizer function which - 
  * uses NLTK's `word_tokenize`
  * keeps only alphabetic characters
  * lemmatizes all the tokens.

In [None]:
## tokenizer function

lemm = nltk.stem.WordNetLemmatizer()

# Stop words are passed through unlemmatized: WordNet's default (noun)
# lemmatization would turn e.g. "was" into "wa", which the stop-word filter
# applied later by the vectorizer would no longer recognise.
_lemmatization_exempt = frozenset(nltk.corpus.stopwords.words("english"))


def tokenizer(text):
    """Split ``text`` into lowercase, alphabetic, lemmatized word tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Tokens in reading order. Every character outside ``a-z`` is treated
        as a separator; English stop words are returned as-is and all other
        tokens are lemmatized.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text.lower())
    return [
        tok if tok in _lemmatization_exempt else lemm.lemmatize(tok)
        for tok in nltk.tokenize.word_tokenize(letters_only)
        if tok
    ]

In [None]:
## NLTK's English stop-word list, stored as a set
english_stopword_list = nltk.corpus.stopwords.words("english")
stop_words = set(english_stopword_list)

#### **Building model using Sklearn Pipeline**

<br>

* We will be using Linear Regression as a baseline to predict the reading difficulty.

In [None]:
from sklearn.pipeline import Pipeline

## skip-gram counts (uni- to tri-grams, one skip) feeding a linear regression
skipgram_vectorizer = SkipGramVectorizer(
    ngram_range=(1, 3),
    k=1,
    stop_words=stop_words,
    tokenizer=tokenizer,
)
pipe = Pipeline([
    ('vectorizer', skipgram_vectorizer),
    ('clf', LinearRegression()),
])

## fit on the training split, then predict the held-out split
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

In [None]:
## hold-out error: compute the MSE once and derive the RMSE from it
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

print("MSE : " + str(mse))
print("\n")
print("RMSE : " + str(rmse))

#### **Plotting top words**

In [None]:
vectorizer = pipe[0]

## total count of every gram over the training split.
## The sparse matrix is summed directly: converting it to a dense array first
## would allocate n_documents x n_features cells just to add up its columns.
all_v = np.asarray(vectorizer.transform(X_train).sum(axis=0)).ravel()

## get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
## and fall back to the old name on versions that do not have it yet
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

## group (gram, count) pairs by the number of words in the gram
all_grams = {}
for gram, value in zip(feature_names, all_v):
    all_grams.setdefault(len(gram.split(" ")), []).append((gram, value))

## one panel per gram size, two panels per row
n_cols = 2
n_rows = max(1, -(-len(all_grams) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, gram_size in zip(flat_axes, sorted(all_grams)):
    top_grams = sorted(all_grams[gram_size], key=lambda pair: pair[1], reverse=True)[:10]

    ## dedicated names so the notebook-level target `y` is not overwritten
    gram_labels = [pair[0] for pair in top_grams]
    gram_counts = [pair[1] for pair in top_grams]

    sns.barplot(x=gram_counts, y=gram_labels, ax=ax, palette='plasma', orient='h')
    ax.set(title="Top " + str(gram_size) + " grams")

## hide panels that have no gram size to show
for spare_ax in flat_axes[len(all_grams):]:
    spare_ax.set_visible(False)

## lay out after drawing so the (long) gram labels are taken into account
fig.tight_layout(w_pad=20, h_pad=5)

#### Submission File

In [None]:
## predict the test excerpts and keep only the columns left after the drop below
test_predictions = pipe.predict(test_data['excerpt'])
test_data['target'] = test_predictions

submission_excluded = ['url_legal', 'license', 'excerpt']
test_data.drop(columns=submission_excluded, inplace=True)

test_data.to_csv('/kaggle/working/submission.csv', index=False)


In [None]:
## preview the first five rows of the submission frame
test_data.head(n=5)