# Introduction

This notebook predicts the reading ease of excerpts from literature. The dataset contains excerpts from several time periods, covering a wide range of reading-ease scores.

Import all the required libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import spacy

In [None]:
# Notebook-wide configuration parameters.
class CFG:
    """Namespace holding the settings shared across cells."""

    # number of cross-validation folds
    nfolds = 5

# Functions

In [None]:
def syllable_count(word):
    """Estimate the number of syllables in a single word.

    The heuristic counts groups of consecutive vowels (``aeiou``, either
    case), subtracts one for a trailing silent "e", and never reports fewer
    than one syllable for a non-empty word.

    Parameters
    ----------
    word : str
        The word to analyse.

    Returns
    -------
    int
        Estimated syllable count; ``0`` for an empty string.
    """
    # An empty string has no first character to inspect and no syllables.
    if not word:
        return 0

    vowels = "aeiouAEIOU"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # A new vowel group starts where a vowel follows a non-vowel
        # (or opens the word).
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # The silent-"e" correction is applied once per word, not once per
    # vowel group, and honours the same case-insensitivity as `vowels`.
    if word.endswith(("e", "E")):
        count -= 1

    # Every non-empty word has at least one syllable.
    return max(count, 1)

# Data and FE

In [None]:
# Read the train and test tables from ../input/commonlitreadabilityprize.
xtrain, xtest = map(
    pd.read_csv,
    (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    ),
)

## EDA

In [None]:
# Side-by-side kernel density estimates of the target and of its standard error.
palette = sns.diverging_palette(120, 220, n=20)
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(xtrain['target'], color=palette[-1], fill=True, ax=ax[0])
sns.kdeplot(xtrain['standard_error'], color=palette[5], fill=True, ax=ax[1])
ax[0].set_title("Target Distribution", font="Serif")
ax[1].set_title("Standard Error Distribution", font="Serif")
plt.show()

In [None]:
# Horizontal bar chart with the number of training rows per license value.
license_fig, license_ax = plt.subplots(figsize=(16, 8))
sns.countplot(data=xtrain, y="license", palette=palette, ax=license_ax)
license_ax.set_title("License Distribution", font="Serif")
plt.show()

## Summary statistics features


In [None]:
# Character count of every excerpt, for train and test alike.
for frame in (xtrain, xtest):
    frame['nof_char'] = frame['excerpt'].map(len)

In [None]:
# Word count per excerpt. The text is split on single space characters only,
# so other whitespace (e.g. newlines) does not separate words here.
def _space_token_count(text):
    """Return how many pieces ``text`` yields when split on ' '."""
    return len(text.split(' '))


xtrain['nof_words'] = xtrain['excerpt'].apply(_space_token_count)
xtest['nof_words'] = xtest['excerpt'].apply(_space_token_count)

In [None]:
# Ratio of word count to character count for each excerpt.
xtrain['w2c'] = xtrain['nof_words'].div(xtrain['nof_char'])
xtest['w2c'] = xtest['nof_words'].div(xtest['nof_char'])

In [None]:
# Sentence count, approximated by the number of '.' characters in the excerpt.
for frame in (xtrain, xtest):
    frame['nof_sentences'] = frame['excerpt'].apply(lambda text: text.count('.'))

In [None]:
# Total syllable estimate per excerpt.
# syllable_count() is a per-word heuristic, so it is applied to every
# whitespace-separated token and the results are summed, rather than
# treating the whole excerpt as if it were one long word.
def _excerpt_syllables(text):
    """Return the summed syllable estimate over all words of ``text``."""
    return sum(syllable_count(token) for token in text.split())


xtrain['nof_syllables'] = xtrain['excerpt'].apply(_excerpt_syllables)
xtest['nof_syllables'] = xtest['excerpt'].apply(_excerpt_syllables)

In [None]:
# Flesch reading-ease score:
#   206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
# Computed for train first and test second; `a` and `b` are scratch Series.
for frame in (xtrain, xtest):
    a = 206.835 - 1.015 * frame['nof_words'].div(frame['nof_sentences'])
    b = -84.6 * frame['nof_syllables'].div(frame['nof_words'])
    frame['fleisch_score'] = a + b

In [None]:
# Second Flesch-style score (the coefficients match the Flesch-Kincaid
# grade-level formula):
#   0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
for frame in (xtrain, xtest):
    a = frame['nof_words'].div(frame['nof_sentences'])
    b = frame['nof_syllables'].div(frame['nof_words'])
    frame['fleisch_score2'] = 0.39 * a + 11.8 * b - 15.59

# drop the scratch Series
del a, b

In [None]:
# Number of distinct space-separated tokens in each excerpt.
for frame in (xtrain, xtest):
    frame['nof_unique_words'] = frame['excerpt'].apply(
        lambda text: len(set(text.split(' ')))
    )

In [None]:
# Lexical diversity: share of distinct words among all words of an excerpt.
xtrain['txt_diversity'] = xtrain['nof_unique_words'].div(xtrain['nof_words'])
xtest['txt_diversity'] = xtest['nof_unique_words'].div(xtest['nof_words'])

In [None]:
# Longest and mean token length per excerpt (tokens = pieces split on ' ').
# Train is processed first, then test, so `words` / `word_lengths` end up
# holding the test-set values.
for frame in (xtrain, xtest):
    words = frame['excerpt'].apply(lambda text: text.split(' '))
    word_lengths = words.apply(lambda tokens: list(map(len, tokens)))
    frame['longest_word'] = word_lengths.apply(max)
    frame['avg_word'] = word_lengths.apply(np.mean)

## Spacy features

In [None]:
# One document vector per excerpt from the `en_core_web_lg` spaCy model.
nlp = spacy.load('en_core_web_lg')
# NOTE(review): disable_pipes() is called without any pipe names — confirm
# which pipeline components, if any, this actually switches off.
with nlp.disable_pipes():
    train_vectors = np.array([nlp(excerpt).vector for excerpt in xtrain['excerpt']])
    test_vectors = np.array([nlp(excerpt).vector for excerpt in xtest['excerpt']])

# One column name per vector dimension: f0, f1, ...
namelist = [f'f{ii}' for ii in range(train_vectors.shape[1])]

# Wrap the raw arrays in DataFrames that share those column names.
train_vectors = pd.DataFrame(train_vectors, columns=namelist)
test_vectors = pd.DataFrame(test_vectors, columns=namelist)

## Combine

In [None]:
# Attach the vector columns to the existing tables, column-wise.
xtrain = pd.concat((xtrain, train_vectors), axis=1)
xtest = pd.concat((xtest, test_vectors), axis=1)

# Model inputs: nine summary-statistic columns followed by the vector columns.
features = [
    'nof_words',
    'nof_sentences',
    'nof_syllables',
    'fleisch_score',
    'txt_diversity',
    'nof_unique_words',
    'nof_char',
    'w2c',
    'fleisch_score2',
] + namelist

# Cross Validation 

In [None]:
# Fold splitter driven by the configured number of folds.
kf = KFold(n_splits=CFG.nfolds)

# Single-column buffers: out-of-fold predictions for train, averaged
# predictions for test.
prval = np.zeros((len(xtrain), 1))
prfull = np.zeros((len(xtest), 1))

In [None]:
# K-fold cross-validation: fit a Ridge model on each training split, store the
# out-of-fold predictions and accumulate the fold-averaged test predictions.
x_all, y_all = xtrain[features], xtrain['target']
x_test = xtest[features]

for id0, id1 in kf.split(xtrain):
    # KFold yields positional row indices, so select with .iloc.
    x0, x1 = x_all.iloc[id0], x_all.iloc[id1]
    y0, y1 = y_all.iloc[id0], y_all.iloc[id1]

    model = Ridge(alpha=1)
    model.fit(x0, y0)

    # Predict the held-out fold once and reuse the result.
    ypred = model.predict(x1)
    prval[id1, 0] = ypred
    # Each fold contributes 1/nfolds of the final test prediction.
    prfull[:, 0] += model.predict(x_test) / CFG.nfolds

    # Per-fold RMSE on the held-out rows.
    print(np.round(np.sqrt(mse(y1, ypred)), 2))

# score: RMSE over all out-of-fold predictions
print('--')
print(np.round(np.sqrt(mse(y_all, prval[:, 0])), 3))

# Submission

In [None]:
# Build the submission table (id plus predicted target), write it to
# submission.csv and display it as the cell output.
xsub = xtest[["id"]].assign(target=prfull)
xsub.to_csv('submission.csv', index=False)
xsub