# CommonLit Readability

## Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS,CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import Ridge,LinearRegression,PassiveAggressiveRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn import set_config
from xgboost import XGBRegressor,XGBClassifier
set_config(display='diagram')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test sets from the Kaggle input directory
train,test1=(
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/commonlitreadabilityprize/train.csv',
        '/kaggle/input/commonlitreadabilityprize/test.csv',
    )
)


In [None]:
train.head(20)

In [None]:
# Look for missing values: print the number of non-null entries of each column,
# for the training set and for the test set.
# 'target' and 'standard_error' are skipped for both sets.
skipped_columns=('target','standard_error')
for col in train.columns:
    if col in skipped_columns:
        continue
    print(col,train[col].count())
    print('test:',col,test1[col].count())

In [None]:
# Drop the features we won't use.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises a KeyError
# because the columns are already gone.
unused_columns=['url_legal','license']
train=train.drop(columns=unused_columns,errors='ignore')
test1=test1.drop(columns=unused_columns,errors='ignore')

We now have only three columns in the training set.

## Exploration and Analysis

We begin by looking at general features of the dataset, such as the distributions of the standard error, the score and the excerpt length.

### Simple statistics on the excerpts

In [None]:
# First take a look at the distribution of the score column: we plot its quantiles
# (0%, 0.5%, ..., 99.5%, then the max) and the curve is pretty smooth
plt.figure()
target_summary=train.target.describe(percentiles=np.arange(200)/200)
# positions 0-3 of describe() are count / mean / std / min, so we skip them
target_summary.iloc[4:].plot()
plt.title('Distribution des scores')
plt.savefig('distribution_scores.png')
plt.show()

In [None]:
# Tokenizer used to explore the words: it extracts every run of word characters (\w+)
tknzr=nltk.RegexpTokenizer(r'\w+')
# One list of tokens per excerpt, for the training and the test sets
train_words,test_words=(frame.excerpt.apply(tknzr.tokenize) for frame in (train,test1))

In [None]:
train_words

In [None]:
# Now we can take a look at the distribution of the number of words per excerpt
plt.figure()
word_count_summary=train_words.apply(len).describe(percentiles=np.arange(200)/200)
# skip count / mean / std / min and keep the quantiles and the max
word_count_summary.iloc[4:].plot()
plt.title('Nombre de mots par texte')
plt.savefig('distribution_nombre_de_mots.png')
plt.show()

In [None]:
# We do the same for the standard error: plot its quantiles and its max
standard_error_summary=train.standard_error.describe(percentiles=np.arange(200)/200)
standard_error_summary.iloc[4:].plot()

### Simple correlation between length and ease

In this part we'll look for simple correlations with the target column.

In [None]:
train_words.apply(len).min()

In [None]:
# We first look for a correlation between the target and the number of words:
# the raw relation as a line plot, with a linear fit (in red) on top
excerpt_word_counts=train_words.apply(len)
plt.figure()
sns.lineplot(x=excerpt_word_counts,y=train.target)
sns.regplot(x=excerpt_word_counts,y=train.target,marker="",color='red')
plt.title("Relation longueur de l'extrait - difficulté")
plt.savefig('longueur_extrait.png')
plt.show()

In [None]:
# Here we can see that the standard error is homogeneous with respect to the value of the target.
plt.figure()
sns.lineplot(data=train,x='target',y='standard_error')
plt.title('Erreur Standard')
plt.savefig('standard_error.png')
plt.show()

We do the same with the mean length of the words inside each excerpt.

In [None]:
# Mean length (in characters) of the tokens of each training excerpt
train_words_length_mean=[
    np.mean([len(word) for word in excerpt_words])
    for excerpt_words in train_words
]

In [None]:
# The correlation with the mean length of a word is a bit more pronounced:
# raw relation as a line plot, with a linear fit (in red) on top
mean_word_lengths=train_words_length_mean
plt.figure()
sns.lineplot(x=mean_word_lengths,y=train.target)
sns.regplot(x=mean_word_lengths,y=train.target,marker="",color='red')
plt.title("Relation longueur moyenne d'un mot - difficulté")
plt.savefig('longueur_moyenne_mot.png')
plt.show()

We observe a slight correlation between ease and the mean length of words: texts that are harder to read are more prone to have longer words.

### Common words distribution

In the following parts we will try to link the type of vocabulary used with the ease of reading. We begin with the most common words.

In [None]:
# Counter restricted to the 1- to 3-grams that appear in at least 80% of the excerpts
common_words_counter=CountVectorizer(min_df=0.8,ngram_range=(1,3))

In [None]:
# Learn that vocabulary on the training excerpts and count its occurrences in each of them
common_words_count=common_words_counter.fit_transform(train.excerpt)

In [None]:
# Size of the vocabulary that was kept
len(common_words_counter.vocabulary_)

In [None]:
# Collapse the per-term counts into one total per excerpt (sum over the vocabulary axis).
# Note: this overwrites the count matrix with its row sums.
common_words_count=common_words_count.sum(axis=1)

In [None]:
# We can see a soft correlation, but we have to keep in mind that we're looking at a vocabulary of only 5 words
common_counts=common_words_count.flatten().tolist()[0]  # one total per excerpt, as a flat list
plt.figure()
sns.lineplot(x=common_counts,y=train.target)
sns.regplot(x=common_counts,y=train.target,marker="",color='red')
plt.title('Relation nombre de mot courant - difficulté')
plt.savefig('courant_difficulté.png')
plt.show()

In [None]:
# Same relation with the count divided by the excerpt length in characters
common_ratio=common_words_count.flatten().tolist()[0]/train.excerpt.apply(len)
sns.lineplot(x=common_ratio,y=train.target)
sns.regplot(x=common_ratio,y=train.target,marker="",color='red')

### Rare words distribution

Now we do the same with the least shared words.

In [None]:
# Counter restricted to the single words that appear in at most 5% of the excerpts
rare_words_counter=CountVectorizer(max_df=0.05,)

In [None]:
# Learn that vocabulary on the training excerpts and count its occurrences in each of them
rare_words_count=rare_words_counter.fit_transform(train.excerpt)

In [None]:
# The vocabulary is much bigger, which is expected
len(rare_words_counter.vocabulary_)

In [None]:
# Collapse the per-term counts into one total per excerpt (sum over the vocabulary axis).
# Note: this overwrites the count matrix with its row sums.
rare_words_count=rare_words_count.sum(axis=1)

In [None]:
# Smallest number of rare words found in a single excerpt
rare_words_count.min()

In [None]:
# We can see a good correlation
rare_counts=rare_words_count.flatten().tolist()[0]  # one total per excerpt, as a flat list
plt.figure()
sns.lineplot(x=rare_counts,y=train.target)
sns.regplot(x=rare_counts,y=train.target,marker="",color='red')
plt.title('Relation nombre de mot rare - difficulté')
plt.savefig('rare_difficulté.png')
plt.show()

We can see that texts with very specific words are a bit harder to read.

In [None]:
# Same relation with the count divided by the excerpt length in characters
rare_ratio=rare_words_count.flatten().tolist()[0]/train.excerpt.apply(len)
sns.lineplot(x=rare_ratio,y=train.target)
sns.regplot(x=rare_ratio,y=train.target,marker="",color='red')

### Other words distribution

We finally do the same for the in-between vocabulary.

In [None]:
# Counter restricted to the 1- to 4-grams that appear in at least 10% and at most 60% of the excerpts
other_words_counter=CountVectorizer(max_df=0.6,min_df=0.1,ngram_range=(1,4))

In [None]:
# Learn that vocabulary on the training excerpts and count its occurrences in each of them
other_words_count=other_words_counter.fit_transform(train.excerpt)

In [None]:
# Size of the vocabulary that was kept
len(other_words_counter.vocabulary_)

In [None]:
# Collapse the per-term counts into one total per excerpt (sum over the vocabulary axis).
# Note: this overwrites the count matrix with its row sums.
other_words_count=other_words_count.sum(axis=1)

In [None]:
# We now have a correlation, but in the other direction.
other_counts=other_words_count.flatten().tolist()[0]  # one total per excerpt, as a flat list
plt.figure()
sns.lineplot(x=other_counts,y=train.target)
sns.regplot(x=other_counts,y=train.target,marker="",color='red')
plt.title('Relation nombre de mot moyen - difficulté')
plt.savefig('autre_difficulté.png')
plt.show()

In [None]:
# Same relation with the count divided by the excerpt length in characters
# (only the regression plot is drawn here; the line plot is left out)
other_ratio=other_words_count.flatten().tolist()[0]/train.excerpt.apply(len)
sns.regplot(x=other_ratio,y=train.target)

### Data Preparation

We will now prepare the data before testing the algorithms. The first step is to add the features we just discussed in the exploration.

In [None]:
# 'length' feature: number of tokens in each excerpt
train['length']=train_words.map(len)
test1['length']=test_words.map(len)


In [None]:
# 'words_length_mean' feature: mean length (in characters) of the tokens of each excerpt
for frame,tokenized in ((train,train_words),(test1,test_words)):
    frame['words_length_mean']=[
        np.mean([len(word) for word in excerpt_words])
        for excerpt_words in tokenized
    ]

In [None]:
# Now we divide the data between a training set (90%) and an evaluation set (10%)
X_t,X_e,y_t,y_e=train_test_split(train[['excerpt','length','words_length_mean']],train.target,test_size=0.1,random_state=2)
# Take explicit copies: the following cells add columns to X_t and X_e, and writing into
# the slices returned by the split triggers pandas' SettingWithCopyWarning.
X_t,X_e=X_t.copy(),X_e.copy()

In [None]:
# For safety we work on an independent (deep) copy of the test set, the one used for the submission
test2=test1.copy(deep=True)

In [None]:
test2

In [None]:
#X_t['common_words_count']=common_words_counter.fit_transform(X_t.excerpt).sum(axis=1).flatten().tolist()[0]
#X_e['common_words_count']=common_words_counter.transform(X_e.excerpt).sum(axis=1).flatten().tolist()[0]
#test2['common_words_count']=common_words_counter.transform(test2.excerpt).sum(axis=1).flatten().tolist()[0]

In [None]:
# We count the rare words of every excerpt. The vocabulary is learned on the training split
# only (fit_transform) and reused unchanged for the evaluation and test sets (transform).
# np.asarray(...).ravel() flattens the per-row sums whether scipy returns them as an np.matrix
# or as a plain ndarray; the former `.flatten().tolist()[0]` only worked for np.matrix.
X_t['rare_words_count']=np.asarray(rare_words_counter.fit_transform(X_t.excerpt).sum(axis=1)).ravel()
X_e['rare_words_count']=np.asarray(rare_words_counter.transform(X_e.excerpt).sum(axis=1)).ravel()
test2['rare_words_count']=np.asarray(rare_words_counter.transform(test2.excerpt).sum(axis=1)).ravel()

In [None]:
# We count the in-between words of every excerpt. The vocabulary is learned on the training
# split only (fit_transform) and reused unchanged for the evaluation and test sets (transform).
# np.asarray(...).ravel() flattens the per-row sums whether scipy returns them as an np.matrix
# or as a plain ndarray; the former `.flatten().tolist()[0]` only worked for np.matrix.
X_t['other_words_count']=np.asarray(other_words_counter.fit_transform(X_t.excerpt).sum(axis=1)).ravel()
X_e['other_words_count']=np.asarray(other_words_counter.transform(X_e.excerpt).sum(axis=1)).ravel()
test2['other_words_count']=np.asarray(other_words_counter.transform(test2.excerpt).sum(axis=1)).ravel()

In [None]:
#we can look the result on the test set
test2

In [None]:
#We instantiate tf-idf vectorizers (1- to 3-grams) in order to encode the excerpts
# tfidf_mid keeps the terms present in at least 10% and at most 70% of the documents
tfidf_mid=TfidfVectorizer(max_df=0.7,min_df=0.1,ngram_range=(1,3))
# tfidf_rare keeps the terms present in at least 2 documents and in at most 5% of them
# NOTE(review): tfidf_rare is not used by any of the visible cells - confirm whether it was
# meant to replace rare_words_counter when the rare vocabulary is encoded.
tfidf_rare=TfidfVectorizer(max_df=0.05,min_df=2,ngram_range=(1,3))

In [None]:
#We encode the excerpts; every encoder is fitted on the training split only
# Rare vocabulary: raw counts from rare_words_counter (a CountVectorizer, not a tf-idf),
# converted to dense arrays
word_train_rare=rare_words_counter.fit_transform(X_t.excerpt).toarray()
word_eval_rare=rare_words_counter.transform(X_e.excerpt).toarray()
word_test_rare=rare_words_counter.transform(test2.excerpt).toarray()

# In-between vocabulary: tf-idf weights from tfidf_mid, converted to dense arrays
word_train_mid=tfidf_mid.fit_transform(X_t.excerpt).toarray()
word_eval_mid=tfidf_mid.transform(X_e.excerpt).toarray()
word_test_mid=tfidf_mid.transform(test2.excerpt).toarray()

In [None]:
# Wrap the encoded arrays in DataFrames so they can easily be glued onto our datasets
word_train_rare,word_eval_rare,word_test_rare=map(
    pd.DataFrame,(word_train_rare,word_eval_rare,word_test_rare)
)
word_train_mid,word_eval_mid,word_test_mid=map(
    pd.DataFrame,(word_train_mid,word_eval_mid,word_test_mid)
)

In [None]:
# Put back the row labels of the source datasets so the encoded frames align with them
for encoded_frame,source_frame in (
    (word_train_rare,X_t),
    (word_eval_rare,X_e),
    (word_test_rare,test2),
    (word_train_mid,X_t),
    (word_eval_mid,X_e),
    (word_test_mid,test2),
):
    encoded_frame.index=source_frame.index

In [None]:
#We can look at the encoding made by the TfidfVectorizer
word_eval_mid

In [None]:
#Because the vocabulary kept for the rare words is very big we use a TruncatedSVD to reduce the dimensions
# (80 components for the in-between encoding, 350 for the rare one; fixed seed for reproducibility)
tsvd_mid=TruncatedSVD(n_components=80,random_state=2)
tsvd_rare=TruncatedSVD(n_components=350,random_state=2)

In [None]:
# We project both encodings onto their SVD components (fitted on the training split only)
# and put everything back into DataFrames aligned with the rows of the source datasets.
# Every component gets a unique string name. With the default 0..n-1 integer labels, the "mid"
# and the "rare" blocks share the labels 0-79 once they are concatenated next to the
# string-named features, which breaks label-based column selection and scikit-learn's
# feature-name validation.
mid_columns=[f'mid_svd_{i}' for i in range(tsvd_mid.n_components)]
word_train_mid=pd.DataFrame(tsvd_mid.fit_transform(word_train_mid),index=X_t.index,columns=mid_columns)
word_eval_mid=pd.DataFrame(tsvd_mid.transform(word_eval_mid),index=X_e.index,columns=mid_columns)
word_test_mid=pd.DataFrame(tsvd_mid.transform(word_test_mid),index=test2.index,columns=mid_columns)

rare_columns=[f'rare_svd_{i}' for i in range(tsvd_rare.n_components)]
word_train_rare=pd.DataFrame(tsvd_rare.fit_transform(word_train_rare),index=X_t.index,columns=rare_columns)
word_eval_rare=pd.DataFrame(tsvd_rare.transform(word_eval_rare),index=X_e.index,columns=rare_columns)
word_test_rare=pd.DataFrame(tsvd_rare.transform(word_test_rare),index=test2.index,columns=rare_columns)

In [None]:
# We look at the cumulative explained-variance ratio kept by each TruncatedSVD to check
# that we chose good numbers of components
fig,axs = plt.subplots(1,2,figsize=(15,5))
axs[0].set_title('T-SVD rare')
sns.lineplot(y=np.cumsum(tsvd_rare.explained_variance_ratio_),x=np.arange(tsvd_rare.explained_variance_ratio_.shape[0]),ax=axs[0])
axs[1].set_title('T-SVD middle')
sns.lineplot(y=np.cumsum(tsvd_mid.explained_variance_ratio_),x=np.arange(tsvd_mid.explained_variance_ratio_.shape[0]),ax=axs[1])
plt.savefig('explained_var_tsvd.png')
# plt.show() like every other plotting cell: fig.show() expects a GUI backend
plt.show()

In [None]:
# Now we concatenate the data: every dataset loses its raw 'excerpt' column and gains,
# in this order, the "mid" SVD components and then the "rare" SVD components
X_t=pd.concat([X_t.drop(columns='excerpt',errors='ignore'),word_train_mid,word_train_rare],axis=1)
X_e=pd.concat([X_e.drop(columns='excerpt',errors='ignore'),word_eval_mid,word_eval_rare],axis=1)
test2=pd.concat([test2.drop(columns='excerpt',errors='ignore'),word_test_mid,word_test_rare],axis=1)

In [None]:
#Then we'll use a standard scaler to have more homogeneous data
# with_std=False: the features are only centred on their mean, they are not scaled to unit variance
std=StandardScaler(with_std=False)

In [None]:
X_t

In [None]:
X_t[[i for i in X_t.columns if i!='excerpt']]=std.fit_transform(X_t[[i for i in X_t.columns if i!='excerpt']])
X_e[[i for i in X_e.columns if i!='excerpt']]=std.transform(X_e[[i for i in X_e.columns if i!='excerpt']])
test2[[i for i in test2.columns if i!='excerpt' and i!='id']]=std.transform(test2[[i for i in test2.columns if i!='excerpt' and i!='id']])

## Modelling

### Linear Regression

We begin with a simple linear regression, which performs pretty well.

In [None]:
lr=LinearRegression(n_jobs=-1)

In [None]:
gs_lr=GridSearchCV(lr,param_grid={},scoring='neg_root_mean_squared_error')

In [None]:
gs_lr.fit(X_t,y_t)

In [None]:
gs_lr.best_score_

In [None]:
lr.fit(X_t,y_t)

In [None]:
mean_squared_error(y_e,lr.predict(X_e),squared=False)

### SVM

Then we test an SVM regressor (SVR) with a grid search over its parameters.

In [None]:
svc=SVR(C=2.8)

In [None]:
svc.fit(X_t,y_t)

In [None]:
mean_squared_error(y_e,svc.predict(X_e),squared=False)

In [None]:
# Grid search over the regularization strength C (integers 1 to 11) and the kernel,
# scored with the negated RMSE; verbose=1000 logs the progress of every fit
gs_svm=GridSearchCV(svc,param_grid={'C':np.arange(1,12),'kernel':['rbf','poly','sigmoid',]},verbose=1000,
                #n_jobs=-1,
                scoring='neg_root_mean_squared_error')

In [None]:
gs_svm.fit(X_t,y_t)

In [None]:
# Best cross-validated score (negated RMSE)
gs_svm.best_score_

In [None]:
# Parameter combination that reached it
gs_svm.best_params_

In [None]:
mean_squared_error(y_e,gs_svm.predict(X_e),squared=False)

In [None]:
svc=SVR(**gs_svm.best_params_)
svc.fit(X_t,y_t)

In [None]:
# We measure the RMSE with the parameters found
mean_squared_error(y_e,svc.predict(X_e),squared=False)

### XGBoost

We test an XGBoost regression with some grid search; since it is the slowest algorithm to run, we only search over a reduced set of parameters.

In [None]:
xgb=XGBRegressor(n_jobs=-1,n_estimators=500,reg_alpha=0.2,objective='reg:squarederror')

In [None]:
xgb.get_params()

In [None]:
xgb.fit(X_t.values,y_t)

In [None]:
mean_squared_error(y_e,xgb.predict(X_e.values),squared=False)

In [None]:
gs_xgb=GridSearchCV(xgb,param_grid={
    'n_estimator':[50,],'gamma': np.arange(0.1,0.8,0.1),'learning_rate':[0.3,],'max_depth': [6,]
},
                    verbose=1000,
                #n_jobs=-1,
                scoring='neg_root_mean_squared_error')

In [None]:
gs_xgb.fit(X_t.values,y_t)

In [None]:
gs_xgb.best_score_

In [None]:
gs_xgb.best_params_

In [None]:
xgb=XGBRegressor(n_jobs=-1,n_estimators=100,objective='reg:squarederror',**gs_xgb.best_params_
                )

In [None]:
xgb.fit(X_t.values,y_t)

In [None]:
mean_squared_error(y_e,xgb.predict(X_e),squared=False)

### PassiveAggressive

We finally test a PassiveAggressive regressor, which brings the lowest error and is pretty fast, so we can explore the regularization parameter.

In [None]:
pa= PassiveAggressiveRegressor(C=0.0001,loss= 'squared_epsilon_insensitive',verbose=10,tol=0.0001,random_state=42)

In [None]:
pa.fit(X_t,y_t)

In [None]:
mean_squared_error(y_e,pa.predict(X_e),squared=False)

In [None]:
pa.get_params()

In [None]:
# 10-fold grid search over C, from 1e-5 to 3e-5 in steps of 1e-6 (21 values), scored with
# the negated RMSE; 'verbose':[0] silences the estimator's own training log during the search
gs_pa=GridSearchCV(pa,param_grid={
    'C':[1e-5+i*10.**(-6) for i in np.arange(0,21,)],'verbose':[0]},
                    verbose=2,
                   cv=10,
                #n_jobs=-1,
                scoring='neg_root_mean_squared_error')

In [None]:
gs_pa.fit(X_t,y_t)

In [None]:
# Best cross-validated score (negated RMSE)
gs_pa.best_score_

In [None]:
# Parameter combination that reached it
gs_pa.best_params_

In [None]:
pa=PassiveAggressiveRegressor(loss='squared_epsilon_insensitive',tol=0.0001,random_state=42,**gs_pa.best_params_)

In [None]:
pa.fit(X_t,y_t)

In [None]:
#We can see that it's the most efficient algorithm we've tested
mean_squared_error(y_e,pa.predict(X_e),squared=False)

The model we'll be using for submission is the PassiveAggressive regressor.