## CommonLit Readability - Baseline

In this competition, we predict the readability score (the reading ease) of excerpts from literature. Text readability is best defined as the ease with which a text can be read and understood, in terms of the linguistic features found within it. The task is therefore to build algorithms that rate the complexity of reading passages for grade 3-12 classroom use.

This post can be viewed as a continuation of my previous notebook here - https://www.kaggle.com/sharrpie/commonlit-readability-eda-fe-topic-modelling. I'd highly recommend taking a look at it to get the most out of this notebook.
In this post, we primarily consolidate the observations drawn from the EDA and build upon them by developing a baseline for readability score prediction. So, without further ado, let's get started! While doing so, we focus on writing clean and reusable code whose components you can plug directly into other competitions. The code is inspired by the great @abhishekthakur! The entire structured code can be found at: https://github.com/SharvilN/Common-Readability-Prize

## Import Libraries

In [None]:
!pip install textacy --no-index --find-links=file:///kaggle/input/textacy/textacy > /dev/null
!pip install ../input/spacy-3-1-0/dist/en_core_web_lg-3.1.0.tar > /dev/null

In [None]:
import re
import os
import spacy
import joblib
import textacy
import collections
import numpy as np
import pandas as pd
import lightgbm as lgb

from enum import Enum, auto
from typing import List, Optional

from textacy import preprocessing
from textacy import text_stats
from textacy.text_stats import readability

from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn import pipeline
from sklearn import decomposition
from sklearn import preprocessing as skpreprocessing
from sklearn import svm
from sklearn import model_selection

## Text Preprocessor

In [None]:
class TextPreprocessor:
    """Clean text with regex substitutions followed by textacy normalizers.

    Args:
        pipelines: Names of the normalizers to enable. ``"unicode"`` and
            ``"whitespace"`` are recognised; any other entry is ignored.
        remove_chars: Regex patterns whose matches are replaced by a single
            space before normalization. ``None`` (the default) means no
            substitutions are applied.
    """

    def __init__(self, pipelines: List, remove_chars: Optional[List] = None):
        self.pipelines = list()
        if "unicode" in pipelines:
            self.pipelines.append(preprocessing.normalize.unicode)
        if "whitespace" in pipelines:
            self.pipelines.append(preprocessing.normalize.whitespace)
        self.preprocessor = preprocessing.make_pipeline(*self.pipelines)

        self.remove_chars = remove_chars

    def run(self, text):
        """Return ``text`` after pattern removal and normalization."""
        # ``remove_chars`` is optional, so treat None as "nothing to remove"
        # instead of failing when iterating over it.
        for pattern in self.remove_chars or ():
            text = re.sub(pattern, ' ', text)
        return self.preprocessor(text)

## Feature Generator

In [None]:
class FeatureGenerator:
    """Derive hand-crafted readability features from a frame of excerpts.

    The input frame must contain an ``excerpt`` column (or, when
    ``preprocess`` is False, an existing ``preprocessed_excerpt`` column).
    Features are written to ``self.output``, a deep copy of the input.

    Args:
        df: Frame holding the text to featurise.
        spacy_model: Callable spaCy pipeline used for the syntactic, POS and
            vector features.
        preprocess: Whether to build ``preprocessed_excerpt`` from ``excerpt``
            before extracting features.
    """

    def __init__(self, df: pd.DataFrame, spacy_model=None, preprocess: bool=True) -> None:
        self.raw = df
        self.output = self.raw.copy(deep=True)
        self.nlp = spacy_model
        self.preprocess = preprocess

    def extract_features(self):
        """Run every extraction step in order and return the feature frame."""
        if self.preprocess:
            self._preprocess()
        self._extract_traditional_features()
        self._extract_syntatic_features()
        self._extract_pos_tag_features()
        self._spacy_vectors()

        return self.output

    def _as_doc(self, paragraph):
        """Parse ``paragraph`` with the spaCy model unless it is already parsed."""
        return self.nlp(paragraph) if isinstance(paragraph, str) else paragraph

    def _preprocess(self):
        """Add a ``preprocessed_excerpt`` column to both ``raw`` and ``output``."""
        print('Preprocessing data...')
        preprocessor = TextPreprocessor(pipelines=["unicode", "whitespace"], 
                                       remove_chars = ["\n"])
        # NOTE: this also adds the column to the frame the caller passed in.
        self.raw.loc[:, 'preprocessed_excerpt'] = self.raw['excerpt'].apply(preprocessor.run)
        self.output.loc[:, 'preprocessed_excerpt'] = self.raw['preprocessed_excerpt']

    def _extract_traditional_features(self):
        """Add count, ratio and readability-index features for every row."""
        print('Extracting traditional features...')
        for idx, row in self.raw.iterrows():
            # NOTE(review): the pipeline name is hard-coded here rather than
            # taken from ``self.nlp`` - confirm both are meant to be the same model.
            doc = textacy.make_spacy_doc(row['preprocessed_excerpt'], lang='en_core_web_lg')
            ts = text_stats.TextStats(doc)
            features = {
                'n_sents': ts.n_sents,
                'n_words': ts.n_words,
                'n_words_per_sent': ts.n_words / ts.n_sents,
                'n_unique_words': ts.n_unique_words,
                'n_unique_words_per_sent': ts.n_unique_words / ts.n_sents,
                'n_chars_per_word': ts.n_chars / ts.n_words,
                'n_syllables': ts.n_syllables,
                'n_syllables_per_word': ts.n_syllables / ts.n_words,
                'n_syllables_per_sent': ts.n_syllables / ts.n_sents,
                'n_monosyllable_words': ts.n_monosyllable_words,
                'n_polysyllable_words': ts.n_polysyllable_words,
                'n_long_words': ts.n_long_words,
                'n_long_words_ratio': ts.n_long_words / ts.n_words,
                'entropy': ts.entropy,
                # Length (in tokens) of the longest sentence.
                'n_longest_sent': max(len(sent) for sent in doc.sents),
                'automated_readability_index':
                    readability.automated_readability_index(ts.n_chars, ts.n_words, ts.n_sents),
                'coleman_liau_index':
                    readability.coleman_liau_index(ts.n_chars, ts.n_words, ts.n_sents),
                'flesch_kincaid_grade_level':
                    readability.flesch_kincaid_grade_level(ts.n_syllables, ts.n_words, ts.n_sents),
                'flesch_reading_ease':
                    readability.flesch_reading_ease(ts.n_syllables, ts.n_words, ts.n_sents),
                # LIX is defined on the word count, not the syllable count.
                'lix':
                    readability.lix(ts.n_words, ts.n_long_words, ts.n_sents),
                'smog_index':
                    readability.smog_index(ts.n_polysyllable_words, ts.n_sents),
                'gunning_fog_index':
                    readability.gunning_fog_index(ts.n_words, ts.n_polysyllable_words, ts.n_sents),
            }
            # Cell-wise assignment keeps the column order of the dict above.
            for name, value in features.items():
                self.output.loc[idx, name] = value

    def _extract_syntatic_features(self):
        """Add parse-tree and noun-chunk features, parsing each excerpt once."""
        print('Extracting Syntactic features...')

        def tree_height(root):
            # A token without children is a tree of height 1.
            children = list(root.children)
            if not children:
                return 1
            return 1 + max(tree_height(child) for child in children)

        def count_subtrees(root):
            # Counts the tokens in the tree that have at least one child.
            children = list(root.children)
            if not children:
                return 0
            return 1 + sum(count_subtrees(child) for child in children)

        def syntactic_stats(paragraph):
            doc = self._as_doc(paragraph)
            roots = [sent.root for sent in doc.sents]
            chunk_sizes = [len(chunk) for chunk in doc.noun_chunks]
            return (
                np.mean([tree_height(root) for root in roots]),
                np.mean([count_subtrees(root) for root in roots]),
                len(chunk_sizes),
                np.mean(chunk_sizes),
            )

        # One parse per excerpt feeds all four per-document statistics.
        stats = [syntactic_stats(text) for text in self.output.preprocessed_excerpt]
        columns = ['avg_parse_tree_height', 'mean_parse_subtrees', 'noun_chunks', 'noun_chunk_size']
        for position, column in enumerate(columns):
            self.output[column] = [row[position] for row in stats]

        self.output['avg_noun_chunks'] = self.output['noun_chunks'] / self.output['n_sents']
        self.output['mean_noun_chunk_size'] = self.output['noun_chunk_size'] / self.output['avg_noun_chunks']

    def _extract_pos_tag_features(self):
        """Add the share of tokens carrying each selected coarse POS tag."""
        print('Extracting POS Tag features...')
        tag_by_column = {
            'nouns_per_word': 'NOUN',
            'proper_nouns_per_word': 'PROPN',
            'pronouns_per_word': 'PRON',
            'adj_per_word': 'ADJ',
            'adv_per_word': 'ADV',
            'verbs_per_word': 'VERB',
            'cconj_per_word': 'CCONJ',
            'sconj_per_word': 'SCONJ',
        }

        def pos_frequencies(paragraph):
            doc = self._as_doc(paragraph)
            pos_counter = collections.Counter(token.pos_ for token in doc)
            total_pos_counts = sum(pos_counter.values())
            return [pos_counter[tag] / total_pos_counts for tag in tag_by_column.values()]

        # One parse per excerpt yields the ratios for every tag at once.
        freqs = [pos_frequencies(text) for text in self.raw.preprocessed_excerpt]
        for position, column in enumerate(tag_by_column):
            self.output[column] = [row[position] for row in freqs]

    def get_col_names(self, prefix: str, count: int):
        """Return ``count`` column names of the form ``<prefix>_<i>``."""
        return [f"{prefix}_{i}" for i in range(count)]

    def _spacy_vectors(self):
        """Add one ``spacy_<i>`` column per dimension of each document vector."""
        print('Extracting Spacy vectors...')
        # NOTE(review): disable_pipes() is called without pipe names - confirm
        # whether any component was meant to be disabled here.
        with self.nlp.disable_pipes():
            vectors = np.array([self.nlp(text).vector for text in self.output.preprocessed_excerpt])
            cols = self.get_col_names('spacy', len(vectors[0]))
            self.output[cols] = vectors

In [None]:
# Featurise the training set once and cache the result as CSV for later cells.
train_csv = "../input/commonlitreadabilityprize/train.csv"
df = pd.read_csv(train_csv)
nlp_lg = spacy.load("en_core_web_lg")
fe = FeatureGenerator(df, spacy_model=nlp_lg)
df_features = fe.extract_features()
df_features.to_csv("./train_features.csv", index=False)

## Cross Validation Framework

In [None]:
class ProblemType(Enum):
    """Kinds of learning problem that ``CrossValidation`` can split for."""

    # No trailing commas: a trailing comma would make the value a tuple
    # wrapping ``auto()`` instead of an auto-numbered integer.
    BINARY = auto()
    MULTICLASS = auto()
    REGRESSION = auto()
    MULTI_COL_REGRESSION = auto()
    HOLDOUT = auto()
    MULTILABEL = auto()
        
class CrossValidation:
    """Assign every row of a frame to a validation fold via a ``kfold`` column.

    The frame passed in is never modified; the class works on its own
    (optionally shuffled) copy, available as ``self.df``.

    Args:
        df: Frame to split.
        target_cols: Names of the target column(s).
        shuffle: Whether to shuffle the rows (and reset the index) first.
        problem_type: A ``ProblemType`` member selecting the split strategy.
        holdout_pct: Percentage of rows placed in fold 0 for HOLDOUT splits.
        n_folds: Number of folds for the stratified strategies.
        multilabel_delimiter: Separator between labels for MULTILABEL targets.
        random_state: Seed used for the shuffle so splits are reproducible.
    """

    def __init__(
            self,
            df,
            target_cols,
            shuffle,
            problem_type,
            holdout_pct=None,
            n_folds=5,
            multilabel_delimiter=',',
            random_state=31
        ):
        self.target_cols = target_cols
        self.n_targets = len(self.target_cols)
        self.problem_type = problem_type
        self.shuffle = shuffle
        self.n_folds = n_folds
        self.holdout_pct = holdout_pct
        self.multilabel_delimiter = multilabel_delimiter
        self.random_state = random_state

        if self.shuffle:
            # Seed the shuffle so the same random_state gives the same folds.
            self.df = df.sample(frac=1, random_state=self.random_state).reset_index(drop=True)
        else:
            # Work on a copy so the caller's frame does not gain a kfold column.
            self.df = df.copy()
        self.df['kfold'] = -1

    def _assign_stratified_folds(self, y):
        """Fill ``kfold`` using StratifiedKFold stratified on ``y``."""
        kf = model_selection.StratifiedKFold(n_splits=self.n_folds)
        # The splitter yields positional indices, so write through iloc; this
        # stays correct even when the index is not a default RangeIndex.
        kfold_pos = self.df.columns.get_loc('kfold')
        for fold, (train_idx, valid_idx) in enumerate(kf.split(X=self.df, y=y)):
            print(len(train_idx), len(valid_idx))
            self.df.iloc[valid_idx, kfold_pos] = fold

    def split(self):
        """Populate the ``kfold`` column and return the resulting frame.

        Raises:
            ValueError: If the number of targets does not fit the problem
                type, ``holdout_pct`` is missing for a HOLDOUT split, or the
                problem type is not recognised.
        """
        if self.problem_type in (ProblemType.BINARY, ProblemType.MULTICLASS):
            if self.n_targets != 1:
                raise ValueError(f'Invalid number of targets {self.n_targets} for selected problem type: {self.problem_type}')

            target = self.target_cols[0]
            self._assign_stratified_folds(self.df[target].values)

        elif self.problem_type in (ProblemType.REGRESSION, ProblemType.MULTI_COL_REGRESSION):
            if self.n_targets != 1 and self.problem_type == ProblemType.REGRESSION:
                raise ValueError(f'Invalid combination of number of targets {self.n_targets} and problem type {self.problem_type}')
            if self.n_targets < 2 and self.problem_type == ProblemType.MULTI_COL_REGRESSION:
                raise ValueError(f'Invalid combination of number of targets {self.n_targets} and problem type {self.problem_type}')

            # Only the first target column drives the stratification.
            target = self.target_cols[0]

            # Number of bins by Sturges' rule (floored).
            num_bins = int(np.floor(1 + np.log2(len(self.df))))

            # Stratify on binned targets; the bins never enter the frame.
            bins = pd.cut(self.df[target], bins=num_bins, labels=False)
            self._assign_stratified_folds(bins.values)

        elif self.problem_type == ProblemType.HOLDOUT:
            if self.holdout_pct is None:
                raise ValueError('holdout_pct must be provided for problem type HOLDOUT')
            n_holdout_samples = int(len(self.df) * self.holdout_pct / 100)
            # The first n_holdout_samples rows form fold 0, the rest fold 1.
            kfold_pos = self.df.columns.get_loc('kfold')
            self.df.iloc[:n_holdout_samples, kfold_pos] = 0
            self.df.iloc[n_holdout_samples:, kfold_pos] = 1
            print(n_holdout_samples)

        elif self.problem_type == ProblemType.MULTILABEL:
            if self.n_targets != 1:
                raise ValueError(f'Invalid combination of number of targets {self.n_targets} and problem type {self.problem_type}')

            # Stratify on the number of labels attached to each row.
            targets = self.df[self.target_cols[0]].apply(lambda x: len(x.split(self.multilabel_delimiter)))
            print(targets.value_counts())
            self._assign_stratified_folds(targets)

        else:
            raise ValueError(f'Invalid problem type found : {self.problem_type}')
        return self.df

## Creating Folds

In [None]:
def create_folds(data_path: str, output_path: str, target_cols: List[str], problem_type: ProblemType,  n_folds: Optional[int] = 5) -> None:
    """Read a CSV, assign cross-validation folds and write the result to disk.

    Args:
        data_path: CSV file to read.
        output_path: Destination CSV; it gains a ``kfold`` column.
        target_cols: Target column name(s) handed to ``CrossValidation``.
        problem_type: Split strategy to use.
        n_folds: Number of folds.
    """
    print(f'Target columns : {target_cols}')
    print(f'Reading data from {data_path}')
    df = pd.read_csv(data_path)
    cv = CrossValidation(df,
                        target_cols,
                        n_folds=n_folds,
                        shuffle=True,
                        problem_type=problem_type)

    print('Generating folds...')
    df_split = cv.split()
    print(df_split.head())
    print(df_split.tail())
    # Summarise the configured target column(s) per fold rather than a
    # hard-coded 'target' column, so other datasets work as well.
    print(df_split.groupby(by=['kfold'])[list(target_cols)].median())

    print(f'Saving train folds to {output_path}')
    df_split.to_csv(output_path, index=False)

# Split the cached training features into stratified regression folds.
create_folds(
    data_path="./train_features.csv",
    output_path="./trainfe_folds.csv",
    target_cols=["target"],
    problem_type=ProblemType.REGRESSION,
)

## Model Dispatcher

In [None]:
def _polynomial_pipeline(estimator):
    """Wrap ``estimator`` behind a default PolynomialFeatures expansion."""
    return pipeline.Pipeline([('poly', skpreprocessing.PolynomialFeatures()),
                              ('fit', estimator)])


# Registry of candidate regressors, looked up by name.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the pinned version before upgrading.
# NOTE(review): the first step of RIDGE_PIPE is named 'poly' although it is a
# StandardScaler; the name is kept so existing parameter paths stay valid.
MODELS = dict(
    RFR=ensemble.RandomForestRegressor(),
    ETR=ensemble.ExtraTreesRegressor(),
    RIDGE=linear_model.Ridge(alpha=0.5, solver='auto'),
    LASSO=linear_model.Lasso(normalize=True),
    OLS=linear_model.LinearRegression(normalize=True, fit_intercept=False),
    SVM=svm.SVR(),
    RIDGE_PIPE=pipeline.Pipeline([('poly', skpreprocessing.StandardScaler()),
                                  ('fit', linear_model.Ridge(alpha=1))]),
    LASSO_PIPE=_polynomial_pipeline(linear_model.Lasso()),
    OLS_PIPE=_polynomial_pipeline(linear_model.LinearRegression()),
)

# Hyper-parameter grids keyed by the same names as MODELS; the `fit__` prefix
# addresses the final pipeline step.
PARAMS = dict(
    RIDGE_PIPE=dict(fit__alpha=[550, 580, 600, 620, 650]),
    LASSO_PIPE=dict(fit__alpha=[0.005, 0.02, 0.03, 0.05, 0.06]),
)

## Model Training

In [None]:
def rmse(targets, preds):
    """Return the root of the mean squared error between targets and preds."""
    mse = metrics.mean_squared_error(targets, preds)
    return np.sqrt(mse)

In [None]:
def train(train_path, fold, store_model_at, model):
    """Fit one model on all folds but ``fold`` and validate on ``fold``.

    Args:
        train_path: CSV with features, ``target`` and a ``kfold`` column.
        fold: Fold id held out for validation.
        store_model_at: Directory in which the fitted model is pickled.
        model: Key into ``MODELS`` selecting the estimator.

    Returns:
        The validation RMSE for this fold.

    Raises:
        ValueError: If no row belongs to the requested validation fold.
    """
    # Local import keeps this cell self-contained; scikit-learn is already
    # a dependency of the notebook.
    from sklearn.base import clone

    df = pd.read_csv(train_path)
    # Named *_df so the locals do not shadow this function's own name.
    train_df = df[df.kfold != fold]
    val_df = df[df.kfold == fold]
    if val_df.empty:
        raise ValueError(f'No rows found for validation fold {fold}')

    ytrain = train_df.target.values
    yval = val_df.target.values

    # Everything that is not a numeric feature is removed before fitting.
    cols_to_drop = ['id', 'url_legal', 'license', 'excerpt', 'standard_error', 'target', 'kfold',
                   'preprocessed_excerpt']
    xtrain = train_df.drop(cols_to_drop, axis=1)
    xval = val_df.drop(cols_to_drop, axis=1)

    # Fit a fresh copy so the shared template in MODELS is never mutated and
    # each fold gets an independent estimator.
    regressor = clone(MODELS[model])

    regressor.fit(xtrain, ytrain)
    preds = regressor.predict(xval)

    rmse_loss = rmse(yval, preds)
    print(f'Loss for fold={fold} : rmse={rmse_loss}')

    joblib.dump(regressor, f'{store_model_at}/{model}_{fold}.pkl')
    return rmse_loss
    

# Fit and persist one Ridge model per validation fold (folds 0 to 4).
for fold in range(5):
    train("./trainfe_folds.csv", fold, ".", "RIDGE")


## Inference

In [None]:
# Featurise the test set the same way as the training set and cache it as CSV.
test_csv = "../input/commonlitreadabilityprize/test.csv"
test_df = pd.read_csv(test_csv)
nlp_lg = spacy.load("en_core_web_lg")
fe_test = FeatureGenerator(test_df, spacy_model=nlp_lg)
test_features = fe_test.extract_features()
test_features.to_csv("./test_features.csv", index=False)

In [None]:
def predict(test_path: str, model_dir: str, model_prefix: str, output_path: str = "submission.csv"):
    """Average the predictions of every saved fold model and write a submission.

    Args:
        test_path: CSV of test features, including an ``id`` column.
        model_dir: Directory searched recursively for pickled models.
        model_prefix: Model name; files named ``<model_prefix>_<fold>.pkl``
            are loaded.
        output_path: Where the submission CSV is written.

    Returns:
        The submission frame with ``id`` and ``target`` columns.

    Raises:
        FileNotFoundError: If no matching model file exists under ``model_dir``.
    """
    test_df = pd.read_csv(test_path)
    cols_to_drop = ['id', 'url_legal', 'license', 'excerpt', 'preprocessed_excerpt']
    test = test_df.drop(cols_to_drop, axis=1)

    # Match exactly "<prefix>_<fold>.pkl" so that e.g. "RIDGE" does not also
    # pick up "RIDGE_PIPE_0.pkl" or unrelated files containing the prefix.
    model_file = re.compile(rf"{re.escape(model_prefix)}_[^_]+\.pkl")
    model_paths = sorted(
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(model_dir)
        for filename in filenames
        if model_file.fullmatch(filename)
    )
    if not model_paths:
        # Averaging zero predictions would silently produce a NaN submission.
        raise FileNotFoundError(
            f"No model files matching '{model_prefix}_<fold>.pkl' found under '{model_dir}'"
        )

    # NOTE: joblib.load unpickles arbitrary objects; only point model_dir at
    # model files you trust.
    models = [joblib.load(path) for path in model_paths]

    preds = [model.predict(test) for model in models]
    preds = np.mean(preds, axis=0)
    submission = pd.DataFrame({"id": test_df["id"], "target": preds})
    submission.to_csv(output_path, index=False)
    return submission

predict("./test_features.csv", ".", "RIDGE")