# Readability Regressor

## Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import joblib
import os
from pathlib import Path
import sys

from sklearn.metrics import mean_squared_error

## Define Files and Directories

In [None]:
# Input files: all three CSVs sit directly under the same dataset directory.
data_dir = Path('../input/commonlitreadabilityprize')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')



In [None]:
# Data files for readability formulas
# Plain-text "familiar word" lists, one file per formula (Dale-Chall and
# Spache). These are kept as plain strings rather than Path objects.
dalechall_file = '../input/the-new-dalechall-familiar-words-list/DaleChallEasyWordList.txt'
spache_file = '../input/spache-familiar-words-list/SpacheFamiliarWordList.txt'


In [None]:
# Load both splits (train first, then test) using the `id` column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (train_file, test_file)
)
print(f'Train size:{train_df.shape}. Test size:{test_df.shape}')
train_df.head()

## Calculate Classic Metrics for Readability
Based on:

https://clickhelp.com/software-documentation-tool/user-manual/readability-metrics.html

https://readabilityformulas.com/

### Common Classic Metrics:
Most readability formulas (**Fry, Flesch-Kincaid, Gunning Fog, and others**) are based on a weighted combination of:
- ASL: Average sentence length (number of words in a sentence, excluding numbers).
- ASW: Average number of syllables per word.

Some other formulas (**SMOG, Linsear Write**) are similar to the above, but instead of the average word length they count the number of long words (polysyllables: words with 3 or more syllables). Some simpler formulas use the average number of characters per word instead of syllables.

### Formulas based on list of common words:

**The New Dale-Chall Formula** and the **Spache Readability Formula** use a predefined list of "common" words: they combine the ratio of "unfamiliar" or "difficult" words (PDW) with the number of words per sentence (ASL).

Difficult words in the Dale-Chall formula are words that do not appear on a specially designed list of around 3000 common words familiar to most 4th-grade students (https://readabilityformulas.com/articles/dale-chall-readability-word-list.php).

**The Spache Formula** considers “unfamiliar words” to be words that students in 3rd grade and below do not recognize. The Spache list comprises 925 familiar words (https://readabilityformulas.com/articles/spache-formula-word-list.php).

The Spache Formula is best used to calculate the difficulty of text that falls at the 3rd grade level or below, while the New Dale-Chall Formula is best used for texts that fall at the 4th grade level or above.

**Words considered "familiar"**:

- Words appearing on the common words list.

- Variants of words appearing on the list that have regular verb form endings – ing, -ed, -es.

- Plural and Possessive endings of nouns from the list.

- First Names.

- Single letters standing alone as words. E.g., ‘C is the third letter of the alphabet.’

**‘Difficult Words’:**

- Words not appearing on the "familiar" list or its variants as specified above.

- Variants of words appearing on the list that have irregular verb form endings – unless those variant forms also appear on the list.

- Variants of words appearing on the list that have adverbial, comparative, or superlative endings – ly, -er, -est.

- Words not appearing on the list are counted only once, even if they appear later with other endings.


**Bottom line Features Extraction:**

We'll extract the following features from the text:

- ASL: Average Sentence Length.
- ASW: Average Syllables per Word.
- ROP: Ratio Of Polysyllables (share of words with 3 or more syllables among all words)
- RDC: Ratio of Dale-Chall difficult words
- ROS: Ratio Of Spache difficult words

In [None]:
import spacy

try:
    import syllapy
except ImportError as e:
    !pip install syllapy -q
    import syllapy
    
nlp = spacy.load('en_core_web_sm')

In [None]:
# Each word list is a whitespace-separated text file. Lower-case the whole
# file first, then split it into individual words.
dalechall_list = Path(dalechall_file).read_text().lower().split()
spache_list = Path(spache_file).read_text().lower().split()

In [None]:
# ASL: Average Sentence Length.
# ASW: Average Syllables per Word.
# ROP: Ratio Of Polysyllables (share of words with 3 or more syllables among all words)
# RDC: Ratio of Dale-Chall difficult words
# ROS: Ratio Of Spache difficult words

def count_difficult_words(word, easy_words_list, difficult_words_dict):
    """Tally `word` in `difficult_words_dict` unless it counts as familiar.

    A token is treated as familiar, and skipped, when any of these holds:
    its entity type is ``'PERSON'``, its lower-cased text is in
    `easy_words_list`, or its lower-cased lemma is in `easy_words_list`.
    Otherwise the count stored under the lower-cased lemma is incremented,
    so different surface forms with the same lemma share one key.

    Args:
        word: Token-like object exposing ``text``, ``lemma_`` and
            ``ent_type_`` attributes.
        easy_words_list: Container of lower-cased familiar words; only
            ``in`` membership tests are performed on it.
        difficult_words_dict: Mapping of lemma -> occurrence count. It is
            updated in place.

    Returns:
        The same `difficult_words_dict` object that was passed in.
    """
    word_lemma = word.lemma_.lower()
    is_familiar = (
        word.ent_type_ == 'PERSON'
        or word.text.lower() in easy_words_list
        or word_lemma in easy_words_list
    )
    if not is_familiar:
        # Only a missing key defaults to 0; any other failure (for example a
        # non-numeric stored value) propagates instead of being swallowed.
        difficult_words_dict[word_lemma] = difficult_words_dict.get(word_lemma, 0) + 1
    return difficult_words_dict

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def extract_readability_features(in_df):
    """Return a copy of `in_df` with readability feature columns appended.

    Every value of the ``excerpt`` column is parsed with the module-level
    ``nlp`` pipeline. Only tokens with ``token.is_alpha`` count as words.
    Columns added, in this order:

    - ``ASL``: words per sentence.
    - ``ASW``: syllables per word (syllables from ``syllapy.count``).
    - ``ROP``: fraction of words with 3 or more syllables.
    - ``RDC``: number of distinct keys collected by
      ``count_difficult_words`` against the Dale-Chall list, divided by
      the word count.
    - ``ROS``: the same ratio against the Spache list.
    - ``num_sentences``: number of sentences in the excerpt.
    - ``total_words``: number of alphabetic tokens in the excerpt.

    A ratio whose denominator is zero (an excerpt with no sentences or no
    alphabetic tokens) is reported as 0.0 rather than raising
    ``ZeroDivisionError``.

    Args:
        in_df: DataFrame with an ``excerpt`` column of text. Not modified.

    Returns:
        A new DataFrame: a copy of `in_df` plus the columns listed above.
    """
    df = in_df.copy()

    # Build the lookup sets once per call: each word list is probed up to
    # twice per token, and set membership avoids a linear scan of the list.
    dale_words = set(dalechall_list)
    spache_words = set(spache_list)

    # Insertion order here is the order in which columns are added to df.
    features = {
        'ASL': [],
        'ASW': [],
        'ROP': [],
        'RDC': [],
        'ROS': [],
        'num_sentences': [],
        'total_words': [],
    }

    for txt in df.excerpt:
        doc = nlp(txt)
        sents = sum(1 for _ in doc.sents)  # Number of sentences
        difficult_words_dale = {}
        difficult_words_spach = {}
        word_count = 0
        syllables_count = 0
        polysyllables_count = 0

        for token in doc:
            if not token.is_alpha:
                continue  # punctuation, numbers, whitespace are not words
            word_count += 1
            syllables = syllapy.count(token.text)
            syllables_count += syllables
            if syllables >= 3:
                polysyllables_count += 1
            difficult_words_dale = count_difficult_words(
                token, dale_words, difficult_words_dale
            )
            difficult_words_spach = count_difficult_words(
                token, spache_words, difficult_words_spach
            )

        features['ASL'].append(_safe_ratio(word_count, sents))
        features['ASW'].append(_safe_ratio(syllables_count, word_count))
        features['ROP'].append(_safe_ratio(polysyllables_count, word_count))
        # len() of the dicts: each difficult word is counted once per excerpt,
        # however often it occurs.
        features['RDC'].append(_safe_ratio(len(difficult_words_dale), word_count))
        features['ROS'].append(_safe_ratio(len(difficult_words_spach), word_count))
        features['num_sentences'].append(sents)
        features['total_words'].append(word_count)

    for column, values in features.items():
        df[column] = values
    return df


In [None]:
# Append the readability feature columns to the training split and preview it.
train_df_new = train_df.pipe(extract_readability_features)
train_df_new.head()

In [None]:
# Same feature extraction for the test split, followed by a preview.
test_df_new = test_df.pipe(extract_readability_features)
test_df_new.head()

In [None]:
# The ten training rows with the lowest target values.
train_df_new.sort_values('target').iloc[:10]

In [None]:
# The ten training rows with the highest target values.
train_df_new.sort_values('target').iloc[-10:]

In [None]:
# Plot pairwise relationships between the target and the readability features
from matplotlib import pyplot as plt
import seaborn as sns

# Columns shown in the grid; the diagonal panels use KDE curves.
pair_columns = ['target', 'ASL', 'ASW', 'ROP', 'RDC', 'ROS']
g = sns.pairplot(train_df_new, vars=pair_columns, diag_kind="kde")
# Overlay 4-level density contours on the lower-triangle panels.
g.map_lower(sns.kdeplot, levels=4, color=".2")

In [None]:
# Usually a low target (= high complexity) goes together with longer words,
# but some excerpts use relatively short words (low ROP) that are still
# considered complex (high ROS, RDC). Count the excerpts with ROP < 0.05 and
# target < -2.5.
short_but_hard = (train_df_new['target'] < -2.5) & (train_df_new['ROP'] < 0.05)
len(train_df_new[short_but_hard])

In [None]:
# Print every excerpt with ROP < 0.05 and target < -2.5, each followed by a
# dashed separator line.
low_rop = train_df_new['ROP'] < 0.05
low_target = train_df_new['target'] < -2.5
for excerpt in train_df_new.loc[low_rop & low_target, 'excerpt']:
    print(excerpt)
    print('--------')