# readability of text using popular readability formulas and metrics

In [None]:
import numpy as np 
import pandas as pd 
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings

# Location of a local copy of py-readability-metrics. It is appended to
# sys.path before the `readability` import below, presumably because the
# package is not installed in the environment -- confirm against the dataset.
readability_path = "../input/commonlitread/py-readability-metrics"
import sys
sys.path.append(readability_path)
from readability import Readability
# Register tqdm's pandas integration; `progress_apply` is used later in this file.
tqdm.pandas()

# Load the training set, order it by ascending `target`, and renumber the
# rows from 0 so positional and label indexing agree.
train = (
    pd.read_csv('../input/commonlitreadabilityprize/train.csv')
    .sort_values("target")
    .reset_index(drop=True)
)

In [None]:
def read_metric(txt):
    """Print a report of every readability metric computed for one text.

    For each formula a heading line is printed, followed by a detail line
    with the score and the grade level(s) read from the result object
    (Flesch Reading Ease additionally prints its ``ease`` field).

    Args:
        txt: Text handed to ``Readability``.

    Returns:
        None. The report is written to standard output.
    """
    analyzer = Readability(txt)

    def single_level(m):
        # Detail line for results read through ``grade_level``.
        return f"\t score : {m.score}  \t grade_levels : {m.grade_level}"

    def multi_level(m):
        # Detail line for results read through ``grade_levels``.
        return f"\t score : {m.score}  \t grade_levels : {m.grade_levels}"

    # (heading, name of the metric method, formatter of the detail line),
    # listed in print order. Methods are looked up lazily by name so each
    # metric is resolved and computed only when its turn comes.
    sections = (
        (
            "Flesch Kincaid Grade Level",
            "flesch_kincaid",
            lambda m: f"\t score : {m.score} \t grade_levels : {m.grade_level}",
        ),
        (
            "Flesch Reading Ease",
            "flesch",
            lambda m: f"\t score : {m.score} \t grade_levels : {m.grade_levels} \t ease : {m.ease} ",
        ),
        ("Dale Chall Readability", "dale_chall", multi_level),
        ("Automated Readability Index (ARI)", "ari", multi_level),
        ("Coleman Liau Index", "coleman_liau", single_level),
        ("Gunning Fog", "gunning_fog", single_level),
        ("Spache", "spache", single_level),
        ("Linsear Write", "linsear_write", single_level),
    )

    for heading, method_name, render in sections:
        metric = getattr(analyzer, method_name)()
        print(heading)
        print(render(metric))
    
# https://github.com/cdimascio/py-readability-metrics
def get_read_metric(txt):
    """Collect readability scores and grade levels for one text as a flat dict.

    Args:
        txt: Text handed to ``Readability``.

    Returns:
        dict holding a ``<prefix>_score`` and a ``<prefix>_level`` entry for
        each of the eight metrics, in the order ``fk``, ``f``, ``dc``,
        ``ari``, ``cl``, ``gf``, ``s``, ``lw``.
    """
    analyzer = Readability(txt)

    # (score key, level key, metric method, attribute holding the level).
    # Flesch, Dale-Chall and ARI results are read through ``grade_levels``;
    # the remaining ones through ``grade_level``.
    layout = (
        ("fk_score", "fk_level", "flesch_kincaid", "grade_level"),
        ("f_score", "f_level", "flesch", "grade_levels"),
        ("dc_score", "dc_level", "dale_chall", "grade_levels"),
        ("ari_score", "ari_level", "ari", "grade_levels"),
        ("cl_score", "cl_level", "coleman_liau", "grade_level"),
        ("gf_score", "gf_level", "gunning_fog", "grade_level"),
        ("s_score", "s_level", "spache", "grade_level"),
        ("lw_score", "lw_level", "linsear_write", "grade_level"),
    )

    features = {}
    for score_key, level_key, method_name, level_attr in layout:
        outcome = getattr(analyzer, method_name)()
        features[score_key] = outcome.score
        features[level_key] = getattr(outcome, level_attr)
    return features

# Flesch–Kincaid grade level
$$
    0.39\left(\frac{\text{total words}}{\text{total sentences}}\right) + 11.8\left(\frac{\text{total syllables}}{\text{total words}}\right) - 15.59
$$

# Flesch reading ease
$$
    206.835 - 1.015\left(\frac{\text{total words}}{\text{total sentences}}\right) - 84.6\left(\frac{\text{total syllables}}{\text{total words}}\right)
$$

|Score|School level (US)|Notes|
|:-|:-|:-|
|100.00–90.00|5th grade|Very easy to read. Easily understood by an average 11-year-old student.|
|90.0–80.0|6th grade|Easy to read. Conversational English for consumers.|
|80.0–70.0|7th grade|Fairly easy to read.|
|70.0–60.0|8th & 9th grade|Plain English. Easily understood by 13- to 15-year-old students.|
|60.0–50.0|10th to 12th grade|Fairly difficult to read.|
|50.0–30.0|College|Difficult to read.|
|30.0–10.0|College graduate|Very difficult to read. Best understood by university graduates.|
|10.0–0.0|Professional|Extremely difficult to read. Best understood by university graduates.|

# Gunning fog 

$$
    0.4\left[\left(\frac{\text{words}}{\text{sentences}}\right) + 100\left(\frac{\text{complex words}}{\text{words}}\right)\right]
$$

|Fog Index|Reading level by grade|
|:-|:-|
|17|College graduate|
|16|College senior|
|15|College junior|
|14|College sophomore|
|13|College freshman|
|12|High school senior|
|11|High school junior|
|10|High school sophomore|
|9|High school freshman|
|8|Eighth grade|
|7|Seventh grade|
|6|Sixth grade|

# SMOG
$$
    1.0430\sqrt{\text{number of polysyllables} \times \frac{30}{\text{number of sentences}}} + 3.1291
$$

# Dale Chall Readability
$$
    0.1579\left(\frac{\text{difficult words}}{\text{words}} \times 100\right) + 0.0496\left(\frac{\text{words}}{\text{sentences}}\right)
$$

|Score|Notes|
|:-|:-|
|4.9 or lower|easily understood by an average 4th-grade student or lower|
|5.0–5.9|easily understood by an average 5th or 6th-grade student|
|6.0–6.9|easily understood by an average 7th or 8th-grade student|
|7.0–7.9|easily understood by an average 9th or 10th-grade student|
|8.0–8.9|easily understood by an average 11th or 12th-grade student|
|9.0–9.9|easily understood by an average 13th to 15th-grade (college) student|

In [None]:
# Print the readability report for every 1000th row of `train`, preceded by
# the first 50 characters of its excerpt and its target / standard_error.
for row_pos in range(0, len(train), 1000):
    print()
    row = train.iloc[row_pos]
    excerpt = row.excerpt
    print(f"excerpt : {excerpt[:50]}")
    print(f"target : {row.target}, standard_error : {row.standard_error}")
    print()
    read_metric(excerpt)
    print("-" * 20)

In [None]:
# Compute the readability features for every excerpt (one dict per row).
train["metric"] = train.excerpt.progress_apply(get_read_metric)
# Expand the dicts into one column per feature. Building the frame from the
# list of dicts avoids constructing a pd.Series per row.
metric_frame = pd.DataFrame(train["metric"].tolist(), index=train.index)
train = train.join(metric_frame)
# Drop the columns that are not wanted in the correlation matrix.
train = train.drop(
    columns=['url_legal', "standard_error", 'license', "excerpt", "metric", "id"]
)
# The *_level columns are not guaranteed to be numeric (py-readability-metrics
# reports grade levels as strings / lists of strings). Select numeric columns
# explicitly: DataFrame.corr() in pandas >= 2.0 raises on non-numeric columns
# instead of dropping them silently, and select_dtypes works on old and new
# pandas alike.
df_corr = train.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(df_corr, vmax=1, vmin=-1, center=0, ax=ax)