In [None]:
>>> import textstat

>>> test_data = (
    "Playing games has always been thought to be important to "
    "the development of well-balanced and creative children; "
    "however, what part, if any, they should play in the lives "
    "of adults has never been researched that deeply. I believe "
    "that playing games is every bit as important for adults "
    "as for children. Not only is taking time out to play games "
    "with our children and other adults valuable to building "
    "interpersonal relationships but is also a wonderful way "
    "to release built up tension."
)

>>> textstat.flesch_reading_ease(test_data)
>>> textstat.smog_index(test_data)
>>> textstat.flesch_kincaid_grade(test_data)
>>> textstat.coleman_liau_index(test_data)
>>> textstat.automated_readability_index(test_data)
>>> textstat.dale_chall_readability_score(test_data)
>>> textstat.difficult_words(test_data)
>>> textstat.linsear_write_formula(test_data)
>>> textstat.gunning_fog(test_data)
>>> textstat.text_standard(test_data)

In [1]:
test_data = (
    "Playing games has always been thought to be important to "
    "the development of well-balanced and creative children; "
    "however, what part, if any, they should play in the lives "
    "of adults has never been researched that deeply. I believe "
    "that playing games is every bit as important for adults "
    "as for children. Not only is taking time out to play games "
    "with our children and other adults valuable to building "
    "interpersonal relationships but is also a wonderful way "
    "to release built up tension."
)

In [21]:
textstat.flesch_reading_ease(test_data)

52.23

In [1]:
import textstat

In [2]:
from article_process import ArticleLM

In [3]:
# path_to_data = '/Users/stephanie/data/newsela_articles_corpus_2019-03-25_SMALL'
path_to_data = '/Users/stephanie/data/newsela_article_corpus_2016-01-29'
path_to_kenlm = '/Users/stephanie/github/kenlm'
path_to_arpa = path_to_kenlm + '/lm'

In [4]:
articleLM = ArticleLM(path_to_data, path_to_kenlm, path_to_arpa, 5, 'grade_level')

In [6]:
# articleLM.metadata_split[articleLM.metadata_split.train_val_test=='val'].file_path

In [7]:
# filename = articleLM.metadata_split[articleLM.metadata_split.train_val_test=='val'].loc[9,'file_path']

In [13]:
article_scores = articleLM.metadata_split[['slug', 'language', 'grade_level', 'filename',
                                          'train_val_test', 'is_original']].copy()

In [14]:
article_scores = article_scores[(article_scores.language=='en')
                               & (article_scores.grade_level != 10)]

In [15]:
article_scores.head()

Unnamed: 0,slug,language,grade_level,filename,train_val_test,is_original
1,10dollarbill-woman,en,8,10dollarbill-woman.en.1.txt,train,False
2,10dollarbill-woman,en,6,10dollarbill-woman.en.2.txt,val,False
3,10dollarbill-woman,en,5,10dollarbill-woman.en.3.txt,train,False
4,10dollarbill-woman,en,3,10dollarbill-woman.en.4.txt,train,False
6,17century-selfies,en,9,17century-selfies.en.1.txt,train,False


In [16]:
def flesch(filename):
    return textstat.flesch_reading_ease(articleLM.get_article_text(filename))
def flesch_k_grade(filename):
    return textstat.flesch_kincaid_grade(articleLM.get_article_text(filename))
def automated_read(filename):
    return textstat.automated_readability_index(articleLM.get_article_text(filename))

In [17]:
def flesch_map(flesch_score):
    if flesch_score <= 29:
        return 'very_confusing'
    elif flesch_score >= 30 and flesch_score <=49:
        return 'difficult'
    elif flesch_score >= 50 and flesch_score <=59:
        return 'fairly_difficult'
    elif flesch_score >= 60 and flesch_score <=69:
        return 'standard'
    elif flesch_score >= 70 and flesch_score <=79:
        return 'fairly_easy'
    elif flesch_score >= 80 and flesch_score <=89:
        return 'easy'
    else:
        return 'very_easy'

In [19]:
article_scores.loc[:,'flesch'] = list(map(flesch, article_scores.filename))
article_scores.loc[:,'flesch_mapping'] = list(map(flesch_map, article_scores.flesch))
article_scores.loc[:,'flesch_k_grade'] = list(map(flesch_k_grade, article_scores.filename))
article_scores.loc[:,'automated_read'] = list(map(automated_read, article_scores.filename))

In [20]:
article_scores.loc[:,'true_easy'] = [1 if x <= 5 else 0 for x in article_scores.grade_level]
article_scores.loc[:,'flesch_easy'] = [1 if x <= 79 else 0 for x in article_scores.flesch]

In [21]:
article_scores.loc[:,'flesch_k_grade'] = article_scores.flesch_k_grade.astype('int')
article_scores.loc[:,'automated_read'] = article_scores.automated_read.astype('int')
article_scores.loc[:,'flesch_kgrade_easy'] = [1 if x <= 5 else 0 for x in article_scores.flesch_k_grade]
article_scores.loc[:,'auto_read_easy'] = [1 if x <= 5 else 0 for x in article_scores.automated_read]

In [22]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

cm = confusion_matrix(article_scores.true_easy, article_scores.flesch_easy)
cm_fkg = confusion_matrix(article_scores.grade_level, article_scores.flesch_k_grade)
cm_fkg_easy = confusion_matrix(article_scores.true_easy, article_scores.flesch_kgrade_easy)
cm_autoread = confusion_matrix(article_scores.true_easy, article_scores.auto_read_easy)

In [23]:
cm

array([[   1, 4034],
       [1134, 2477]])

In [24]:
cm_autoread

array([[4035,    0],
       [3440,  171]])

In [25]:
accuracy_score(article_scores.true_easy, article_scores.flesch_easy)

0.3240910279884907

In [26]:
accuracy_score(article_scores.grade_level, article_scores.flesch_k_grade)

0.019879675647397333

In [27]:
accuracy_score(article_scores.true_easy, article_scores.flesch_kgrade_easy)

0.6445200104629872

In [28]:
accuracy_score(article_scores.true_easy, article_scores.auto_read_easy)

0.5500915511378499

In [29]:
f1_score(article_scores.true_easy, article_scores.flesch_easy, pos_label=1)

0.4894289666073898

In [30]:
f1_score(article_scores.true_easy, article_scores.flesch_kgrade_easy, pos_label=1)

0.39653641207815277

In [31]:
f1_score(article_scores.true_easy, article_scores.auto_read_easy, pos_label=1)

0.09042834479111582

In [33]:
len(article_scores[article_scores.true_easy == article_scores.flesch_easy] ) * 100/ len(article_scores)

32.40910279884907

In [34]:
len(article_scores[article_scores.true_easy == article_scores.flesch_kgrade_easy] ) * 100/ len(article_scores)

64.45200104629872

In [35]:
len(article_scores[article_scores.true_easy == article_scores.auto_read_easy] ) * 100/ len(article_scores)

55.009155113784985