Corpus Readability

In [30]:
import nltk
nltk.download("treebank")
nltk.download("cmudict")
nltk.download("brown")
nltk.download("movie_reviews")
nltk.download("switchboard")
nltk.download("gutenberg")
nltk.download("webtext")
nltk.download("twitter_samples")
nltk.download("nps_chat")

from nltk.corpus import treebank,cmudict,brown,movie_reviews,switchboard,gutenberg,webtext,twitter_samples,nps_chat
from collections import defaultdict, Counter
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package switchboard to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_data]   Package switchboard is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     /Users/shounakmondal/nltk_data...
[nltk_dat

In [31]:
vowels = {"a","e","i","o","u","y"}
p_dict = cmudict.dict() # keep this outside as a global variable so you aren't reloading each time

def get_syllables(word):
    '''use CMU dict (p_dict) to count the number of syllables in word, default to number of vowels'''
    syllable_count = 0
    #your code here

    if word.lower() not in p_dict:
        syllable_count = len(vowels.intersection(word))
    else:
        for x in p_dict[word.lower()][0]:
            if x[-1].isdigit():
                syllable_count +=1
                
    return syllable_count

In [32]:
def get_reading_ease(sentence):
    '''calculate the Flesh reading ease for a single sentence consisting of a list of words (strings)'''
    # your code here
    total_words = 0
    total_sentences = 1
    total_syllables = 0
    
    for word in sentence:
        if word.isalpha():
            total_words +=1
            syllables = get_syllables(word)
            total_syllables += syllables
            
    if total_words == 0:
        return None

    reading_ease = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
    return reading_ease


In [33]:
def get_reading_ease_corpus(corpus):
    total = 0 
    count = 0
    for sent in corpus.sents():
        count +=1
        score = get_reading_ease(sent) if get_reading_ease(sent) != None else 0
        total = total + score

    return total / count

In [38]:
penn_readability = get_reading_ease_corpus(treebank)
review_readability = get_reading_ease_corpus(movie_reviews)
brown_readability = get_reading_ease_corpus(brown)
gutenberg_readability = get_reading_ease_corpus(gutenberg)
webtext_readability = get_reading_ease_corpus(webtext)

print("Flesch Reading Ease for Penn Treebank is:",penn_readability)
print("Flesch Reading Ease for Movie Reviews is:",review_readability)
print("Flesch Reading Ease for Brown is:",brown_readability)
print("Flesch Reading Ease for gutenberg is:",gutenberg_readability)
print("Flesch Reading Ease for webtext is:",webtext_readability)



Flesch Reading Ease for Penn Treebank is: 48.361193762970494
Flesch Reading Ease for Movie Reviews is: 57.98530445444417
Flesch Reading Ease for Brown is: 59.421414769007185
Flesch Reading Ease for gutenberg is: 70.96167608411457
Flesch Reading Ease for webtext is: 81.57121075203247


In [39]:
readability_list = [penn_readability, review_readability, brown_readability, gutenberg_readability, webtext_readability ]
corpus_list = ["Penn", "Movie Reviews", "Brown", "Gutenberg", "Web Text"  ]
columns_list = ["corpus","readibility_score"]
df = pd.DataFrame(zip(corpus_list,readability_list), columns = columns_list)

In [42]:
alt.Chart(df).mark_bar().encode(
    x = alt.X('corpus',sort='y', title = "Corpus"),
    y = alt.Y('readibility_score', title = "Readability Score"),
    color = alt.Color('corpus', legend=None)
).properties(
    width=300,
    height=200
)