# Characterizing Complexity in Document

### Reading Text Content

In [121]:
file = open("terms/20150518.txt", encoding="utf8")

content = file.read()
file.close()

print(content)

Twitter Terms of Service

These Terms of Service (“Terms”) govern your access to and use of our Services, including our various websites, SMS, APIs, email notifications, applications, buttons, widgets, ads, commerce services (the “Twitter Services”), and our other covered services that link to these Terms (collectively, the “Services”), and any information, text, graphics, photos or other materials uploaded, downloaded or appearing on the Services (collectively referred to as “Content”). Your access to and use of the Services are conditioned on your acceptance of and compliance with these Terms. By accessing or using the Services you agree to be bound by these Terms.

1. Basic Terms
You are responsible for your use of the Services, for any Content you post to the Services, and for any consequences thereof. Most Content you submit, post, or display through the Twitter Services is public by default and will be able to be viewed by other users and through third party services and websites

### Get Text Complexity

In [17]:
import textstat

content_data = (content)

# readability functions
flesch_reading_ease = textstat.flesch_reading_ease(content_data)
smog_index = textstat.smog_index(content_data)
flesch_grade = textstat.flesch_kincaid_grade(content_data)
coleman_liau_index = textstat.coleman_liau_index(content_data)
readability_index = textstat.automated_readability_index(content_data)
dale_chall_readability_score = textstat.dale_chall_readability_score(content_data)
difficult_words = textstat.difficult_words(content_data)
linsear_write_formula = textstat.linsear_write_formula(content_data)
gunning_fog_index = textstat.gunning_fog(content_data)
text_standard = textstat.text_standard(content_data)

# formulas provided in comment links
print("Flesch reading ease:", flesch_reading_ease) # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
print("SMOG index:", smog_index) # https://en.wikipedia.org/wiki/SMOG
print("Flesch-Kincaid grade:", flesch_grade) # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
print("Coleman-Liau index:", coleman_liau_index) # https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
print("Automated readability index:", readability_index) # https://en.wikipedia.org/wiki/Automated_readability_index
print("Dale-Chall readability score:", dale_chall_readability_score) # https://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula#Formula
print("Difficult words:", difficult_words)
print("Linsear write formula:", linsear_write_formula) # https://en.wikipedia.org/wiki/Linsear_Write#Algorithm
print("Gunning fog index:", gunning_fog_index) # https://en.wikipedia.org/wiki/Gunning_fog_index#Calculation
print("Text standard:", text_standard)

Flesch reading ease: 28.54
SMOG index: 17.0
Flesch grade: 19.8
Coleman-Liau index: 12.61
Readability index: 24.1
Dale-Chall readability score: 7.91
Difficult words: 501
Linsear write formula: 21.666666666666668
Funning fog: 19.68
Text standard: 19th and 20th grade


### Get Low-Level Text Descriptors

In [113]:
import re
import nltk

# required dependencies
# nltk.download('punkt')                        # punctuation (English)
# nltk.download('stopwords')                    # stopword corpus
# nltk.download('averaged_perceptron_tagger')   # parts of speech tagging (English)
# nltk.download('universal_tagset')             # parts of speech tagging (Universal)

# tokenize words
words = nltk.word_tokenize(content)
nonPunct = re.compile('.*[A-Za-z0-9].*')  # "word" defined as containing a letter or digit
words_filtered = [w for w in words if nonPunct.match(w)]

# filter out stopwords
stop_words = nltk.corpus.stopwords.words('english')
words_clean = [w for w in words_filtered if not w in stop_words]

# characters per words
chars_per_word = float(sum(map(len, words))) / len(words) # all words
chars_per_word_clean = float(sum(map(len, words_clean))) / len(words_clean) # words filtered by punct and stopwords

# frequent words
fdist_all = nltk.FreqDist(w.lower() for w in words_clean)
fdist_common = fdist_all.most_common(10)

# parts of speech tagging
pos_tags = nltk.pos_tag(words_clean, tagset="universal")
pos_freq = nltk.FreqDist(tag for (word, tag) in pos_tags)
pos_counts = pos_freq.most_common()

word_counts = Counter(w.lower() for w in words_clean)
# comprehension manifest indicators
comp_indicators = ["agree", "continue", "grant", "let", "rely", "represent", "responsibility", "warrant", "understand"]
comp_occur = 0
for comp_indicator in comp_indicators:
    comp_occur = word_counts[comp_indicator]

# willingness/ability to perform actions manifest indicators
ability_indicators = ["can", "cannot", "may", "must"]
ability_occur = 0
for ability_indicator in ability_indicators:
    ability_occur = word_counts[ability_indicator]

# tokensize sentences
sentences = nltk.sent_tokenize(content)

# words per sentence
words_per_sent = len(words) / len(sentences)

# interesting collocations = phrases with highest point wise mutual information
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(words_clean)
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(words_clean)

bigram_finder.apply_freq_filter(3) # only bigrams that appear 3+ times
bigram_fdist = bigram_finder.nbest(bigram_measures.pmi, 10) # return the 10 2-grams with the highest PMI

trigram_finder.apply_freq_filter(3) # only trigrams that appear 3+ times
trigram_fdist = trigram_finder.nbest(trigram_measures.pmi, 10) # return the 10 3-grams with the highest PMI

print("Low-level characteristics: ")
print("Word count:", len(words_filtered))
print("Non-stopword count:", len(words_clean))
print("Avg. characters per word (unclean):", chars_per_word)
print("Avg. characters per word (clean):", chars_per_word_clean)
print("Sentence count:", len(sentences))
print("Avg. words per sentence:", words_per_sent)
print("Comprehension indicator count:", comp_occur)
print("Ability indicator count:", ability_occur)
print("")

print("Frequencies: ")
print("Frequent words:", fdist_common)
print("Frequent bigrams:", bigram_fdist)
print("Frequent trigrams:", trigram_fdist)
print("")

print("Parts of speech counts: ")
print("adjectives:", pos_freq["ADJ"]) # new, good, high, special, big, local
print("adpositions:", pos_freq["ADP"]) # on, of, at, with, by, into, under
print("adverbs:", pos_freq["ADV"]) # really, already, still, early, now
print("conjunctions:", pos_freq["CONJ"]) # and, or, but, if, while, although
print("articles:", pos_freq["DET"]) # the, a, some, most, every, no, which
print("nouns:", pos_freq["NOUN"]) # year, home, costs, time, Africa
print("numerals:", pos_freq["NUM"]) # twenty-four, fourth, 1991, 14:24
print("particles:", pos_freq["PRT"]) # at, on, out, over per, that, up, with
print("pronouns:", pos_freq["PRON"]) # he, their, her, its, my, I, us
print("verb:", pos_freq["VERB"]) # is, say, told, given, playing, would
print("other:", pos_freq["X"]) # ersatz, esprit, dunno, gr8, univeristy
print("")

Low-level characteristics: 
Word count: 3643
Non-stopword count: 2078
Avg. characters per word (unclean): 4.411153119092628
Avg. characters per word (clean): 6.57218479307026
Sentence count: 121
Avg. words per sentence: 34.97520661157025
Comprehension indicator count: 5
Ability indicator count: 0

Frequencies: 
Frequent words: [('services', 104), ('twitter', 80), ('content', 54), ('use', 44), ('terms', 39), ('may', 28), ('the', 27), ('you', 21), ('or', 20), ('information', 18)]
Frequent bigrams: [('San', 'Francisco'), ('TWITTER', 'ENTITIES'), ('third', 'party'), ('sole', 'discretion'), ('submit', 'post'), ('Privacy', 'Policy'), ('United', 'States'), ('extent', 'permitted'), ('prior', 'notice'), ('copyright', 'infringement')]
Frequent trigrams: [('THE', 'TWITTER', 'ENTITIES'), ('without', 'prior', 'notice'), ('laws', 'United', 'States'), ('Content', 'submit', 'post'), ('posted', 'via', 'Services'), ('access', 'use', 'Services'), ('use', 'Services', 'Content'), ('Content', 'Twitter', 'Se

### Reading Webpage Content

In [58]:
# from local file
file = open("terms/20150518.html", encoding="utf8")

html_content = file.read()
file.close()

print(html_content)

<!doctype html>
  <html lang="en" prefix="og: http://ogp.me/ns#">
  <head>
    <meta charset="utf-8"/>

<meta name="viewport" content="width=device-width, initial-scale=1"/>

<title>Previous Terms of Service</title>
<meta name="description"/>
<link rel="canonical" href="https://twitter.com/content/twitter-com/legal/en/tos/previous/version_9.html"/>



<meta property="og:url" content="https://twitter.com/content/twitter-com/legal/en/tos/previous/version_9.html"/>
<meta property="og:type" content="article"/>
<meta property="og:title" content="Previous Terms of Service"/>




<meta name="twitter:card" content="summary"/>















<meta name="twitter:widgets:new-embed-design" content="on"/>
<meta name="twitter:widgets:csp" content="on"/>

<link href="https://abs.twimg.com/favicons/favicon.ico" rel="shortcut icon" type="image/x-icon"/>


<script type="application/json" id="analytics-settings">{&quot;google&quot;:{&quot;accounts&quot;:[],&quot;options&quot;:{&quot;displayAdvertisingFea

In [78]:
# from webpage url

url = "https://twitter.com/en/tos/previous/version_9"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

print(soup)

<!DOCTYPE html>

<html lang="en" prefix="og: http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Previous Terms of Service</title>
<meta name="description"/>
<link href="https://twitter.com/content/twitter-com/legal/en/tos/previous/version_9.html" rel="canonical"/>
<meta content="https://twitter.com/content/twitter-com/legal/en/tos/previous/version_9.html" property="og:url"/>
<meta content="article" property="og:type"/>
<meta content="Previous Terms of Service" property="og:title"/>
<meta content="summary" name="twitter:card"/>
<meta content="on" name="twitter:widgets:new-embed-design"/>
<meta content="on" name="twitter:widgets:csp"/>
<link href="https://abs.twimg.com/favicons/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<script id="analytics-settings" type="application/json">{&quot;google&quot;:{&quot;accounts&quot;:[],&quot;options&quot;:{&quot;displayAdvertisingFeatures&quot;:false}},&quot;scribe&q

### Get Low-Level Webpage Descriptors

In [86]:
from bs4 import BeautifulSoup
import requests, json

# get hyperlink descriptors

print(soup.title)
links = soup.body.find_all("a")

valid_links = {}

for i, link in enumerate(links):
    href = link.get('href').strip()

    if ('.') in href: # includes mailtos versus href.startswith("http")
        valid_links[href] = link.text.strip()

print("Number of hyperlinks:", len(valid_links))
print("Number of unique hyperlinks:", len(set(valid_links)))
print(json.dumps(valid_links, indent=4)) # pretty printing

<title>Previous Terms of Service</title>
{
    "https://twitter.com": "Twitter logo icon",
    "https://twitter.com/login": "Sign in",
    "https://support.twitter.com/articles/20172501": "our other covered services",
    "https://support.twitter.com/articles/14016-about-public-and-protected-tweets": "here",
    "http://twitter.com/settings/security": "account settings",
    "https://twitter.com/settings/notifications": "email",
    "https://twitter.com/settings/devices": "mobile",
    "https://dev.twitter.com/overview/terms/agreement-and-policy": "rules",
    "http://support.twitter.com/articles/18311-the-twitter-rules": "Twitter Rules",
    "https://twitter.com/privacy": "Privacy",
    "http://dev.twitter.com/": "Twitter API",
    "https://support.twitter.com/articles/20171943": "Twitter Commerce Terms",
    "http://twitter.com/apirules": "terms and conditions",
    "https://support.twitter.com/forms/dmca": "https://support.twitter.com/forms/dmca",
    "http://support.twitter.com/art

In [95]:
# get bolded descriptors

terms = soup.body.find_all(["strong", "b"])

valid_terms = []
tip_count = 0
for i, term in enumerate(terms):
    term_processed = term.text.strip()
    
    # all bolded terms <-- b?
    valid_terms.append(term_processed)
    
    # if tip
    if "Tip" in term_processed: # tip found <-- strong?
        tip_count += 1

print("Number of terms:", len(valid_terms))
print("Number of unique terms:", len(set(valid_terms)))
print("Number of tips:", tip_count)
print(valid_terms)

Number of terms: 10
Number of unique terms: 5
Number of tips: 6
['Terms', 'Twitter Services', 'Services', 'Content', 'Tip', 'Tip', 'Tip', 'Tip', 'Tip', 'Tip']


In [96]:
# get section descriptors 

sections = soup.body.find_all('div', class_="indexed-section-title") # <-- subsections? 

valid_sections = []
for i, section in enumerate(sections):
    valid_sections.append(section.text.strip())

print("Number of sections:", len(valid_sections))
print(valid_sections)

Number of sections: 18
['1. Basic Terms', '2. Privacy', 'Â\xa03. Passwords', '4. Content on the Services', '5. Your Rights', '6. Your License To Use the Services', '7. Twitter Rights', '8. Restrictions on Content and Use of the Services', '9. Copyright Policy', '10. Ending These Terms', '11. Disclaimers and Limitations of Liability', 'A. The Services are Available â\x80\x9cAS-ISâ\x80\x9d', 'B. Links', 'C. Limitation of Liability', '12. General Terms', 'A. Waiver and Severability', 'B. Controlling Law and Jurisdiction', 'C. Entire Agreement']


# Data Frame Functions

In [115]:
import numpy as np
import pandas as pd

In [119]:
def create_readability_df(content):
    
    readability_df = pd.DataFrame()
    
    readability_df['Flesch Reading Ease'] = 0 
    readability_df['Flesch Grade'] = 0
    readability_df['SMOG Index'] = 0
    readability_df['Coleman-Liau Index'] = 0
    readability_df['Automated Readability Index'] = 0
    readability_df['Dale-Chall Readability Index'] = 0
    readability_df['Difficult Words Score'] = 0
    readability_df['Linsear Write Score'] = 0
    readability_df['Gunning Fog Index'] = 0
    
    readability_df.loc[0] = [
        textstat.flesch_reading_ease(content),
        textstat.flesch_kincaid_grade(content),
        textstat.smog_index(content),
        textstat.coleman_liau_index(content),
        textstat.automated_readability_index(content),
        textstat.dale_chall_readability_score(content),
        textstat.difficult_words(content),
        textstat.linsear_write_formula(content),
        textstat.gunning_fog(content)
    ]
    
    return readability_df    

# Use:
# create_readability_df(content_data)

In [120]:
create_readability_df(content_data)

Unnamed: 0,Flesch Reading Ease,Flesch Grade,SMOG Index,Coleman-Liau Index,Automated Readability Index,Dale-Chall Readability Index,Difficult Words Score,Linsear Write Score,Gunning Fog Index
0,28.54,19.8,17.0,12.61,24.1,7.91,501.0,21.666667,19.68


In [124]:
def create_pos_df(content):
    
    words = nltk.word_tokenize(content) # tokenize words
    
    # parts of speech tagging
    pos_tags = nltk.pos_tag(words, tagset="universal")
    pos_freq = nltk.FreqDist(tag for (word, tag) in pos_tags)
    
    pos_df = pd.DataFrame()
    
    pos_df["Adjectives"] = 0
    pos_df["Adpositions"] = 0
    pos_df["Adverbs"] = 0
    pos_df["Conjunctions"] = 0
    pos_df["Articles"] = 0
    pos_df["Nouns"] = 0
    pos_df["Numerals"] = 0
    pos_df["Particles"] = 0
    pos_df["Pronouns"] = 0
    pos_df["Verb"] = 0
    pos_df["Other POS"] = 0
    
    pos_df.loc[0] = [
        pos_freq["ADJ"],
        pos_freq["ADP"],
        pos_freq["ADV"],
        pos_freq["CONJ"],
        pos_freq["DET"],
        pos_freq["NOUN"],
        pos_freq["NUM"],
        pos_freq["PRT"],
        pos_freq["PRON"],
        pos_freq["VERB"],
        pos_freq["X"]
    ]
    
    return pos_df

# create_pos_df(content_data)

Unnamed: 0,Adjectives,Adpositions,Adverbs,Conjunctions,Articles,Nouns,Numerals,Particles,Pronouns,Verb,Other POS
0,229,435,87,268,401,1306,31,138,215,559,0


In [127]:
def create_lowlevel_df(content):
    
    # word-level analyses
    words = nltk.word_tokenize(content)
    
    # filter out punctuation
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # "word" defined as containing a letter or digit
    words_filtered = [w for w in words if nonPunct.match(w)]
    
    # filter out stopwords
    stop_words = nltk.corpus.stopwords.words('english')
    words_clean = [w for w in words_filtered if not w in stop_words]
    
    # word count
    wc_unfiltered = len(words_filtered) # all words
    wc_filtered = len(words_clean) # all but stop words
    
    # characters per word
    cpw_unfiltered = float(sum(map(len, words))) / len(words) # all words
    cpw_filtered = float(sum(map(len, words_clean))) / len(words_clean) # all but stop words
    
    # sentence-level analyses
    sentences = nltk.sent_tokenize(content)
    
    # words per sentence
    wps_unfiltered = len(words) / len(sentences) # considering all words per sentence
    wps_filtered = len(words_clean) / len(sentences) # considering all but stop words per sentence
    
    lowlevel_df = pd.DataFrame()
    
    lowlevel_df["Word Count All"] = 0
    lowlevel_df["Word Count Filtered"] = 0 
    lowlevel_df["Characters Per Word All"] = 0 
    lowlevel_df["Characters Per Word Filtered"] = 0 
    lowlevel_df["Words Per Sentence All"] = 0 
    lowlevel_df["Words Per Sentence Filtered"] = 0
    
    lowlevel_df.loc[0] = [
        wc_unfiltered,
        wc_filtered,
        cpw_unfiltered,
        cpw_filtered,
        wps_unfiltered,
        wps_filtered        
    ]
    
    return lowlevel_df

# create_lowlevel_df(content_data)

Unnamed: 0,Word Count All,Word Count Filtered,Characters Per Word All,Characters Per Word Filtered,Words Per Sentence All,Words Per Sentence Filtered
0,3643.0,2078.0,4.411153,6.572185,34.975207,17.173554


In [126]:
def create_manifest_df(content, comp_indicators, ability_indicators):
    words = nltk.word_tokenize(content)
    
    # filter out punctuation
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # "word" defined as containing a letter or digit
    words_filtered = [w for w in words if nonPunct.match(w)]
    
    # filter out stopwords
    stop_words = nltk.corpus.stopwords.words('english')
    words_clean = [w for w in words_filtered if not w in stop_words]
    
    # calculate word counts
    word_counts = Counter(w.lower() for w in words_clean)
    
    # comprehension manifest indicators
    comp_occur = 0
    for comp_indicator in comp_indicators:
        comp_occur = word_counts[comp_indicator]
    
    # willingness / abilities to perform actions indicators
    ability_occur = 0
    for ability_indicator in ability_indicators:
        ability_occur = word_counts[ability_indicator]
    
    manifest_df = pd.DataFrame()
    
    manifest_df["Comprehension Manifest Indicators"] = 0
    manifest_df["Ability Manifest Indicators"] = 0
    
    manifest_df.loc[0] = [
        comp_occur,
        ability_occur
    ]
    
    return manifest_df

#comp_indicators = ["agree", "continue", "grant", "let", "rely", "represent", "responsibility", "warrant", "understand"]
#ability_indicators = ["can", "cannot", "may", "must"]
#create_manifest_df(content_data, comp_indicators, ability_indicators)

In [130]:
def create_webpage_df(url):
    
    # from webpage url
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # count hyperlinks
    links = soup.body.find_all("a")
    valid_links = {}
    for i, link in enumerate(links):
        href = link.get('href').strip()
        if('.') in href:
            valid_links[href] = link.text.strip()
    
    link_count = len(valid_links)
    unique_links_count = len(set(valid_links))
    
    # count bolded terms
    terms = soup.body.find_all(["strong", "b"])
    valid_terms = []
    tip_count = 0
    for i, term in enumerate(terms):
        term_processed = term.text.strip()
        valid_terms.append(term_processed)
        if "Tip" in term_processed:
            tip_count += 1
            
    terms_count = len(valid_terms)
    unique_terms_count = len(set(valid_terms))
    
    # count sections
    sections = soup.body.find_all('div', class_="indexed-section-title")
    valid_sections = []
    for i, section in enumerate(sections):
        valid_sections.append(section.text.strip())
    sections_count = len(valid_sections)
    
    webpage_df = pd.DataFrame()
    
    webpage_df["URL"] = 0
    webpage_df["Hyperlink Count"] = 0
    webpage_df["Unique Hyperlink Count"] = 0
    webpage_df["Terms Count"] = 0
    webpage_df["Unique Terms Count"] = 0
    webpage_df["Tip Count"] = 0
    webpage_df["Section Count"] = 0
    
    webpage_df.loc[0] = [
        url,
        link_count, 
        unique_links_count, 
        terms_count, 
        unique_terms_count,
        tip_count, 
        sections_count
    ]
    
    return webpage_df

# create_webpage_df("https://twitter.com/en/tos/previous/version_9")

Unnamed: 0,URL,Hyperlink Count,Unique Hyperlink Count,Terms Count,Unique Terms Count,Tip Count,Section Count
0,https://twitter.com/en/tos/previous/version_9,21,21,10,5,6,18


In [None]:
def extract_features(file, url):
    
    # read file contents
    file = open(file, encoding="utf8")
    content = file.read()
    file.close()
    
    lowlevel_descriptors = create_