In [None]:
# !pip install -q -r requirements.txt')
from adjective_reading import *
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import plotnine as p9
from tqdm import tqdm

In [12]:
CORPUS_DATA = None
def get_corpus_data(path_corpus=PATH_CORPUS):
    def without_full_text(d):
        return {k: v for k, v in d.items() if k != 'fullText'}

    global CORPUS_DATA
    if CORPUS_DATA is None:
        CORPUS_DATA = pd.DataFrame(
            tqdm(
                (without_full_text(d) for d in orjsonl.stream(path_corpus)),
                total=CORPUS_NUM_SENTS
            )
        ).set_index('url')
    return CORPUS_DATA

In [13]:
df_corpus = get_corpus_data()
df_instances = get_instances_data()
word2data = get_word2data()

100%|██████████| 71902/71902 [00:03<00:00, 20749.89it/s]
  0%|          | 0/71902 [00:00<?, ?it/s]

Loading word2data from /Users/ryan/Dropbox/Share/data/byu_word_data/worddb.byu.txt...


100%|██████████| 71902/71902 [03:29<00:00, 343.46it/s]


In [14]:
total_df = df_instances.merge(df_corpus, on="url", how="left").fillna("")
total_df['year'] = pd.to_numeric(total_df.publicationYear, errors="coerce")
total_df['decade'] = total_df.year.astype(int) // 10 * 10
total_df['prev_pos'] = total_df.token0.map(lambda x: word2data[x]['pos'] if x in word2data else None).fillna("")
total_df['next_pos'] = total_df.token2.map(lambda x: word2data[x]['pos'] if x in word2data else None).fillna("")
total_df_adjs = total_df[total_df.prev_pos.str.startswith("j")]

In [15]:
total_df.decade.value_counts()

decade
1990    17542
2000    16916
1980    15094
2010    14643
1970     8923
1960     4821
1950     3941
1930     2716
1940     2701
1920     1977
1910     1474
1900      801
Name: count, dtype: int64

In [16]:
deccount = 1000
total_df_smpl = pd.concat(
    gdf.sample(frac=1).head(deccount)
    for g,gdf in total_df.query('decade>=1920').groupby('decade')
)
total_df_smpl.decade.value_counts(), total_df_smpl.shape

(decade
 1920    1000
 1930    1000
 1940    1000
 1950    1000
 1960    1000
 1970    1000
 1980    1000
 1990    1000
 2000    1000
 2010    1000
 Name: count, dtype: int64,
 (10000, 65))

In [17]:

# pos_df = total_df_smpl.groupby('decade').agg(
#     n_sents=('sent', 'nunique'),
#     n_words=('token0', 'size'),
#     n_wordtypes=('token0', 'nunique'),
#     n_adjs=('prev_pos', lambda x: (x.str.startswith('j')).sum()),
#     n_nouns=('next_pos', lambda x: (x.str.startswith('n')).sum()),
#     n_verbs=('next_pos', lambda x: (x.str.startswith('v')).sum()),
#     n_adverbs=('next_pos', lambda x: (x.str.startswith('r')).sum()),
#     n_other=('next_pos', lambda x: (~(x.str.startswith('j')) & ~(x.str.startswith('n')) & ~(x.str.startswith('v')) & ~(x.str.startswith('r'))).sum()),
# )
# pos_df['ttr_words'] = pos_df.n_wordtypes / pos_df.n_words * 100
# pos_df['perc_adj'] = pos_df.n_adjs / pos_df.n_words * 100
# pos_df['perc_noun'] = pos_df.n_nouns / pos_df.n_words * 100
# pos_df['perc_verb'] = pos_df.n_verbs / pos_df.n_words * 100
# pos_df["ratio_adj2noun"] = pos_df.n_adjs / pos_df.n_nouns
# round(pos_df,1)


In [8]:
pprint(total_df_smpl.query('prev_pos=="ex"').sent)

Series([], Name: sent, dtype: object)


In [9]:
def get_pos_type(pos):
    if pos.startswith('j'):
        return 'adj'
    elif pos.startswith('n'):
        return 'noun'
    elif pos.startswith('v'):
        return 'verb'
    elif pos.startswith('r'):
        return 'adverb'
    elif pos.startswith("i"):
        return 'preposition'
    elif pos.startswith("p"):
        return 'pronoun'
    elif pos.startswith("c"):
        return 'conjunction'
    elif pos.startswith("d"):
        return 'determiner'
    elif pos.startswith("a"):
        return 'article'
    elif pos.startswith("t"):
        return 'particle'
    elif pos.startswith("m"):
        return 'number'
    elif pos.startswith("s"):
        return 'symbol'
    elif pos.startswith("u"):
        return 'punctuation'
    elif pos.startswith("x"):
        return 'not'
    else:
        print(pos)
        return 'other'

def get_stats_from_smpl(df):
    old = []
    df = df.copy().fillna('')
    df = df.query('prev_pos!="" & prev_pos!="." & prev_pos!="ex"')
    df['prev_pos0'] = df.prev_pos.str.slice(0, 1)
    df['prev_pos_type'] = df.prev_pos.map(get_pos_type)
    
    for g, gdf in df.groupby('decade'):
        tok_counts = gdf.token0.value_counts()
        tok_rel_freq = tok_counts / tok_counts.sum()

        # pos_counts = gdf.prev_pos0.value_counts()
        # pos_rel_freq = pos_counts / pos_counts.sum()
        # print(pos_rel_freq)

        pos_type_counts = gdf.prev_pos_type.value_counts()
        pos_type_rel_freq = pos_type_counts / pos_type_counts.sum()
        # print(pos_type_rel_freq)

        
        for rank, (pos, rel_freq) in enumerate(pos_type_rel_freq.items()):
            pos_gdf = gdf.query('prev_pos_type==@pos')
            pos_tok_counts = pos_gdf.token0.value_counts()
            tok_counts_str= '\n'.join(f'{tok}\t{pos_tok_counts[tok]}' for tok in pos_tok_counts.index[:10])

            pos_n_tokens = pos_gdf.token0.size
            pos_n_wordtypes = pos_gdf.token0.nunique()
            pos_d = {
                'decade': g,
                'pos': pos,
                'count': pos_type_counts[pos],
                'freq': rel_freq,
                'ttr': pos_n_wordtypes / pos_n_tokens * 100,
                'egs': '\n'.join(pos_gdf.sent),
                'eg_tokens': tok_counts_str
            }
            old.append(pos_d)
    return pd.DataFrame(old)


In [10]:
pos_df = get_stats_from_smpl(total_df_smpl)
pos_df.to_excel('pos_df.xlsx', index=False)
pos_df

Unnamed: 0,decade,pos,count,freq,ttr,egs,eg_tokens
0,1920,article,259,0.307236,3.474903,"The writer of a medieval ""best seller"" -that i...",the\t176\na\t31\nhis\t29\ntheir\t11\nmy\t5\nit...
1,1920,adj,237,0.281139,51.476793,This is one of the instances in which MS. A al...,wide\t16\noriginal\t15\ncorrect\t10\ncareful\t...
2,1920,preposition,139,0.164887,11.510791,"The Journals of Paris, Amsterdam, Leipsick, Tr...",of\t43\nin\t30\nby\t12\nafter\t11\nworth\t9\nf...
3,1920,noun,64,0.075919,42.187500,We find Mrs. Frank indebted to Dr. Christ in s...,variant\t30\nmanuscript\t5\ntext\t2\nimpressio...
4,1920,verb,40,0.047450,50.000000,Professor A. T. Baker has drawn my attention t...,was\t8\nbeen\t6\naccepted\t5\nis\t4\nwere\t2\n...
...,...,...,...,...,...,...,...
110,2010,adverb,20,0.023337,90.000000,"In conceptualizing surface __reading__, they d...",closely\t2\nfurther\t2\njust\t1\nalways\t1\nho...
111,2010,particle,9,0.010502,11.111111,Attending to such contact points shifts the in...,to\t9
112,2010,number,5,0.005834,60.000000,The first mis__reading__ results from an un- c...,first\t3\nthree\t1\none\t1
113,2010,pronoun,3,0.003501,100.000000,Another is James E. Berg’s chapter on Wopsle’s...,them\t1\nthemselves\t1\nourselves\t1


In [11]:
total_df['wordCount2'] = sum(
    total_df.sent.map(lambda s: len(s.split()))
)

In [12]:
total_df2 = total_df.drop_duplicates(subset=['url', 'sent'])

decade_counts = total_df2.groupby('decade').wordCount2.sum()
decade_num_sents = total_df2.groupby('decade').size()

decade_df = pd.DataFrame({
    'decade': decade_counts.index,
    'num_sents': decade_num_sents,
    'num_words': decade_counts,
})
decade_df['word_per_sent'] = decade_df.num_words / decade_df.num_sents
decade_df.to_excel('decade_df.xlsx', index=False)
decade_df

Unnamed: 0_level_0,decade,num_sents,num_words,word_per_sent
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900,1900,757,2332845386,3081698.0
1910,1910,1391,4286641918,3081698.0
1920,1920,1899,5852144502,3081698.0
1930,1930,2607,8033986686,3081698.0
1940,1940,2583,7960025934,3081698.0
1950,1950,3789,11676553722,3081698.0
1960,1960,4643,14308323814,3081698.0
1970,1970,8452,26046511496,3081698.0
1980,1980,14004,43156098792,3081698.0
1990,1990,16386,50496703428,3081698.0


In [13]:
dict(total_df.iloc[0])

{'url': 'http://www.jstor.org/stable/461288',
 'page_num': np.int64(4),
 'sent_num': np.int64(4),
 'sent': 'We are all our lifetime __reading__ the copious sense of this first of forms.',
 'context0': "194 Pascal et le d6s6quilibre ment cette circularit6 de l'univers dans des termes d'une hardiesse toute pascalienne: The eye is the first circle; the horizon which it forms is the second; and throughout nature this primary figure is repeated without end.\nIt is the highest emblem in the cipher of the world.\nSt. Augustine described the nature of God as a circle whose center was everywhere, and its circumference nowhere.",
 'context1': 'We are all our lifetime reading the copious sense of this first of forms.',
 'context2': 'One moral we have already deduced, in considering the circulatory or compensatory character of every human action.\n(Essays i, "Circles") 4 Cette conception d\'une compensation circu- laire est belle.\nAurait-elle 6t6 souscrite par Pascal?\nOn peut en douter.\nEt nous

In [16]:
def get_corpus_sents(force=True, **kwargs):
    df_corpus = get_corpus_data()
    return df_corpus

In [3]:
# !pip install -q hashstash[rec]
from hashstash import HashStash
STASH_SENT_COUNTS = HashStash("adjread_sent_counts")
# STASH_SENT_COUNTS.clear()
# Adjective-noun "reading" collocations of interest (for frequency analysis etc.)
SENT_COUNT_WORDS = [
    "reading",
    "distant reading",      # 2010s = 4x of 2000s
    "correct reading",      # 1920s = 31x of 2010s
    "careful reading",      # 1960s = 4x of 2010s
    "political reading",    # 1980s = 3x of 2010s
    "feminist reading",     # 1990s = 5x of 2010s
    "nuanced reading",      # 2010s = 4x of 1990s
    "close reading",        # 2010s = 33x of 1920s
]

def get_sent_counts(txt,id=None,force=False):
    key=id
    if id is not None and not force and key in STASH_SENT_COUNTS:
        return STASH_SENT_COUNTS[key]
    else:
        sents = nltk.sent_tokenize(txt)
        tokens = nltk.word_tokenize(txt.lower())
        words = txt.split()

        
        out_ld = []
        for keyword in SENT_COUNT_WORDS:
            keyword_l = keyword.lower()
            num_words_keyword = txt.lower().count(keyword_l) if len(keyword.split())>1 else tokens.count(keyword_l)
            num_sents_keyword = len([s for s in sents if keyword_l in s.lower()])

            out_d = {
                'url': id,
                'keyword': keyword,
                'keyword_num_words': num_words_keyword,
                'keyword_num_sents': num_sents_keyword,
                'url_num_words': len(words),
                'url_num_sents': len(sents),
            }
            out_ld.append(out_d)
        STASH_SENT_COUNTS[key] = out_ld
        return out_ld

In [4]:
txt_d = next(orjsonl.stream(PATH_CORPUS))
fulltext = '\n\n'.join(txt_d['fullText']).strip()
get_sent_counts(fulltext,txt_d['url'], force=True)

NameError: name 'orjsonl' is not defined

In [7]:
ld_sents_counts = []
for txt_d in tqdm(orjsonl.stream(PATH_CORPUS), total=CORPUS_NUM_SENTS):
    fulltext = '\n\n'.join(txt_d['fullText']).strip()
    ld_sents_counts.extend(get_sent_counts(fulltext,txt_d['url']))
df_sent_counts = pd.DataFrame(ld_sents_counts)
df_sent_counts


NameError: name 'tqdm' is not defined

In [6]:
df_sents_corpus = df_sent_counts.merge(df_corpus[['publicationYear']], on='url', how='left')
df_sents_corpus['year'] = pd.to_numeric(df_sents_corpus.publicationYear, errors="coerce")
df_sents_corpus['decade'] = df_sents_corpus.year.astype(int) // 10 * 10
for col in df_sents_corpus.columns:
    if col.startswith('num_words_'):
        df_sents_corpus[f'perc_{col}'] = df_sents_corpus[col] / df_sents_corpus['num_words'] * 100
    if col.startswith('num_sents_'):
        df_sents_corpus[f'perc_{col}'] = df_sents_corpus[col] / df_sents_corpus['num_sents'] * 100
df_sents_corpus

NameError: name 'df_sent_counts' is not defined

In [83]:
def get_sents_corpus_stats(df):
    df=df.copy()
    decades = df.decade.unique()
    out_ld = []
    for decade in decades:
        df_decade = df.query('decade==@decade').copy()

        decade_sum_words = df_decade.num_words.sum()
        decade_sum_sents = len(df_decade)
        out_d = {}
        out_d['decade'] = decade
        for col in df_decade.columns:
            if col.startswith('num_'):
                summ = df_decade[col].sum()
                out_d[f'{col}'] = summ
        out_ld.append(out_d)
    return pd.DataFrame(out_ld)

# df_sents_corpus = get_sents_corpus_stats(df_sents_corpus)

# df_sents_corpus['sum_words']
# sents_decade_num_words = df_sents_corpus.groupby('decade').num_words.sum()

# sents_decade_num_sents = df_sents_corpus.groupby('decade').size()

# sents_decade_df = pd.DataFrame({
#     'num_sents': sents_decade_num_sents,
#     'num_words': sents_decade_num_words,
#     'word_per_sent': sents_decade_num_words / sents_decade_num_sents,

# })
# sents_decade_df.to_excel('sents_decade_df.xlsx', index=False)
# sents_decade_df

df_sents_corpus_stats = get_sents_corpus_stats(df_sents_corpus)
df_sents_corpus_stats.to_excel('sents_corpus_stats.xlsx', index=False)
df_sents_corpus_stats

Unnamed: 0,decade,num_sents,num_words,num_words_distant_reading,num_sents_distant_reading,num_words_correct_reading,num_sents_correct_reading,num_words_careful_reading,num_sents_careful_reading,num_words_political_reading,num_sents_political_reading,num_words_feminist_reading,num_sents_feminist_reading,num_words_nuanced_reading,num_sents_nuanced_reading,num_words_close_reading,num_sents_close_reading
0,1960,1887689,26374462,0,0,38,38,68,67,0,0,0,0,0,0,96,96
1,1970,1397012,32729554,0,0,43,43,72,72,1,1,3,3,0,0,298,296
2,2000,1776111,42284159,46,44,29,28,101,101,54,54,95,95,56,56,1178,1162
3,1950,1376495,19848818,0,0,44,44,43,43,0,0,0,0,0,0,45,45
4,1980,1572322,36218439,0,0,48,48,106,106,60,60,120,118,3,3,587,571
5,2010,1185118,26842196,135,128,8,8,62,62,45,45,39,38,61,61,1432,1372
6,1990,1789294,41221785,0,0,29,29,123,123,71,71,212,211,31,31,875,872
7,1940,986219,15654020,0,0,58,55,36,36,1,1,0,0,0,0,17,17
8,1920,573505,10082313,0,0,55,55,19,19,0,0,0,0,0,0,3,3
9,1900,312724,4979984,0,0,20,20,7,7,0,0,0,0,0,0,3,3
