In [None]:
# !pip install -q -r requirements.txt
from __reading import *

In [None]:
CORPUS_DATA = None
CORPUS_DATA_PATH = None  # path the cached CORPUS_DATA frame was loaded from


def get_corpus_data(path_corpus=PATH_CORPUS):
    """Load the corpus metadata as a DataFrame indexed by ``url``.

    Every record streamed from ``path_corpus`` is kept except for its
    ``fullText`` field. The result is cached in the module-level
    ``CORPUS_DATA``; the cache is reused only while the same ``path_corpus``
    is requested, so asking for a different file triggers a reload instead
    of silently returning the previously loaded corpus.

    Args:
        path_corpus: Path of the JSON-lines corpus file to stream.

    Returns:
        pandas.DataFrame with one row per streamed record, indexed by ``url``.
    """
    global CORPUS_DATA, CORPUS_DATA_PATH

    if CORPUS_DATA is None or CORPUS_DATA_PATH != path_corpus:
        # Drop `fullText` while streaming so it never reaches the DataFrame.
        records = (
            {key: value for key, value in doc.items() if key != 'fullText'}
            for doc in orjsonl.stream(path_corpus)
        )
        CORPUS_DATA = pd.DataFrame(
            tqdm(records, total=CORPUS_NUM_SENTS)
        ).set_index('url')
        CORPUS_DATA_PATH = path_corpus
    return CORPUS_DATA

In [None]:
# Load the three inputs used by the rest of the notebook.
# NOTE(review): get_instances_data and get_word2data are not defined in this
# notebook; presumably they come from the `__reading` star-import — confirm.
df_corpus = get_corpus_data()
df_instances = get_instances_data()
word2data = get_word2data()

In [None]:
def _lookup_pos(token):
    """Return ``word2data[token]['pos']``, or None when the token is unknown."""
    return word2data[token]['pos'] if token in word2data else None


# Attach corpus metadata to every instance; instances without a matching
# corpus record keep their row, with the missing fields blanked to "".
total_df = df_instances.merge(df_corpus, on="url", how="left").fillna("")

# Blank / non-numeric publication years become NaN here ...
total_df['year'] = pd.to_numeric(total_df.publicationYear, errors="coerce")
# ... so the decade uses the nullable Int64 dtype: a plain `astype(int)` raises
# as soon as a single year is NaN. Rows with a missing decade are skipped by
# the default `groupby(..., dropna=True)` in later cells.
total_df['decade'] = (total_df.year // 10 * 10).astype('Int64')

# Part-of-speech of the neighbouring tokens ("" when the token is unknown).
total_df['prev_pos'] = total_df.token0.map(_lookup_pos).fillna("")
total_df['next_pos'] = total_df.token2.map(_lookup_pos).fillna("")

In [None]:
# Keep only the rows whose preceding-token POS tag starts with "j".
prev_pos_starts_with_j = total_df.prev_pos.str.startswith("j")
total_df_adjs = total_df.loc[prev_pos_starts_with_j]

In [None]:
# Display the filtered frame for a quick visual check.
total_df_adjs

In [None]:
def get_decade_counts(df, max_rank=100):
    """Rank the most frequent ``token0`` values within each decade.

    Args:
        df: DataFrame with ``decade``, ``token0`` and ``sent`` columns.
        max_rank: Maximum number of tokens to keep per decade (most frequent
            first). Values <= 0 keep nothing.

    Returns:
        DataFrame with one row per (decade, token) and the columns
        ``decade``, ``token``, ``rank`` (1-based), ``count``, ``freq``
        (the token's share of all ``token0`` occurrences in that decade) and
        ``sents`` (list of the ``sent`` values for that token in that decade).
        The columns are present even when ``df`` yields no rows.
    """
    columns = ['decade', 'token', 'rank', 'count', 'freq', 'sents']
    records = []

    for decade, decade_df in df.groupby('decade'):
        tok_counts = decade_df.token0.value_counts()
        decade_total = tok_counts.sum()
        # value_counts is sorted by descending count, so the head is the top-N.
        top_counts = tok_counts.head(max(max_rank, 0))

        # Gather the sentences of all retained tokens in a single pass rather
        # than re-filtering the whole decade frame once per token.
        top_rows = decade_df[decade_df.token0.isin(top_counts.index)]
        sents_by_tok = {
            tok: tok_sents.tolist()
            for tok, tok_sents in top_rows.groupby('token0', sort=False)['sent']
        }

        for rank, (tok, count) in enumerate(top_counts.items(), start=1):
            records.append({
                'decade': decade,
                'token': tok,
                'rank': rank,
                'count': count,
                'freq': count / decade_total,
                'sents': sents_by_tok.get(tok, []),
            })

    # Passing `columns` keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

In [None]:
decade_word_counts = get_decade_counts(total_df_adjs)
# `new_df` only exists as a local inside get_decade_counts; bind it here as
# well so cells that refer to the table by that name run on a fresh kernel.
new_df = decade_word_counts



In [None]:
# Display the per-decade token table (bound to `decade_word_counts` when it
# is computed; `new_df` is not defined by any top-level statement).
decade_word_counts

In [None]:
# !pip install plotnine
import plotnine as p9
# Notebook-wide plotnine defaults; individual figures may still override the
# size with p9.theme(figure_size=...).
p9.options.figure_size = (10, 6)
p9.options.dpi = 300

In [None]:
# !pip install numpy

In [None]:
# NOTE(review): `token2historical_avg` is not defined in any visible cell of
# this notebook. It is used below like a pandas Series mapping token -> number
# (it must support .sort_values(), .index and item lookup) — confirm where it
# is supposed to come from.

# Tokens ordered by their total count across all decades (largest first).
# Uses `decade_word_counts`, the table returned by get_decade_counts.
most_ever_top_words = (
    decade_word_counts.groupby('token')['count']
    .sum()
    .sort_values(ascending=False)
    .index.tolist()
)
print("distant" in most_ever_top_words)

fig_num_words = 50
fig_top_words = most_ever_top_words[:fig_num_words]
print("distant" in fig_top_words)

# Every plotted token needs a historical average, otherwise its facet label
# cannot be built.
missing_avg = sorted(set(fig_top_words) - set(token2historical_avg.index))
assert not missing_avg, f"no historical average for: {missing_avg}"

# Facet order: ascending historical average.
ever_top_words_sorted = token2historical_avg.sort_values().index.tolist()

fig_df = decade_word_counts.query('token in @fig_top_words').copy()
fig_df['fpk'] = fig_df['freq'] * 1000
fig_df['historical_avg'] = fig_df['token'].map(token2historical_avg)

# One label per plotted token, built once so the column values and the
# category order can never disagree.
fig_top_word_set = set(fig_top_words)
token2label = {
    token: f"{token} ({int(token2historical_avg[token])})"
    for token in ever_top_words_sorted
    if token in fig_top_word_set
}
token_labels = list(token2label.values())

fig_df['token_label'] = fig_df['token'].map(token2label)

# Ordered categoricals make plotnine lay the facets out in that order.
fig_df['token'] = pd.Categorical(fig_df['token'], categories=ever_top_words_sorted, ordered=True)
fig_df['token_label'] = pd.Categorical(fig_df['token_label'], categories=token_labels, ordered=True)

In [None]:
# One facet per token label (10 columns): `fpk` by decade drawn as a line,
# with points sized by `count` and the count printed as a text label.
fig = (
    p9.ggplot(fig_df, p9.aes(x="decade", y="fpk", group="token"))
    + p9.geom_line(size=.5, alpha=.15, color="blue")
    + p9.geom_point(p9.aes(size="count"), alpha=.15, color="blue")
    + p9.geom_text(p9.aes(label="count"), size=8, angle=45)
    + p9.facet_wrap("token_label", ncol=10)
    + p9.theme_classic()
    + p9.theme(figure_size=(16, 9))
)
fig


In [None]:
# Per-token aggregates over all decades, taken from `decade_word_counts`
# (the table returned by get_decade_counts).
# NOTE(review): `token2historical_avg` is not defined in any visible cell;
# it is used here as a pandas Series indexed by token — confirm its origin.
rows_by_token = decade_word_counts.groupby('token')
word_counts = rows_by_token['count'].sum()
word_freqsum = rows_by_token['freq'].sum()
word_freqavg = rows_by_token['freq'].mean()
word_freqmax = rows_by_token['freq'].max()

# One row per token of token2historical_avg, in that Series' order.
# `.loc` raises KeyError if a token has no per-decade rows.
stat_tokens = token2historical_avg.index
word_df = pd.DataFrame({
    "token": stat_tokens,
    "count_sum": word_counts.loc[stat_tokens].to_numpy(),
    "freq_sum": word_freqsum.loc[stat_tokens].to_numpy(),
    "freq_avg": word_freqavg.loc[stat_tokens].to_numpy(),
    "freq_max": word_freqmax.loc[stat_tokens].to_numpy(),
    "historical_avg": token2historical_avg.map(int).to_numpy(),
})
word_df['freq'] = word_df['freq_sum']
word_df['fpk_max'] = word_df['freq_max'] * 1000
word_df


In [None]:
# Plot only tokens whose summed count exceeds 10: the token text is placed at
# (historical_avg, fpk_max) with a log-scaled y axis.
word_fig_df = word_df.loc[word_df.count_sum > 10]
fig = (
    p9.ggplot(word_fig_df, p9.aes(x="historical_avg", y="fpk_max"))
    + p9.geom_text(p9.aes(label="token"), size=12)
    + p9.theme_minimal()
    + p9.theme(figure_size=(16, 9))
    + p9.scale_y_log10()
)
fig


In [None]:
# Persist the per-token summary table. The DataFrame index is written as the
# first CSV column because `index=False` is not passed.
word_df.to_csv('word_stats.csv')

In [None]:
# Inspect the decade column of the filtered rows.
total_df_adjs.decade #.query('token0 == "distant"').sent.tolist()

In [None]:
# Display the filtered frame again before drilling into a single group.
total_df_adjs

In [None]:
# Grab the first (decade, token0) group whose token is "distant" for
# spot-checking; next() raises StopIteration when no such group exists.
distant_groups = (
    (key, frame)
    for key, frame in total_df_adjs.groupby(['decade', 'token0'])
    if key[1] == "distant"
)
grp, grpdf = next(distant_groups)
grpdf

In [None]:
# Show every field of the group's first row as a plain dict.
dict(grpdf.iloc[0])

In [None]:
# dict(total_df_adjs.iloc[0])

In [None]:
def _format_authors(creator):
    """Render a ``creator`` cell as the author part of a citation.

    A list-like of names becomes the first name, followed by " et al." when
    there is more than one name. A plain string is returned unchanged, and
    anything empty or non-iterable becomes "".
    """
    if isinstance(creator, str):
        return creator
    try:
        names = [str(name) for name in creator]
    except TypeError:  # non-iterable scalar such as None / NaN
        return ""
    if not names:
        return ""
    return names[0] + (" et al." if len(names) > 1 else "")


def _format_year(year):
    """Render whole-number float years (1985.0) as integers (1985)."""
    if isinstance(year, float) and year.is_integer():
        return int(year)
    return year


def get_eg_str(word_decade_df, max_sents=10, random_state=0):
    """Build a text block of example sentences grouped by source document.

    Args:
        word_decade_df: DataFrame with the columns ``url``, ``creator``,
            ``title``, ``year``, ``isPartOf``, ``volumeNumber``,
            ``issueNumber`` and ``sent``.
        max_sents: Maximum number of rows (sentences) to include; larger
            frames are randomly down-sampled to this many rows.
        random_state: Seed forwarded to ``DataFrame.sample`` so the chosen
            examples are reproducible between runs. Pass ``None`` for a
            different sample on every call.

    Returns:
        One entry per ``url`` (a citation line followed by one ``  * ``
        bullet per sentence), entries separated by a blank line.
    """
    if len(word_decade_df) > max_sents:
        egdf = word_decade_df.sample(max_sents, random_state=random_state)
    else:
        egdf = word_decade_df

    egs = []
    for _, url_df in egdf.groupby('url'):
        # Citation metadata is taken from the first row of each document.
        row = url_df.iloc[0]
        # Build the author string without touching row['creator'] itself:
        # `+=` on a list-valued cell would extend that list in place with the
        # individual characters of " et al.".
        author = _format_authors(row['creator'])
        title = row['title']
        year = _format_year(row['year'])
        journal = row['isPartOf']
        vol = row['volumeNumber']
        issue = row['issueNumber']

        lines = [f'{author}, "{title}", _{journal}_, vol. {vol}, no. {issue} ({year})']
        lines.extend(f'  * {sent}' for sent in url_df['sent'].tolist())
        egs.append("\n".join(lines))
    return '\n\n'.join(egs)



    
    

In [None]:
# total_df_adjs

In [None]:
# Example text for every (decade, token0) group of the filtered rows.
word_decade_to_egs = {
    (decade, word): get_eg_str(word_decade_df)
    for (decade, word), word_decade_df in total_df_adjs.groupby(['decade', 'token0'])
}

# Start from the per-decade token table returned by get_decade_counts
# (bound to `decade_word_counts`) and attach the examples by
# (decade, token) key.
out_df = decade_word_counts.copy()
out_df['egs'] = [
    word_decade_to_egs[key]
    for key in zip(out_df['decade'], out_df['token'])
]

# Add the per-token summary columns; columns that exist on both sides keep
# their name for the per-decade value and get a `_word` suffix for the
# per-token one.
out_df = out_df.merge(word_df, on='token', how='left', suffixes=('', '_word'))




In [None]:
# Display the combined per-decade / per-token table before exporting it.
out_df

In [None]:
# !pip install openpyxl
# Write the combined table to an Excel workbook, omitting the DataFrame index.
out_df.to_excel('word_decade_stats.xlsx', index=False)