# Plot data from Hathi, etc

## Setup

In [1]:
import sys; sys.path.append('../..')
from abslithist import *
import mpi_slingshot as sl

In [2]:
# Both locations sit in the 'counts_from_jsons2' folder of COUNT_DIR:
# data_dir is the 'cache' sub-folder, CACHE_DF_FN the 'dfall.ft' file.
data_dir, CACHE_DF_FN = (
    os.path.join(COUNT_DIR, 'counts_from_jsons2', leaf)
    for leaf in ('cache', 'dfall.ft')
)

In [6]:
# Names of the corpora used in this notebook; each name is handed to
# lltk.load() by the loaders below. Commented-out entries are excluded.
CORPORA = [
#     'ARTFL',
#     'BPO',
    'CLMET',
    'COCA',
    'COHA',
    'CanonFiction',
    'Chadwyck',
    'ChadwyckDrama',
    'ChadwyckPoetry',
    'Chicago',
#     'DTA',
    'DialNarr',
    'ECCO',
#     'ECCO_LitLang',
    'ECCO_TCP',
    'EEBO_TCP',
#     'ESTC',
    'EnglishDialogues',
    'FanFic',
    'GaleAmericanFiction',
    'GildedAge',
#     'Hathi',
    'HathiBio',
    'HathiEngLit',
    'HathiEssays',
    'HathiLetters',
    'HathiNovels',
    'HathiProclamations',
    'HathiSermons',
    'HathiStories',
    'HathiTales',
    'HathiTreatises',
    'InternetArchive',
#     'JstorDFR',
#     'LitHist',
#     'LitHistAuthors',
#     'LitHistHathi',
#     'LitHistProse',
#     'LitLab',
    'MarkMark',
    'NewYorker',
    'OldBailey',
    'PMLA',
#     'RavenGarside',
    'SOTU',
    'Sellers',
#     'SemanticCohort',
    'Spectator',
    'TedJDH',
#     'TxtLab'
]

In [7]:
def load_path2meta(corpora=CORPORA, cols=('id', 'year', 'genre', 'medium', 'title')):
    """Map each text's ``path_freqs`` to a dict of its metadata.

    Parameters
    ----------
    corpora : iterable of str
        Corpus names; each one is passed to ``lltk.load``.
    cols : sequence of str
        Metadata columns to keep. A column missing from a corpus's metadata
        is supplied as an empty string.

    Returns
    -------
    dict
        ``{path_freqs: {<cols...>, 'corpus': <corpus name>}}`` for every
        metadata row whose ``id`` is non-empty, is present in the corpus's
        ``textd`` and whose text has a non-empty ``path_freqs``.
    """
    cols = list(cols)
    path2meta = {}

    for corpus in tqdm(corpora, desc='Loading corpus metadata files'):
        C = lltk.load(corpus)
        # reindex builds a *new* frame limited to `cols` and fills any column
        # the corpus lacks with '', so C.metadata itself is left untouched.
        Cdf = C.metadata.reindex(columns=cols, fill_value='')
        for rowd in Cdf.to_dict('records'):
            idx = rowd.get('id')
            if not idx:
                continue
            if idx not in C.textd:
                continue
            path = C.textd[idx].path_freqs
            if not path:
                continue
            rowd['corpus'] = C.name
            path2meta[path] = rowd
    return path2meta

In [5]:
# Build the path -> metadata lookup for every corpus in CORPORA, then show
# how many paths were collected.
path2meta=load_path2meta(CORPORA)
len(path2meta)

Loading corpus metadata files:   3%|▎         | 1/35 [00:16<09:05, 16.03s/it]


KeyboardInterrupt: 

### Load count data

In [None]:
def load_data(data_dir, path2meta):
    """Join the streamed count records with their text metadata.

    Parameters
    ----------
    data_dir : str
        Directory handed to ``sl.stream_results``, which is iterated as
        ``(path, records)`` pairs where ``records`` is an iterable of dicts.
    path2meta : dict
        Lookup from path to a metadata dict; paths without an entry are
        skipped. The dicts are only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per count record, carrying the record's own fields, the
        metadata fields, ``path_corpus`` (the folder name directly before
        ``/freqs/`` in the path) and the derived columns ``perc_abs``,
        ``perc_conc``, ``perc_neither``, ``abs-conc`` and ``abs/conc``.
        ``year`` is coerced to numeric (unparseable values become NaN).

    The records must provide ``num_abs``, ``num_conc``, ``num_neither`` and
    ``num_total``.
    """
    ld = []
    for path, pathld in sl.stream_results(data_dir):
        meta = path2meta.get(path)
        if meta is None:
            continue
        corp = path.split('/freqs/')[0].split('/')[-1]
        # Work on a copy so the caller's metadata dicts are not altered.
        meta_row = {**meta, 'path_corpus': corp}
        ld.extend({**pdx, **meta_row} for pdx in pathld)
    df = pd.DataFrame(ld)

    ## add essential data
    for x in ['abs', 'conc', 'neither']:
        df[f'perc_{x}'] = df[f'num_{x}'] / df['num_total'] * 100
    df['abs-conc'] = df['perc_abs'] - df['perc_conc']
    df['abs/conc'] = df['perc_abs'] / df['perc_conc']
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    return df

In [None]:
# Load the counts joined with metadata; the bare `df` displays the frame.
df=load_data(data_dir,path2meta)
df

In [None]:
# Spot-check: paths in path2meta that contain 'gildedage'; show their count
# and the first one (meta[0] raises IndexError when nothing matches).
meta = [p for p in path2meta if 'gildedage' in p]
len(meta),meta[0]

In [None]:
# !cat "/home/ryan/lltk_data/corpora/gildedage/freqs/1875.Alcott.Am.F.Eight Cousins.json"

In [None]:
# Corpora listed in CORPORA that have no rows in df.
set(CORPORA) - set(df.corpus)

In [None]:
# Yields {'EEBO_TCP'} if that corpus has rows in df, otherwise an empty set.
set(df.corpus) & {'EEBO_TCP'}

In [None]:
# NOTE(review): `stop` is not defined anywhere in this file; presumably a
# deliberate NameError to halt a "run all" at this point - confirm.
stop

In [None]:
# check year cols
def check_year_cols(corpora):
    """Print, for each named corpus, its name and its metadata ``year`` column."""
    for corpus_name in corpora:
        loaded = lltk.load(corpus_name)
        print(loaded.name)
        print(loaded.metadata.year)
        # a newline string plus print's own newline leaves a blank gap
        print('\n')

In [None]:
# check_year_cols(CORPORA)

### Run setup

In [None]:
# !rm $CACHE_DF_FN

In [None]:
# Reuse the feather cache when it is on disk; otherwise build the frame from
# the corpora and write the cache for next time.
if os.path.exists(CACHE_DF_FN):
    df = pd.read_feather(CACHE_DF_FN)
else:
    path2meta = load_path2meta(CORPORA)
    df = load_data(data_dir, path2meta)
    df.to_feather(CACHE_DF_FN)
df

In [None]:
# # Filter
# df[~df.year.isna()]


## Inspect

In [None]:
# df.corpus.value_counts()

In [None]:
# df[df.year.isnull()].corpus.value_counts()

In [None]:
# df.genre.value_counts()

In [None]:
# df[df.genre=='Fiction'].corpus.value_counts()

In [None]:
# df[df.genre=='Biography'].corpus.value_counts()

In [None]:
# df[df.genre=='Essays'].corpus.value_counts()

In [None]:
# df[df.genre=='Periodical'].corpus.value_counts()

In [None]:
# Show the rows whose corpus is EEBO_TCP.
df[df.corpus=='EEBO_TCP']

## Filter

In [None]:
# Raw genre label -> coarse genre category.
_GENRE_GROUPS = {
    'Verse': 'Poetry',
    'Poetry': 'Poetry',
    'Fiction': 'Fiction',
    'Novel': 'Fiction',
    'Tale': 'Fiction',
    'Story': 'Fiction',
    'Treatise': 'Essay/Treatise',
    'Essay': 'Essay/Treatise',
    'Letter': 'Letters',
    'Letters': 'Letters',
    'Sermon': 'Sermon',
    'Biography': 'Biography',
}


def fixgenres(row):
    """Collapse a raw genre label into a coarse genre category.

    Parameters
    ----------
    row : mapping-like or str
        Either a record exposing ``.get('genre')`` (a dict or a DataFrame
        row) or the bare genre label, so the function works both row-wise
        (``DataFrame.apply(..., axis=1)``) and value-wise (``Series.apply``).

    Returns
    -------
    str or None
        'Poetry', 'Fiction', 'Essay/Treatise', 'Letters', 'Sermon' or
        'Biography'; None for any other (or missing) label.
    """
    genre = row.get('genre') if hasattr(row, 'get') else row
    return _GENRE_GROUPS.get(genre)


# The notebook also refers to this helper under the singular name.
fixgenre = fixgenres

## Plot

In [None]:
# Derive the coarse genre column. The helper is named `fixgenres` and reads
# the label via .get('genre'), so each label is wrapped in a small record.
df['genre2'] = [fixgenres({'genre': label}) for label in df.genre]
df.genre2.value_counts()

In [None]:
# df[df.genre2.isna()]

In [None]:
# Figure data
# Corpora left out of every figure.
# NOTE(review): this previously read 'Sellars', which matches no entry of
# CORPORA (the list has 'Sellers'), so the filter was a no-op for it.
BAD_CORPORA = ['CLMET', 'Sellers']

# Bin years into 5-year buckets (column name 'dec' kept for compatibility).
df['dec'] = df.year // 5 * 5

# Average the numeric columns per bucket / genre / corpus / period.
# numeric_only=True keeps string columns from being averaged.
figdf = (
    df[~df.genre.isna()]
    .groupby(['dec', 'genre2', 'genre', 'corpus', 'period'])
    .mean(numeric_only=True)
    .reset_index()
)
figdf = figdf[~figdf.corpus.isin(BAD_CORPORA)]
figdf = figdf[figdf.period != 'orig']

# Custom corpus filters: each line keeps all other rows and restricts only
# the named corpus (or genre/corpus pair).
figdf = figdf[(figdf.corpus != 'Chadwyck') | (figdf.year < 1900)]      # Chadwyck: before 1900 only
figdf = figdf[(figdf.corpus != 'MarkMark') | (figdf.year >= 1900)]     # MarkMark: 1900 onwards only
figdf = figdf[(figdf.genre != 'Poetry') | (figdf.corpus != 'ECCO_TCP')]  # no ECCO_TCP poetry
figdf = figdf[(figdf.corpus != 'HathiEngLit') | (figdf.year >= 1720)]  # HathiEngLit: 1720 onwards only
# Overall window: 1600-2000 inclusive.
figdf = figdf[figdf.year >= 1600]
figdf = figdf[figdf.year <= 2000]
figdf.genre.value_counts()

In [None]:
# Number of figure-data rows per corpus.
figdf.corpus.value_counts()

In [None]:
# Number of figure-data rows per corpus among rows whose genre is 'Poetry'.
figdf[figdf.genre=='Poetry'].corpus.value_counts()

In [None]:
# Set the plotnine-wide default figure size (width, height).
p9.options.figure_size=(12,9)

### Figure 1

In [None]:
# fig=p9.ggplot(p9.aes(x='year',y='abs-conc',color='genre',shape='corpus'), data=figdf)
# fig+=p9.geom_point(alpha=0.25)
# fig+=p9.geom_smooth(method='loess',span=0.2)#group='genre')
# fig

### Figure 1C: By genre in column

In [None]:
# figdf['genre']=pd.Categorical(figdf['genre'])
# figdf['genre'].cat.reorder_categories(['Essa'])
# figdf['abs/conc']=figdf['num_abs']/figdf['num_conc']
# z-score the abs-conc difference over all figure rows.
figdf['abs-conc_z'] = zscore(figdf['abs-conc'])

# One panel per coarse genre; points and a loess trend per corpus, with a
# reference line at z = 0 and the y-axis clipped to +/-2.5.
fig = (
    p9.ggplot(
        data=figdf,
        mapping=p9.aes(x='year', y='abs-conc_z', color='corpus'),
    )
    + p9.geom_point(alpha=0.25)
    + p9.geom_smooth(p9.aes(group='corpus'), method='loess', span=0.333)
    + p9.facet_wrap('genre2')
    + p9.ylim(-2.5, 2.5)
    + p9.geom_hline(yintercept=0, size=0.5, alpha=0.55)
)
fig

### Figure 1B
One smoothed trend per genre, pooled across corpora; the points are per-year means within each genre and corpus.

In [None]:
# fig data
# Points are means per (year, genre, corpus); numeric_only=True keeps the
# remaining string columns (e.g. genre2, period) from being averaged.
# data/mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's first two parameters.
fig = p9.ggplot(
    data=figdf.groupby(['year', 'genre', 'corpus']).mean(numeric_only=True).reset_index(),
    mapping=p9.aes(x='year', y='abs-conc', color='genre', shape='corpus'),
)
fig += p9.geom_point(alpha=0.25)
# One loess trend per genre, fitted over the points of all corpora.
fig += p9.geom_smooth(p9.aes(group='genre'), method='loess', span=0.3)
fig

### Figure 2

In [None]:
# Long format: one row per (year, genre, genre2, corpus, period) and
# percentage column, with the column name in 'variable' and its number in
# 'value'.
figdf2 = figdf.melt(
    id_vars=['year', 'genre', 'genre2', 'corpus', 'period'],
    value_vars=['perc_abs', 'perc_conc', 'perc_neither'],
)
# figdf2

In [None]:
# One panel per percentage variable; a loess trend per coarse genre.
fig = (
    p9.ggplot(
        data=figdf2,
        mapping=p9.aes(x='year', y='value', color='genre2', shape='corpus'),
    )
    + p9.geom_point(alpha=0.25)
    + p9.geom_smooth(p9.aes(group='genre2'), method='loess', span=0.3)
    + p9.facet_wrap('variable')
)
fig

In [None]:
# One panel per coarse genre; the three percentage variables get fixed
# colours and the y-axis is limited to 0-70.
variable_colors = {'perc_abs':'#83b9d8', 'perc_conc':'#f9b466', 'perc_neither':'silver'}
fig = (
    p9.ggplot(
        data=figdf2,
        mapping=p9.aes(x='year', y='value', color='variable', shape='corpus'),
    )
    + p9.geom_point(alpha=0.25)
    + p9.geom_smooth(method='loess', span=0.3)
    + p9.facet_wrap('genre2')
    + p9.scale_color_manual(variable_colors)
    + p9.ylim(0, 70)
)
fig