In [None]:
# Code
import sys
sys.path.append('../')
from generative_formalism import *
pd.options.display.max_rows = 5


# Notebook title and first section heading (passed to printm, presumably
# rendered as Markdown -- confirm against generative_formalism.printm).
printm('# Corpus: Chadwyck-Healey poetry collections')
printm('## Loading corpus from source')


# Everything below that touches the raw corpus only runs when
# check_chadwyck_healey_paths() returns a truthy value; otherwise a
# "skipping" notice is printed and nothing is loaded.
printm('### Checking for Chadwyck-Healey access')

if check_chadwyck_healey_paths():
    printm('Paths and/or URLs for Chadwyck-Healey corpus exist, loading corpus...')

    # Show the documentation of each loader, then load and display the corpus.
    printm('## Loading corpus')
    documentation(get_chadwyck_corpus_metadata)
    documentation(get_chadwyck_corpus_texts)
    documentation(get_chadwyck_corpus)

    df_corpus = get_chadwyck_corpus()
    display(df_corpus)

    # Re-draw each sample with as_replicated=True and force=True. The
    # assertions pin the expected length of three of the four samples.
    printm('## Replicating corpus sampling')
    documentation(get_chadwyck_corpus_sampled_by)

    printm('### Sampling by rhyme annotation')
    df_smpl_by_rhyme_replicated = get_chadwyck_corpus_sampled_by(
        'rhyme',
        as_replicated=True,
        force=True,
    )
    display(df_smpl_by_rhyme_replicated)
    assert len(df_smpl_by_rhyme_replicated) == 2000
    printm('----')

    printm('### Sampling by period')
    df_smpl_by_period_replicated = get_chadwyck_corpus_sampled_by(
        'period',
        as_replicated=True,
        force=True,
    )
    display(df_smpl_by_period_replicated)
    assert len(df_smpl_by_period_replicated) == 8000
    printm('----')

    printm('### Sampling by period/subcorpus')
    df_smpl_by_period_subcorpus_replicated = get_chadwyck_corpus_sampled_by(
        'period_subcorpus',
        as_replicated=True,
        force=True,
    )
    display(df_smpl_by_period_subcorpus_replicated)
    assert len(df_smpl_by_period_subcorpus_replicated) > 20_000
    printm('----')

    # No size check is made for the sonnet/period sample.
    printm('### Sampling by sonnet/period')
    df_smpl_by_sonnet_period_replicated = get_chadwyck_corpus_sampled_by(
        'sonnet_period',
        as_replicated=True,
        force=True,
    )
    display(df_smpl_by_sonnet_period_replicated)
    printm('----')
else:
    printm('Paths and/or URLs for Chadwyck-Healey corpus do not exist, skipping...')



# This part runs unconditionally (it is outside the path check above) and
# calls the sampler without as_replicated/force.
printm('## Loading samples used in paper')

# Rhyme-annotation sample: fetched with verbose=True, displayed, summarised
# on its `rhyme` attribute, and checked to have length 2000.
printm('### Sample by rhyme annotation')
df_smpl_by_rhyme_in_paper = get_chadwyck_corpus_sampled_by(
    'rhyme',
    verbose=True,
)
display(df_smpl_by_rhyme_in_paper)
describe_qual(df_smpl_by_rhyme_in_paper.rhyme)
assert len(df_smpl_by_rhyme_in_paper) == 2000
printm('----')


# Period sample: fetched with default arguments, displayed, summarised on
# its `period` attribute, and checked to have length 8000.
printm('### Sample by period')
df_smpl_by_period_in_paper = get_chadwyck_corpus_sampled_by(
    'period',
)
display(df_smpl_by_period_in_paper)
describe_qual(df_smpl_by_period_in_paper.period)
assert len(df_smpl_by_period_in_paper) == 8000
printm('----')


# Period/subcorpus sample: displayed, summarised grouped by the
# ('period', 'subcorpus') pair, and checked to have length above 20,000.
printm('### Sample by period/subcorpus')
df_smpl_by_period_subcorpus_in_paper = get_chadwyck_corpus_sampled_by(
    'period_subcorpus',
)
display(df_smpl_by_period_subcorpus_in_paper)
describe_qual_grouped(
    df_smpl_by_period_subcorpus_in_paper,
    ['period', 'subcorpus'],
    name='period/subcorpus',
)
assert len(df_smpl_by_period_subcorpus_in_paper) > 20_000
printm('----')


# Sonnet/period sample: displayed and summarised grouped by 'period'.
# No length assertion is made for this sample.
printm('### Sample by sonnet/period')
df_smpl_by_sonnet_period_in_paper = get_chadwyck_corpus_sampled_by(
    'sonnet_period',
)
display(df_smpl_by_sonnet_period_in_paper)
describe_qual_grouped(
    df_smpl_by_sonnet_period_in_paper,
    ['period'],
    name='sonnet/period',
)
printm('----')