# Making a poem corpus

In [1]:
# Root directory for everything this notebook writes: the per-sample text
# files and the metadata.txt table.
output_folder = 'corpus'

## Getting sonnet IDs

In [2]:
import pandas as pd
# Tab-separated metadata table for the Chadwyck poetry corpus (compressed,
# per the .gz extension; pandas infers the compression from the file name).
# NOTE(review): the saved output of this cell begins with the tail of a
# warning -- presumably pandas' DtypeWarning about mixed-type columns; confirm
# and consider passing explicit dtypes.
metadata_fn='/Users/ryan/DH/lit/corpus/chadwyck_poetry/corpus-metadata.ChadwyckPoetry.txt.gz'
dfmeta=pd.read_csv(
    metadata_fn,
    sep='\t',
    encoding='utf-8',
)
# (rows, columns) of the raw table
dfmeta.shape

  interactivity=interactivity, compiler=compiler, result=result)


(336180, 102)

In [3]:
# Drop rows that have no text id.
# read_csv parses empty fields as NaN by default, and NaN != '' evaluates to
# True, so the string comparison alone never removes such rows (the saved
# shapes before and after this cell are identical).  Test for NaN as well.
dfmeta=dfmeta.loc[dfmeta.id.notnull() & (dfmeta.id!='')]
dfmeta.shape

(336180, 102)

In [4]:
# Ten most frequent values of the `attgenre` column, most common first.
genre_counts=dfmeta['attgenre'].value_counts()
genre_counts.sort_values(ascending=False).head(10)

Sonnet             10136
Lyric               5153
Heroic couplets     4353
Ballad              3181
Metrical Psalm      2987
Ode                 2577
Epigram             1246
Hymn                1232
Verse epistle        895
Epitaph              804
Name: attgenre, dtype: int64

## Making samples

In [5]:
## Divide into groups
import lit
# Corpus handle from the project's `lit` package; used below to build the
# grouping and, when saving, to look texts up by id.
CP = lit.load_corpus('ChadwyckPoetry')

>> reading config files...


In [6]:
# Width, in years, of each period bin (the group labels seen later look like
# '1600-1624').
YEARBIN = 25

# NOTE(review): presumably assigns every text to the bin containing the year
# its author turned 30 -- confirm against the `lit` grouping implementation.
CPgroups = CP.new_grouping()
CPgroups.group_by_author_at_30(yearbin=YEARBIN)

>> streaming as tsv: /Users/ryan/DH/lit/corpus/chadwyck_poetry/corpus-metadata.ChadwyckPoetry.txt.gz
   done [14.0 seconds]


In [7]:
# Attach each poem's period label; ids unknown to the grouping get ''.
lookup_group=CPgroups.textid2group.get
dfmeta['group']=[lookup_group(text_id,'') for text_id in dfmeta['id']]

In [8]:
# Keep poems whose period label sorts (as a string) in ['1600', '2000') and
# that have at least 14 lines.  Poems without a group carry '' and drop out.
in_period=(dfmeta.group>='1600') & (dfmeta.group<'2000')
long_enough=dfmeta.num_lines>=14
dfmeta=dfmeta.loc[in_period & long_enough]
# Disabled: additionally require that the text file exists.
#dfmeta['file_exists']=[CP.textd[idx].exists for idx in dfmeta['id']]
#dfmeta=dfmeta.loc[dfmeta['file_exists']==True]
dfmeta.shape

(214542, 103)

In [9]:
# Split on the attributed genre.  Rows whose genre is missing compare unequal
# to 'Sonnet' and therefore land in `notsonnets`.
is_sonnet = dfmeta.attgenre=='Sonnet'
sonnets = dfmeta.loc[is_sonnet]
notsonnets = dfmeta.loc[~is_sonnet]

In [10]:
# Sample name -> metadata rows; the names become sub-folder names on disk.
samples = dict(
    Sonnets=sonnets,
    NotSonnets=notsonnets,
)

In [32]:
def balance_samples(samples,groupcol='group',maxlen=50,random_state=None):
    """Draw an equally sized random subsample from every sample in every group.

    For each distinct value of `groupcol`, the rows of every sample that
    belong to that group are collected.  A group is used only when *every*
    sample has at least `maxlen` rows in it; exactly `maxlen` rows are then
    drawn without replacement from each sample.  Groups in which any sample
    falls short are skipped entirely, so the result is balanced across both
    samples and groups.

    Parameters
    ----------
    samples : dict
        Maps a sample name to a DataFrame that contains `groupcol`.
    groupcol : str
        Name of the column holding the group label.
    maxlen : int
        Number of rows drawn per sample per group.
    random_state : int, numpy.random.RandomState or None
        Seed or generator for the draws.  An integer seeds one generator that
        is shared by all draws, so the whole result is reproducible.  The
        default ``None`` leaves the sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        The drawn rows with a fresh 0..n-1 index and an added `sample_name`
        column; an empty frame when no group qualifies.
    """
    import numpy as np

    # Use a single generator for every draw; re-seeding each call with the same
    # integer would pick the same row positions in every equally sized frame.
    if isinstance(random_state,(int,np.integer)):
        rng=np.random.RandomState(random_state)
    else:
        rng=random_state

    group_types=set()
    for sample in samples.values(): group_types|=set(sample[groupcol])

    drawn=[]
    for gt in sorted(group_types):
        in_group={}
        for sname,sample in samples.items():
            in_group[sname]=sample.loc[sample[groupcol]==gt]

        # skip the group unless every sample can supply a full `maxlen` rows
        if min(len(sub) for sub in in_group.values())<maxlen: continue

        for sname,sub in in_group.items():
            # .assign returns a new frame, so the caller's data is never
            # modified and no chained-assignment warning is triggered
            drawn.append(sub.sample(n=maxlen,random_state=rng).assign(sample_name=sname))

    # pd.concat raises on an empty list
    if not drawn: return pd.DataFrame()
    return pd.concat(drawn,ignore_index=True)


In [33]:
def clean_final(df,folder=None):
    """Return the balanced metadata trimmed to the output columns, indexed by file path.

    Adds an `fn` column of the form ``<folder>/<sample_name>/<idz>.txt``,
    strips the literal ``&indent;`` marker from the `l` column, keeps only the
    columns written to the metadata table and makes `fn` the index.  The frame
    passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns sample_name, id, idz, group, title, author,
        year, l and num_lines.
    folder : str or None
        Root folder used to build `fn`.  ``None`` (default) uses the
        notebook-level `output_folder`.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by `fn`.
    """
    import os
    # `unicode` only exists on Python 2; fall back to `str` on Python 3
    try:
        text_type=unicode
    except NameError:
        text_type=str
    if folder is None: folder=output_folder

    # work on a copy so the caller's frame does not gain or lose columns
    df=df.copy()
    df['fn']=[os.path.join(folder,sname,idz+'.txt') for sname,idz in zip(df['sample_name'],df['idz'])]
    df['l']=[text_type(l).replace('&indent;','') for l in df['l']]
    keep=['sample_name','id','idz','group','fn','title','author','year','l','num_lines']
    return df[keep].set_index('fn')

In [34]:
# Balance the samples across period groups, then trim to the output columns.
balanced_df=balance_samples(samples)
final_df=clean_final(balanced_df)

In [35]:
# Sanity check: every sample should contribute the same number of poems.
final_df.sample_name.value_counts()

NotSonnets    650
Sonnets       650
Name: sample_name, dtype: int64

In [36]:
# Sanity check: every period group should hold the same number of poems.
final_df.group.value_counts()

1900-1924    100
1850-1874    100
1650-1674    100
1775-1799    100
1800-1824    100
1925-1949    100
1825-1849    100
1875-1899    100
1950-1974    100
1725-1749    100
1600-1624    100
1625-1649    100
1750-1774    100
Name: group, dtype: int64

In [37]:
import os
# Write the sample metadata (indexed by output path) as a UTF-8,
# tab-separated table.  Nothing has created the output folder at this point
# (save_corpus only makes directories later), so make sure it exists first.
if not os.path.exists(output_folder): os.makedirs(output_folder)
ofn=os.path.join(output_folder,'metadata.txt')
final_df.to_csv(ofn, sep='\t',encoding='utf-8')

## Saving corpus

In [38]:
def _keep_leading_lines(lines,max_num_lines):
    """Return the leading lines up to and including the max_num_lines-th non-empty one."""
    kept=[]
    nonempty=0
    for line in lines:
        if nonempty>=max_num_lines: break
        kept.append(line)
        # empty lines are kept in the output but do not count toward the limit
        if line: nonempty+=1
    return kept

def save_corpus(df,max_num_lines=14,output_folder=output_folder):
    """Write the opening lines of every poem in `df` to its own text file.

    For each row, the text is looked up in the corpus by the `id` column, its
    plain text is split into lines, and the leading lines are kept until
    `max_num_lines` non-empty lines have been collected.  The result is
    written to the path stored in the row's index, creating the parent
    directory when needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by destination file path; must contain the columns
        `sample_name` and `id`.
    max_num_lines : int
        Number of non-empty lines to keep from the start of each poem.
    output_folder : str
        Has no effect: the destination of every file comes from the index of
        `df`.  Kept so existing calls that pass it continue to work.
    """
    import os
    from lit import tools
    for _,smpl in df.groupby('sample_name'):
        for ofnfn,idx in zip(smpl.index,smpl.id):
            lines=CP.textd[idx].text_plain().strip().split('\n')
            odir=os.path.dirname(ofnfn)
            # dirname is '' for a bare file name, and os.makedirs('') raises
            if odir and not os.path.exists(odir): os.makedirs(odir)
            # project helper; presumably the source of the '>> saved: ...' log lines
            tools.write2(ofnfn,'\n'.join(_keep_leading_lines(lines,max_num_lines)))

In [39]:
# Write one text file per poem listed in the balanced metadata.
save_corpus(df=final_df)

>> getting spelling modernizer from /Users/ryan/DH/lit/data/spelling_variants_from_morphadorner.txt...
>> saved: corpus2/NotSonnets/Z300275485.txt
>> saved: corpus2/NotSonnets/Z400286971.txt
>> saved: corpus2/NotSonnets/Z200377081.txt
>> saved: corpus2/NotSonnets/Z200539795.txt
>> saved: corpus2/NotSonnets/Z400342708.txt
>> saved: corpus2/NotSonnets/Z300286991.txt
>> saved: corpus2/NotSonnets/Z300381944.txt
>> saved: corpus2/NotSonnets/Z300287385.txt
>> saved: corpus2/NotSonnets/Z200512232.txt
>> saved: corpus2/NotSonnets/Z300275508.txt
>> saved: corpus2/NotSonnets/Z200393530.txt
>> saved: corpus2/NotSonnets/Z400468862.txt
>> saved: corpus2/NotSonnets/Z300396642.txt
>> saved: corpus2/NotSonnets/Z400287339.txt
>> saved: corpus2/NotSonnets/Z200274653.txt
>> saved: corpus2/NotSonnets/Z200540009.txt
>> saved: corpus2/NotSonnets/Z300505924.txt
>> saved: corpus2/NotSonnets/Z300381955.txt
>> saved: corpus2/NotSonnets/Z300492918.txt
>> saved: corpus2/NotSonnets/Z400287079.txt
>> saved: corpus2