# Making poem corpus

In [43]:
# output_folder: directory that receives metadata.txt and the per-sample text files
# target_col:    metadata column that save_corpus groups the rows by
output_folder, target_col = 'corpus', 'sample_name'

## Getting sonnet IDs

In [None]:
# Read the ChadwyckPoetry metadata table (tab-separated, UTF-8) into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific.
import pandas as pd
metadata_fn = '/Users/ryan/DH/lit/corpus/chadwyck_poetry/corpus-metadata.ChadwyckPoetry.txt.gz'
dfmeta = pd.read_csv(
    metadata_fn,
    sep='\t',
    encoding='utf-8',
)
dfmeta.head()

In [22]:
# Drop rows whose id is the empty string, then report (#rows, #cols)
dfmeta = dfmeta[dfmeta['id'] != '']
dfmeta.shape

(214542, 103)

In [23]:
# Ten most frequent genre labels in the corpus, with the number of poems carrying each
(
    dfmeta['attgenre']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

Sonnet             8468
Lyric              3729
Ode                2313
Heroic couplets    2285
Metrical Psalm     2093
Hymn               1163
Verse epistle       746
Pastoral poem       431
Ballad              323
Dialect poem        266
Name: attgenre, dtype: int64

## Making samples

In [24]:
# Load the corpus through the project-local `lit` package.
# CP is used below to build the period grouping (CP.new_grouping) and to look up
# individual texts by id (CP.textd) when the corpus files are written.
import lit
CP=lit.load_corpus('ChadwyckPoetry')

In [25]:
# Separate into quarter-centuries (yearbin=25), according to when an author was
# 30 years old (most reliable data).
# CPgroups.textid2group is read afterwards to label each poem with its period.
CPgroups = CP.new_grouping()
CPgroups.group_by_author_at_30(yearbin=25)

>> streaming as tsv: /Users/ryan/DH/lit/corpus/chadwyck_poetry/corpus-metadata.ChadwyckPoetry.txt.gz
   done [20.0 seconds]


In [12]:
# Label every poem with its quarter-century group; ids absent from the grouping get ''
dfmeta['group'] = dfmeta['id'].map(
    lambda text_id: CPgroups.textid2group.get(text_id, '')
)

In [26]:
# Keep poems whose period label sorts at or after '1600' and before '2000',
# and which have at least 14 lines (num_lines >= 14)
dfmeta = dfmeta.loc[
    (dfmeta['group'] >= '1600')
    & (dfmeta['group'] < '2000')
    & (dfmeta['num_lines'] >= 14)
]
dfmeta.shape

(214542, 103)

In [27]:
# Split the metadata on the genre label: exactly 'Sonnet' vs. every other row
# (rows with a missing genre also compare unequal, so they land in notsonnets)
sonnets = dfmeta[dfmeta['attgenre'] == 'Sonnet']
notsonnets = dfmeta[dfmeta['attgenre'] != 'Sonnet']
len(sonnets), len(notsonnets)

(8468, 206074)

In [28]:
# Map each sample label to its DataFrame: {SampleName: Sample}
samples = dict(Sonnets=sonnets, NotSonnets=notsonnets)

In [29]:
# Balance the sample between sonnets vs. non-sonnets by quarter-century:
# - take an equal number of rows from every sample within each 'group',
# - up to maxlen (50), or exactly maxlen if allow_less_than_max==False
def balance_samples(samples,groupcol='group',maxlen=50,allow_less_than_max=False,random_state=None):
    """Draw equally sized random subsamples from every sample, group by group.

    For each distinct value found in `groupcol` (visited in sorted order), the
    rows each sample has in that group are counted. The draw size is the
    smallest of those counts, capped at `maxlen`. A group in which any sample
    has no rows is skipped.

    Parameters
    ----------
    samples : dict
        {sample_name: DataFrame}; every frame must contain `groupcol`.
    groupcol : str
        Column whose values define the groups to balance within.
    maxlen : int
        Upper bound on the rows drawn per sample per group.
    allow_less_than_max : bool
        If False, a group that cannot supply `maxlen` rows from every sample
        is skipped entirely; if True it contributes the smaller common count.
    random_state : None, int or numpy.random.RandomState, optional
        Seed for reproducible draws. An integer seeds one generator that is
        shared by all draws in this call. None (the default) leaves seeding
        to pandas, i.e. the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all samples, each with a 'sample_name' column
        holding the key of the sample it came from. The frame has no rows
        and no columns when no group qualifies.
    """
    import numbers
    if isinstance(random_state, numbers.Integral):
        # One generator for the whole call, so successive draws differ from
        # each other while the overall result is repeatable for a given seed.
        import numpy as np
        rng = np.random.RandomState(random_state)
    else:
        # None or a ready-made generator object: hand it to pandas untouched.
        rng = random_state

    group_types = set()
    for sample in samples.values():
        group_types |= set(sample[groupcol])

    records = []
    for gt in sorted(group_types):
        # rows of each sample that fall into this group
        in_group = {}
        for sname, sample in samples.items():
            in_group[sname] = sample.loc[sample[groupcol] == gt]

        # the smallest sample decides how many rows everyone contributes
        minlen = min(len(rows) for rows in in_group.values())
        if not minlen:
            continue
        n = min(minlen, maxlen)

        # strict mode: only groups that can fill the full quota are used
        if not allow_less_than_max and n < maxlen:
            continue

        for sname, rows in in_group.items():
            balanced = rows.sample(n=n, random_state=rng)
            balanced['sample_name'] = sname
            records += balanced.to_dict('records')

    return pd.DataFrame(records)

In [30]:
# declutter dataframe
def clean_final(df, folder=None):
    """Return a trimmed copy of `df`, indexed by each row's output file path.

    The input frame is left untouched. In the returned frame:
    - the index 'fn' is <folder>/<sample_name>/<idz>.txt,
    - every value in column 'l' is converted to text and has the literal
      '&indent;' marker removed,
    - only the columns sample_name, id, idz, group, title, author, year, l
      and num_lines are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all of the columns listed above.
    folder : str, optional
        Root folder used to build 'fn'. Defaults to the notebook-level
        `output_folder`.

    Returns
    -------
    pandas.DataFrame
    """
    import os
    if folder is None:
        folder = output_folder
    # `unicode` only exists on Python 2; fall back to `str` elsewhere
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # work on a copy so the caller's frame is not modified
    cleaned = df.copy()
    cleaned['fn'] = [
        os.path.join(folder, sname, idz + '.txt')
        for sname, idz in zip(cleaned['sample_name'], cleaned['idz'])
    ]
    cleaned['l'] = [text_type(l).replace('&indent;', '') for l in cleaned['l']]
    keep = ['sample_name', 'id', 'idz', 'group', 'fn', 'title', 'author', 'year', 'l', 'num_lines']
    return cleaned[keep].set_index('fn')

In [32]:
# Balance the two samples per period, then trim the result to the final metadata table
final_df = clean_final(
    balance_samples(samples)
)

In [33]:
# Rows per sample label after balancing
final_df.sample_name.value_counts()

NotSonnets    650
Sonnets       650
Name: sample_name, dtype: int64

In [34]:
# Rows per period (quarter-century group) after balancing
final_df.loc[:, 'group'].value_counts()

1900-1924    100
1850-1874    100
1650-1674    100
1775-1799    100
1800-1824    100
1925-1949    100
1825-1849    100
1875-1899    100
1950-1974    100
1725-1749    100
1600-1624    100
1625-1649    100
1750-1774    100
Name: group, dtype: int64

In [36]:
# Save this dataframe as tab-separated UTF-8 text at <output_folder>/metadata.txt
import os
# Create the output folder first: to_csv does not create missing directories,
# and nothing earlier in the notebook has made it yet.
if output_folder and not os.path.isdir(output_folder): os.makedirs(output_folder)
ofn=os.path.join(output_folder,'metadata.txt')
final_df.to_csv(ofn, sep='\t',encoding='utf-8')
# single-argument form prints the same text on Python 2 and Python 3
print('>> saved: %s' % ofn)

>> saved: corpus/metadata.txt


## Saving corpus

In [51]:
def _leading_lines_up_to(lines, max_nonempty):
    """Return the leading lines of `lines`, stopping once `max_nonempty` non-empty lines are kept."""
    kept = []
    nonempty = 0
    for line in lines:
        if nonempty >= max_nonempty: break
        kept.append(line)
        if line: nonempty += 1
    return kept


# Save the corpus as text files at the paths held in the frame's index (e.g. corpus/Sonnets/Z...txt)
def save_corpus(df,max_num_lines=14,output_folder=output_folder):
    """Write a truncated plain-text file for every row of `df`.

    Rows are processed sample by sample (grouped on `target_col`). For each
    row the text is looked up in `CP.textd` by the row's `id`, converted with
    `text_plain()`, stripped, and cut off after its first `max_num_lines`
    non-empty lines (empty lines before the cut-off are kept). The result is
    written with `lit.tools.write2` to the path stored in the row's index
    value; missing parent directories are created. One progress line
    (path and total line count) is printed per file.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by output file path; must have `id` and `target_col` columns.
    max_num_lines : int
        Number of non-empty lines to keep from each text.
    output_folder : str
        Kept for backward compatibility. Output paths are taken from the
        index of `df`, so this value does not affect where files are written.
    """
    import os
    from lit import tools
    for _,smpl in df.groupby(target_col):
        for ofnfn,idx in zip(smpl.index,smpl.id):
            text=CP.textd[idx]
            lines=text.text_plain().strip().split('\n')
            # prints the same text on Python 2 and Python 3
            print('%s %s' % (ofnfn,len(lines)))
            lines_to_keep=_leading_lines_up_to(lines,max_num_lines)
            odir=os.path.dirname(ofnfn)
            # a bare filename has no directory part; os.makedirs('') would raise
            if odir and not os.path.exists(odir): os.makedirs(odir)
            tools.write2(ofnfn,'\n'.join(lines_to_keep))




In [52]:
#final_df

In [54]:
# save_corpus(final_df)
# last run: 2/11 17:44