# Creating feature sets about the sonnets

In [16]:
# Where is the corpus?
# `metadata_fn` is derived from `corpus_folder` so that pointing the notebook
# at another corpus directory only requires changing one value.
corpus_folder='corpus'
metadata_fn=corpus_folder+'/metadata.txt'

In [17]:
# importing
import pandas as pd,os
# Display settings for this notebook: show every column of a frame,
# but truncate long frames to 5 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 5

## Loading metadata

In [19]:
# loading metadata
# Read the tab-separated metadata table.
df=pd.read_csv(metadata_fn,sep='\t',encoding='utf-8')

# Path of each row's text file: <corpus_folder>/<sample_name>/<idz>.txt
df['fn']=[
    os.path.join(corpus_folder,sample_name,idz+'.txt')
    for sample_name,idz in zip(df['sample_name'],df['idz'])
]

# Drop the '&indent;' marker from the text stored in column 'l'.
df['l']=[unicode(line).replace('&indent;','') for line in df['l']]

# Keep only the columns used below and index the table by file path.
df=df[[
    'sample_name','group','fn','title',
    'author','year','l','num_lines',
]].set_index('fn')
df

Unnamed: 0_level_0,sample_name,group,title,author,year,l,num_lines
fn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
corpus/Sonnets/Z300342466.txt,Sonnets,1600-1624,[i] [The Instabilitie of Mortall Glorie.],"Drummond, William, 1585-1649",1913,"Proude Obeliskes, Tombes of the vastest frame,",14
corpus/Sonnets/Z200273009.txt,Sonnets,1600-1624,"Sonnet, XIIII. [Lord, my dryrie foes, why doe...","Barnfield, Richard, 1574-1627",1594,"Me for to ruinate, sundry be couetous.",16
...,...,...,...,...,...,...,...
corpus/NotSonnets/Z300202663.txt,NotSonnets,1950-1974,Corrected Review: THEREISATREEMOREANCIENTTHANEDEN,"Harper, Michael S., 1938-",1970,"achieved in the imagination conjured,",29
corpus/NotSonnets/Z400358659.txt,NotSonnets,1950-1974,ENDYMION,"Grossman, Allen, 1932-",1986,To fight the crocodile you must be young.,43


## Featureset #1: Poesy

In [24]:
import codecs,os
from poesy import Poem,product,scheme2edges

def do_get_feats(fn):
    """Extract poesy's statistics for one poem file as a flat feature dict.

    The file is read as UTF-8, parsed with ``poesy.Poem``, and the poem's
    ``statd`` mapping is returned after these adjustments:

    * ``fn`` is stored under the key ``'fn'``;
    * the ``'rhyme_schemes'`` entry (an iterable of
      ``((scheme_name, scheme_values), accuracy)`` pairs) is removed and
      replaced by one ``'rhyme_scheme_acc_<scheme_name>'`` entry per scheme;
    * every pair ``(l1, l2)`` produced by ``scheme2edges(poem.rhyme_ids)``
      adds a ``'rhymes_lNN-lMM'`` entry with value 1, where NN and MM are
      ``l1 + 1`` and ``l2 + 1`` zero-padded to two digits.

    The path and the full text of the file are printed as progress output.

    Parameters
    ----------
    fn : str or unicode
        Path of the text file to process.

    Returns
    -------
    dict
        The feature dict described above, or an empty dict when an
        ``AttributeError`` is raised while processing the file (the failure
        is reported on stdout).
    """
    # Single-argument, parenthesised prints behave the same under Python 2's
    # print statement and Python 3's print function.
    print('>> '+fn+' ...')
    try:
        with codecs.open(fn,encoding='utf-8') as f:
            txt=f.read()
        print('~~')
        print(txt)
        print('~~')
        poem=Poem(txt)
        statd=poem.statd
        statd['fn']=fn

        # convert list to feats
        # pop() removes the raw list in the same step and tolerates its absence.
        rhyme_schemes=statd.pop('rhyme_schemes',None)
        for (schemename,_schemevals),acc in rhyme_schemes or ():
            statd['rhyme_scheme_acc_'+schemename]=acc

        # One indicator feature per rhyming pair of lines.
        # NOTE(review): assumes scheme2edges yields 0-based line indices - confirm.
        for l1,l2 in sorted(scheme2edges(poem.rhyme_ids)):
            statd['rhymes_l{0}-l{1}'.format(str(l1+1).zfill(2),str(l2+1).zfill(2))]=1
        return statd
    except AttributeError as err:
        # Best effort: such a file contributes no features, but report it so
        # that missing rows in the results can be traced back to their cause.
        print('!! no features for '+fn+': '+repr(err))
        return {}

In [25]:
# Try the extractor on the first file in the metadata index.
do_get_feats(df.index[0])

>> corpus/Sonnets/Z300342466.txt ...
~~
Triumphant Arches, Statues crowned with Baize,
Proud Obeliskes, Tombes of the vastest frame,
Colosses, brazen Atlases of Fame,
Phanes vainly built to vain Idols praise;
States, which vnsatiate Mindes in blood do raise,
From the Crosse-starres unto the Articke Teame,
Alas! and what we write to keep our Name,
Like Spiders Caules are made the sport of Days:
All only constant is in constant Change,
What done is, is undone, and when undone,
Into some other figure doth it range;
Thus moves the restless World beneath the Moon:
    Wherefore (my Minde) above Time, Motion, Place,
    Thee raise, and Steppes, not reached by Nature trace.
~~


{'beat_scheme': (5,),
 'beat_scheme_diff': 12,
 'beat_scheme_length': 1,
 'beat_scheme_repr': 'Pentameter',
 'beat_scheme_type': 'Invariable',
 'fn': u'corpus/Sonnets/Z300342466.txt',
 'meter_ambiguity': 2.357142857142857,
 'meter_constraint_TOTAL': 0.12598425196850394,
 'meter_constraint_footmin-f-resolution': 0.007874015748031496,
 'meter_constraint_footmin-w-resolution': 0.0,
 'meter_constraint_strength_w=>-p': 0.0,
 'meter_constraint_stress_s=>-u': 0.031496062992125984,
 'meter_constraint_stress_w=>-p': 0.08661417322834646,
 'meter_length_avg_line': 9.357142857142858,
 'meter_length_avg_parse': 9.357142857142858,
 'meter_mpos_s': 0.4881889763779528,
 'meter_mpos_ss': 0.015748031496062992,
 'meter_mpos_w': 0.48031496062992124,
 'meter_mpos_ww': 0.015748031496062992,
 'meter_perc_lines_ending_s': 0.9285714285714286,
 'meter_perc_lines_ending_w': 0.07142857142857142,
 'meter_perc_lines_fourthpos_s': 0.8571428571428571,
 'meter_perc_lines_fourthpos_w': 0.14285714285714285,
 'meter_perc

In [None]:
# The do_get_feats function above is saved in poesy_process.py.
# I then run it on the files listed in the metadata, using MPI parallel processing through 'slingshot'.
# slingshot = https://github.com/quadrismegistus/mpi-slingshot

!slingshot -sling poesy_process.py -stone do_get_feats -path corpus/metadata.txt -pathkey fn -savedir data_poesy -parallel 16


!! EXCUTING NOW @ 20190211-1759-27 !!

>> SLINGSHOT COMMAND:
slingshot -sling poesy_process.py -stone do_get_feats -path /Users/ryan/DH/poetry/classification/sonnetproject/corpus/metadata.txt -savedir data_poesy -parallel 16

>> EXECUTING COMMAND:
mpirun -np 16 python -c "from mpi_slingshot import slingshot; slingshot(path_sling='poesy_process.py', stone_name='do_get_feats', path_source='/Users/ryan/DH/poetry/classification/sonnetproject/corpus/metadata.txt', path_ext='None', path_key='fn', path_prefix='', path_suffix='', limit=None, results_dir='data_poesy', cache_results=True, save_results=True, txt_maxcols=10000, num_runs=1)"  | tee /dev/tty > data_poesy/output.txt

>> SLINGSHOT: initializing MPI with size 16 and rank 3
>> SLINGSHOT: initializing MPI with size 16 and rank 9
>> SLINGSHOT: initializing MPI with size 16 and rank 12
>> SLINGSHOT: initializing MPI with size 16 and rank 13
>> SLINGSHOT: initializing MPI with size 16 and rank 14
>> SLINGSHOT: initializing MPI with size 16

In [10]:
# Tabulated results saved in data_poesy/results.txt
df_feats_poesy=pd.read_csv('data_poesy/results.txt',encoding='utf-8',sep='\t')

In [11]:
# Strip the absolute project prefix from every path in `fn`, then index the
# feature table by the shortened path.
# NOTE(review): the prefix is specific to the machine the features were computed on.
df_feats_poesy=df_feats_poesy.assign(
    fn=[
        unicode(path).replace('/Users/ryan/DH/poetry/classification/','')
        for path in df_feats_poesy['fn']
    ]
).set_index('fn')

In [12]:
# Preview the first rows of the poesy feature table.
df_feats_poesy.head()

Unnamed: 0_level_0,_path,beat_scheme,beat_scheme_diff,beat_scheme_length,beat_scheme_repr,beat_scheme_type,meter_ambiguity,meter_constraint_TOTAL,meter_constraint_footmin-f-resolution,meter_constraint_footmin-w-resolution,...,rhymes_l11-l13,rhymes_l11-l14,rhymes_l12-l13,rhymes_l12-l14,rhymes_l13-l14,syll_scheme,syll_scheme_diff,syll_scheme_length,syll_scheme_repr,syll_scheme_type
fn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
corpus2/Sonnets/Z400342329.txt,corpus2/Sonnets/Z400342329.txt,"[5, 5, 4, 5, 5, 5, 4]",11.0,7.0,Complex (5-5-4-5-5-5-4),Complex,3.071429,0.16,0.0,0.0,...,,,,,1.0,"[10, 10, 10, 10, 10, 8, 10]",9.0,7.0,Complex (10-10-10-10-10-8-10),Complex
corpus2/Sonnets/Z200344673.txt,corpus2/Sonnets/Z200344673.txt,"[5, 5, 4, 5, 5, 5, 5]",5.0,7.0,Complex (5-5-4-5-5-5-5),Complex,2.285714,0.137405,0.0,0.0,...,,1.0,,1.0,,"[10, 10, 8, 10, 10, 10, 10]",6.0,7.0,Complex (10-10-8-10-10-10-10),Complex
corpus2/Sonnets/Z300488995.txt,corpus2/Sonnets/Z300488995.txt,[5],6.0,1.0,Pentameter,Invariable,2.428571,0.180451,0.0,0.0,...,1.0,1.0,,,1.0,[10],4.0,1.0,10,Invariable
corpus2/NotSonnets/Z200448334.txt,corpus2/NotSonnets/Z200448334.txt,[5],0.0,1.0,Pentameter,Invariable,1.714286,0.067164,0.007463,0.0,...,,,,,1.0,[10],0.0,1.0,10,Invariable
corpus2/Sonnets/Z300306252.txt,corpus2/Sonnets/Z300306252.txt,[5],8.0,1.0,Pentameter,Invariable,4.714286,0.228571,0.021429,0.014286,...,,1.0,1.0,,,"[12, 10, 12, 10, 10, 10, 10]",14.0,7.0,Complex (12-10-12-10-10-10-10),Complex


In [13]:
# Write the poesy feature table to a tab-separated, UTF-8 encoded file.
df_feats_poesy.to_csv('featuresets/data.feats.poesy.sonnets-vs-nonsonnets.txt',sep='\t',encoding='utf-8')
# last saved: 2/11 13:14

## Featureset #2: Word frequencies

In [10]:
def tokenize_fast(line):
    """Split one line of text into lowercase tokens.

    The line is lowercased first; a token is then any maximal run of word
    characters, apostrophes and hyphens.

    Parameters
    ----------
    line : unicode
        Text to tokenize.

    Returns
    -------
    list
        The tokens in order of appearance; empty when the line has none.
    """
    import re
    # re.UNICODE makes \w cover non-ASCII letters and digits. Without it,
    # Python 2 treats \w as [a-zA-Z0-9_] even on unicode input, which cuts
    # words apart at every accented character.
    # NOTE(review): the two [A-Z] alternatives can never match because the
    # input is lowercased before matching; only the last alternative is live.
    return re.findall(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+",line.lower(),re.UNICODE)

def count_words_fast(path):
    """Count the tokens of a UTF-8 text file.

    Parameters
    ----------
    path : str or unicode
        File to read. Bytes that are not valid UTF-8 are dropped
        (``errors='ignore'``).

    Returns
    -------
    collections.Counter
        Maps each token produced by ``tokenize_fast`` to the number of times
        it occurs in the file.
    """
    # Imported locally, like Counter, so that this cell does not depend on the
    # `import codecs` that only appears in the poesy section's cell.
    import codecs
    from collections import Counter
    counts=Counter()
    with codecs.open(path,encoding='utf-8',errors='ignore') as f:
        # Feed the counter one line at a time; this gives the same totals as
        # counting a lazily chained token stream, without needing the
        # Python-2-only `future_builtins.map`.
        for line in f:
            counts.update(tokenize_fast(line))
    return counts

In [11]:
def build_wordfreq_featureset(df,nlargest=1000):
    """Build a file-by-word count table limited to the most frequent words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the paths of the text files to count.
    nlargest : int, default 1000
        Number of words to keep, ranked by their total count over all files.

    Returns
    -------
    pandas.DataFrame
        One row per file (index named ``_fn``) and one column per kept word.
        A word that does not occur in a file has count 0.
    """
    fns=df.index
    results=[count_words_fast(fn) for fn in fns]
    # Label the rows while the frame is built. Adding a '_fn' column to the
    # column-sliced frame afterwards triggers pandas' SettingWithCopyWarning
    # and would overwrite a word column that happens to be called '_fn'.
    odf=pd.DataFrame(results,index=pd.Index(fns,name='_fn')).fillna(0)
    top_cols=odf.sum(0).nlargest(nlargest).index
    return odf[top_cols]

In [12]:
# Build the word-frequency table for every file in the metadata index
# (default nlargest=1000) and show its dimensions.
df_feats_wordfreq = build_wordfreq_featureset(df)
df_feats_wordfreq.shape

(1300, 1000)

In [13]:
# Write the word-frequency feature table to a tab-separated, UTF-8 encoded file.
df_feats_wordfreq.to_csv('featuresets/data.feats.wordfreqs.sonnets-vs-nonsonnets.txt',sep='\t',encoding='utf-8')
# last saved: 2/11 13:14