# Compare Models

## Create featuresets

In [4]:
# Paths used throughout the notebook (relative, so they resolve against the
# directory the notebook is run from).
corpus_folder = 'corpus'                 # root folder holding the poem text files
metadata_fn = 'corpus/metadata.txt'      # tab-separated metadata table, one row per poem

In [5]:
# loading metadata
import os
import pandas as pd

# `unicode` on Python 2, `str` on Python 3 -- avoids a NameError on Python 3.
text_type = type(u'')

df = pd.read_csv(metadata_fn, encoding='utf-8', sep='\t')

# Each poem's text file is addressed as <corpus_folder>/<sample_name>/<idz>.txt.
# Zipping the two columns avoids building a full Series per row with iterrows().
df['fn'] = [os.path.join(corpus_folder, sample_name, idz + '.txt')
            for sample_name, idz in zip(df['sample_name'], df['idz'])]

# Strip the literal '&indent;' marker from the `l` column (values are coerced
# to text first, so missing values become the string 'nan').
df['l'] = [text_type(l).replace('&indent;', '') for l in df['l']]

# Keep only the columns used downstream and index the table by file path.
df = df[['sample_name', 'group', 'fn', 'title', 'author', 'year', 'l', 'num_lines']].set_index('fn')
df.shape

(1300, 7)

### Featureset #1: Poesy

In [6]:
import codecs
from poesy import Poem,product,scheme2edges

def do_get_feats(fn):
    """Build a flat feature dict for the poem stored in the UTF-8 file `fn`.

    The file is parsed with `poesy.Poem` and its `statd` dict is used as the
    base of the result, with these adjustments:

    * ``'fn'`` is set to the path that was passed in;
    * the ``'rhyme_schemes'`` entry (a sequence of
      ``((scheme_name, scheme_values), accuracy)`` pairs) is replaced by one
      ``'rhyme_scheme_acc_<scheme_name>'`` key per scheme;
    * one ``'rhymes_lXX-lYY' = 1`` key is added per pair returned by
      ``scheme2edges(poem.rhyme_ids)``.

    Returns an empty dict if an AttributeError is raised anywhere in the
    process (best-effort: such poems are skipped rather than aborting a batch).
    """
    # Single pre-formatted argument: prints the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print('>> %s ...' % fn)
    try:
        with codecs.open(fn,encoding='utf-8') as f:
            txt=f.read()
        poem=Poem(txt)
        statd=poem.statd
        statd['fn']=fn

        # Flatten the list-valued entry into scalar features. pop() with a
        # default tolerates a missing key, and `or ()` covers None / empty.
        for (schemename,schemevals),acc in statd.pop('rhyme_schemes',None) or ():
            statd['rhyme_scheme_acc_'+schemename]=acc

        # One indicator feature per rhyming pair; the ids are shifted by one
        # and zero-padded to two digits (e.g. 'rhymes_l11-l13').
        # NOTE(review): presumably l1/l2 are 0-based line indices -- confirm in poesy.
        for l1,l2 in sorted(scheme2edges(poem.rhyme_ids)):
            statd['rhymes_l{0}-l{1}'.format(str(l1+1).zfill(2),str(l2+1).zfill(2))]=1
        return statd
    except AttributeError:
        # Deliberate best-effort: callers receive an empty feature dict.
        return {}

In [7]:
# Try the extractor on the first file path in the metadata index and display the resulting feature dict.
do_get_feats(df.index[0])

>> corpus/Sonnets/Z200340853.txt ...


{'beat_scheme': (5,),
 'beat_scheme_diff': 12,
 'beat_scheme_length': 1,
 'beat_scheme_repr': 'Pentameter',
 'beat_scheme_type': 'Invariable',
 'fn': u'corpus/Sonnets/Z200340853.txt',
 'meter_ambiguity': 3.642857142857143,
 'meter_constraint_TOTAL': 0.1484375,
 'meter_constraint_footmin-f-resolution': 0.015625,
 'meter_constraint_footmin-w-resolution': 0.0,
 'meter_constraint_strength_w=>-p': 0.0,
 'meter_constraint_stress_s=>-u': 0.0625,
 'meter_constraint_stress_w=>-p': 0.0703125,
 'meter_length_avg_line': 9.571428571428571,
 'meter_length_avg_parse': 9.571428571428571,
 'meter_mpos_s': 0.5,
 'meter_mpos_w': 0.453125,
 'meter_mpos_ww': 0.046875,
 'meter_perc_lines_ending_s': 0.6428571428571429,
 'meter_perc_lines_ending_w': 0.35714285714285715,
 'meter_perc_lines_fourthpos_s': 0.8571428571428571,
 'meter_perc_lines_fourthpos_w': 0.14285714285714285,
 'meter_perc_lines_starting_s': 0.35714285714285715,
 'meter_perc_lines_starting_w': 0.6428571428571429,
 'meter_type_foot': 'binary',
 

In [8]:
# Ran do_get_feats over the corpus in parallel using mpi_slingshot; the function is saved in poesy_process.py
#!slingshot -sling poesy_process.py -stone do_get_feats -path /Users/ryan/DH/poetry/classification/corpus/metadata.txt -savedir data_poesy -parallel 16
# Tabulated results saved in data_poesy/results.txt

In [10]:
# Load the tab-separated table of Poesy features.
df_feats_poesy = pd.read_csv(
    'data_poesy/results.txt',
    sep='\t',
    encoding='utf-8',
)

In [11]:
# The `fn` values in results.txt contain this machine-specific absolute prefix;
# remove it, then index the feature table by the resulting path.
LOCAL_PROJECT_PREFIX = '/Users/ryan/DH/poetry/classification/'
text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3

# assign() builds a new frame rather than mutating the loaded one in place.
df_feats_poesy = df_feats_poesy.assign(
    fn=[text_type(fn).replace(LOCAL_PROJECT_PREFIX, '') for fn in df_feats_poesy['fn']]
).set_index('fn')

In [12]:
# Preview the first rows of the Poesy feature table.
df_feats_poesy.head()

Unnamed: 0_level_0,_path,beat_scheme,beat_scheme_diff,beat_scheme_length,beat_scheme_repr,beat_scheme_type,meter_ambiguity,meter_constraint_TOTAL,meter_constraint_footmin-f-resolution,meter_constraint_footmin-w-resolution,...,rhymes_l11-l13,rhymes_l11-l14,rhymes_l12-l13,rhymes_l12-l14,rhymes_l13-l14,syll_scheme,syll_scheme_diff,syll_scheme_length,syll_scheme_repr,syll_scheme_type
fn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
corpus2/Sonnets/Z400342329.txt,corpus2/Sonnets/Z400342329.txt,"[5, 5, 4, 5, 5, 5, 4]",11.0,7.0,Complex (5-5-4-5-5-5-4),Complex,3.071429,0.16,0.0,0.0,...,,,,,1.0,"[10, 10, 10, 10, 10, 8, 10]",9.0,7.0,Complex (10-10-10-10-10-8-10),Complex
corpus2/Sonnets/Z200344673.txt,corpus2/Sonnets/Z200344673.txt,"[5, 5, 4, 5, 5, 5, 5]",5.0,7.0,Complex (5-5-4-5-5-5-5),Complex,2.285714,0.137405,0.0,0.0,...,,1.0,,1.0,,"[10, 10, 8, 10, 10, 10, 10]",6.0,7.0,Complex (10-10-8-10-10-10-10),Complex
corpus2/Sonnets/Z300488995.txt,corpus2/Sonnets/Z300488995.txt,[5],6.0,1.0,Pentameter,Invariable,2.428571,0.180451,0.0,0.0,...,1.0,1.0,,,1.0,[10],4.0,1.0,10,Invariable
corpus2/NotSonnets/Z200448334.txt,corpus2/NotSonnets/Z200448334.txt,[5],0.0,1.0,Pentameter,Invariable,1.714286,0.067164,0.007463,0.0,...,,,,,1.0,[10],0.0,1.0,10,Invariable
corpus2/Sonnets/Z300306252.txt,corpus2/Sonnets/Z300306252.txt,[5],8.0,1.0,Pentameter,Invariable,4.714286,0.228571,0.021429,0.014286,...,,1.0,1.0,,,"[12, 10, 12, 10, 10, 10, 10]",14.0,7.0,Complex (12-10-12-10-10-10-10),Complex


In [13]:
# Write the Poesy feature table out as a tab-separated UTF-8 file.
df_feats_poesy.to_csv(
    'featuresets/data.feats.poesy.sonnets-vs-nonsonnets.txt',
    encoding='utf-8',
    sep='\t',
)
# last saved: 2/11 13:14

### Featureset #2: Word frequencies

In [10]:
def tokenize_fast(line):
    """Split one line of (unicode) text into lowercase tokens.

    The line is lowercased, then every maximal run of word characters,
    apostrophes and hyphens is returned as a token, in order of appearance.

    The original pattern also had two alternatives that required ASCII
    capitals; they could never match because the text is lowercased first,
    so they are dropped without changing the result. re.UNICODE makes `\\w`
    match non-ASCII letters on Python 2 as well (it is already the default
    for text on Python 3), so accented words are kept whole.
    """
    import re
    return re.findall(r"['\w\-]+", line.lower(), flags=re.UNICODE)

def count_words_fast(path):
    """Count how often each token occurs in the UTF-8 text file at `path`.

    The file is read line by line, each line is tokenized with
    `tokenize_fast`, and the tokens are tallied. Bytes that cannot be decoded
    as UTF-8 are skipped (errors='ignore').

    Returns a `collections.Counter` mapping token -> number of occurrences.
    """
    # Imported here so the function does not depend on `codecs` having been
    # imported by an earlier cell of the notebook.
    import codecs
    from collections import Counter
    from itertools import chain
    with codecs.open(path,encoding='utf-8',errors='ignore') as f:
        # A generator keeps tokenization lazy (one line at a time) without the
        # Python-2-only `future_builtins.map`.
        return Counter(chain.from_iterable(tokenize_fast(line) for line in f))

In [11]:
def build_wordfreq_featureset(df,nlargest=1000):
    """Build a document-by-word count table for the files listed in `df.index`.

    Parameters
    ----------
    df : DataFrame whose index holds the paths of the text files to count.
    nlargest : number of words to keep, chosen by highest total count across
        all files.

    Returns
    -------
    DataFrame with one row per file (index named '_fn', same order as
    `df.index`) and one column per retained word; a word absent from a file
    has count 0.
    """
    fns=df.index
    # One Counter per file -> one row per file; words missing from a file are NaN -> 0.
    counts=pd.DataFrame([count_words_fast(fn) for fn in fns]).fillna(0)
    top_cols=counts.sum(0).nlargest(nlargest).index
    # Explicit copy, then set the index directly: avoids assigning a helper
    # column onto a sliced frame (SettingWithCopyWarning) and cannot clobber
    # a real token column that happens to be called '_fn'.
    feats=counts[top_cols].copy()
    feats.index=pd.Index(fns,name='_fn')
    return feats

In [12]:
# Count words for every poem listed in the metadata table and keep the 1000 most frequent.
df_feats_wordfreq = build_wordfreq_featureset(df, nlargest=1000)
df_feats_wordfreq.shape

(1300, 1000)

In [13]:
# Write the word-frequency feature table out as a tab-separated UTF-8 file.
df_feats_wordfreq.to_csv(
    'featuresets/data.feats.wordfreqs.sonnets-vs-nonsonnets.txt',
    encoding='utf-8',
    sep='\t',
)
# last saved: 2/11 13:14