# Prosodic parsing

## Constants

In [31]:
# Names of two of the parsing entry points defined further down in this notebook
# (parse_by_phrase is not listed here).
# NOTE(review): presumably read by external tooling to discover entry points -- confirm.
STONES = ['parse_by_line', 'parse_by_window']

In [71]:
# Example Chadwyck poetry XML files used by the testing cells in this notebook.
# NOTE(review): these are absolute paths on a local volume; adjust them before running elsewhere.
eg_path1='/Volumes/Present/DH/corpora/chadwyck_poetry/xml/african-american/beadlesa/Z200265018.xml'
eg_path2='/Volumes/Present/DH/corpora/chadwyck_poetry/xml/english/miscell2/Z200439011.xml'

## Imports

In [48]:
import prosodic as p

## Functions

In [49]:
def text_plain(path, OK=['l','lb'], BAD=['note','edit'], body_tag='poem', line_lim=None, modernize_spelling=True):
    """
    Convert a Chadwyck poetry XML file to text via llp's ``xml2txt``.

    Every argument is forwarded unchanged: ``path`` positionally, the rest
    as keyword arguments of the same name. The return value is whatever
    ``xml2txt`` returns.
    """
    # Imported at call time rather than when the cell is defined.
    from llp.corpus.chadwyck_poetry import xml2txt as convert

    options = dict(
        OK=OK,
        BAD=BAD,
        body_tag=body_tag,
        line_lim=line_lim,
        modernize_spelling=modernize_spelling,
    )
    return convert(path, **options)

In [50]:
#text_plain(eg_path)

In [51]:
def read_file(path_to_txt_or_xml_file):
    """
    Return the text content of a file, or '' if it cannot be read.

    Paths ending in '.xml' are converted with ``text_plain``; any other
    path is opened and read as-is. An IOError raised on either route is
    swallowed and an empty string is returned instead.
    """
    is_xml = path_to_txt_or_xml_file.endswith('.xml')
    try:
        if is_xml:
            return text_plain(path_to_txt_or_xml_file)
        with open(path_to_txt_or_xml_file) as handle:
            return handle.read()
    except IOError:
        # Best effort: an unreadable file is treated as an empty text.
        return ''

In [78]:
def counts(string, sub):
    """
    Count occurrences of ``sub`` in ``string``, including overlapping ones.

    The search resumes one character after the start of each hit, so
    ``counts('UUU', 'UU')`` is 2 rather than 1.
    """
    total = 0
    hit = string.find(sub)
    while hit != -1:
        total += 1
        # Step forward by a single character so overlapping hits are found.
        hit = string.find(sub, hit + 1)
    return total

## Parsing functions

### Raw string parsing function

In [138]:
def get_data_from_line(line,meter,require_parse_data=True):
    """
    Get data from the prosodic line object, with its meter.

    Parameters
    ----------
    line : prosodic line object
        Must provide ``str_weight()``, ``str_sonority()``, ``str_stress()``,
        ``bestParse(meter)``, ``allParses(meter)``, ``words()`` and ``txt``.
    meter : prosodic meter object
        ``meter.constraints`` is iterated; each constraint's ``name_weight``
        is used as the prefix of two output columns.
    require_parse_data : bool
        When true (the default), return ``{}`` if the line has no best parse.

    Returns
    -------
    dict
        One flat record for the line: the line text, the best parse and its
        meter string, parse/violation counts, per-constraint violation
        counts (best parse and summed over all parses), the stress, weight
        and sonority strings, and counters derived from those strings.
        When there is no best parse (and ``require_parse_data`` is false),
        the parse-specific values are ``''``.
    """

    # get phonological info
    weight_str=line.str_weight()
    sonority_str=line.str_sonority()
    stress_str=line.str_stress()

    # store metrical constraint stats
    bp=line.bestParse(meter)

    # bail out early when the caller insists on parse data and there is none
    if require_parse_data and not bp:
        return {}

    ap=line.allParses(meter)
    output_dict={}
    output_dict['prosodic_line']=line.txt
    output_dict['parse']=bp.posString(viols=True) if bp else ''
    meter_str=output_dict['meter']=bp.str_meter() if bp else ''
    output_dict['num_parses']=len(ap)
    output_dict['num_viols'] = bp.totalCount if bp else ''
    output_dict['score_viols'] = bp.score() if bp else ''
    output_dict['num_sylls']=bp.num_sylls if bp else ''
    output_dict['num_words']=len(line.words())
    for c in meter.constraints:
        sumviol = sum([parse.constraintCounts[c] if c in parse.constraintCounts else 0 for parse in ap])
        output_dict[c.name_weight+'_bestparse']=bp.constraintCounts[c] if bp and c in bp.constraintCounts else 0
        output_dict[c.name_weight+'_allparse_sum']=sumviol if sumviol else 0

    ## store phonological constraint stats
    output_dict['prosodic_stress']=stress_str
    output_dict['prosodic_weight']=weight_str
    output_dict['prosodic_sonority']=sonority_str
    output_dict['num_monosylls']=len([w for w in line.words() if w.numSyll==1])
    # '#' in the patterns below separates the two stress symbols being compared
    output_dict['[*clash_across]']=counts(stress_str,'P#P') + counts(stress_str,'P#S') + counts(stress_str,'S#P') + counts(stress_str,'S#S')
    output_dict['[*clash_within]']=counts(stress_str,'PP') + counts(stress_str,'PS') + counts(stress_str,'SP') + counts(stress_str,'SS')
    output_dict['[*clash_across_primary]']=counts(stress_str,'P#P')
    output_dict['[*clash_within_primary]']=counts(stress_str,'PP')
    output_dict['[*lapse_across]']=counts(stress_str,'U#U')
    output_dict['[*lapse_within]']=counts(stress_str,'UU')
    output_dict['[*WSP]']=0
    output_dict['[*PEAKPROM]']=0
    output_dict['[*High_Stress]']=0
    output_dict['[*Low_Unstress]']=0
    output_dict['[*High_Strong]']=0
    output_dict['[*Low_Weak]']=0

    # Without a best parse meter_str is '' and zip() would stop immediately,
    # leaving the stress/weight/sonority counters at 0 even though they do
    # not depend on the meter. Pad with blanks in that case so those
    # counters are still computed; blanks never match 's' or 'w'.
    meter_by_position = meter_str if bp else ' '*len(stress_str)
    for s,w,hml,mtr in zip(stress_str,weight_str,sonority_str,meter_by_position):
        stressed = s in ('P','S')
        if s=='U' and w=='H':
            output_dict['[*WSP]']+=1
        if stressed and w=='L':
            output_dict['[*PEAKPROM]']+=1

        if hml=='H' and stressed:
            output_dict['[*High_Stress]']+=1
        if hml=='L' and s=="U":
            output_dict['[*Low_Unstress]']+=1

        if hml=='H' and mtr == 's':
            output_dict['[*High_Strong]']+=1
        if hml=='L' and mtr == 'w':
            output_dict['[*Low_Weak]']+=1

    return output_dict

In [139]:
def parse_string(text_str, meter='default_english', num_processes=1):
    """
    Parse a raw string with prosodic, treating each line as one unit.

    Returns a list of per-line dicts as produced by ``get_data_from_line``.
    Lines that yield no data are skipped; each kept dict gets a 1-based
    ``line_id`` giving the line's position in the parse stream.
    """
    # prosodic parse
    text_obj = p.Text(text_str)
    meter_obj = text_obj.get_meter(meter)
    parsed_lines = text_obj.iparse(meter=meter_obj, num_processes=num_processes)

    rows = []
    for line_id, line in enumerate(parsed_lines, start=1):
        row = get_data_from_line(line, meter_obj)
        if row and 'score_viols' in row:
            row['line_id'] = line_id
            rows.append(row)
    return rows

#### Testing

In [140]:
# import pandas as pd
# pd.DataFrame(parse_string("""With what attractive charms this goodly frame 
# Of nature touches the consenting hearts 
# Of mortal men; and what the pleasing stores 
# Which beauteous imitation thence derives 
# To deck the poet's, or the painter's toil; 
# My verse unfolds."""))

### By line in text

In [141]:
def parse_by_line(path_to_txt_or_xml_file, meter='default_english', num_processes=1):
    """
    Read a .txt or .xml file and parse it with ``parse_string``.

    ``meter`` and ``num_processes`` are forwarded to ``parse_string``.
    """
    return parse_string(
        read_file(path_to_txt_or_xml_file),
        meter=meter,
        num_processes=num_processes,
    )

#### Testing

In [142]:
# import pandas as pd
# pd.DataFrame(parse_by_line(eg_path)).sort_values('score_viols').head()

In [143]:
# import pandas as pd
# df_window1=pd.DataFrame(parse_by_line(eg_path1))
# df_window2=pd.DataFrame(parse_by_line(eg_path2))
# print(df_window1.mean()['score_viols'], df_window2.mean()['score_viols'])
# df_window2.sort_values('num_viols').head()

### By window

In [144]:
def slice(l,num_slices=None,slice_length=None,runts=True,random=False):
    """
    Split a list into consecutive segments of equal length.

    Parameters
    ----------
    l : list
        Items to split. It is never modified.
    num_slices : int, optional
        Used only when ``slice_length`` is not given: the segment length
        becomes ``int(len(l)/num_slices)``, but at least 1. Because of
        rounding, the result can hold more than ``num_slices`` segments.
    slice_length : int, optional
        Length of each segment.
    runts : bool
        If true (the default), keep a shorter final segment; otherwise
        drop any segment whose length is not ``slice_length``.
    random : bool
        If true, shuffle a copy of ``l`` before splitting.

    Returns
    -------
    list
        A list of segments. If neither ``num_slices`` nor ``slice_length``
        is given, the (possibly shuffled) flat list is returned unsplit.
    """
    if random:
        # Aliased because the parameter is also called `random`.
        import random as _random
        # Shuffle a copy so the caller's list keeps its order.
        l = list(l)
        _random.shuffle(l)
    if not num_slices and not slice_length: return l
    if not slice_length:
        # Clamp to 1: a zero step would make range() raise when
        # num_slices exceeds len(l) or when l is empty.
        slice_length = max(1, int(len(l)/num_slices))
    newlist=[l[i:i+slice_length] for i in range(0, len(l), slice_length)]
    if runts: return newlist
    return [lx for lx in newlist if len(lx)==slice_length]

def ngram(l,n=3):
    """
    Return every run of ``n`` consecutive items of ``l`` as a tuple.

    The windows overlap and advance one item at a time, in order. An
    input shorter than ``n`` yields an empty list.
    """
    windows = []
    window = []
    for item in l:
        window.append(item)
        if len(window) >= n:
            windows.append(tuple(window))
            # Drop the oldest item so the window slides forward by one.
            del window[0]
    return windows

In [145]:
#slice(read_file(eg_path).split(), slice_length=5)
#len(ngram(read_file(eg_path).split(), n=5))

In [146]:
def parse_by_window(path_to_txt_or_xml_file, meter='default_english', window_size=5,overlapping_windows=False,max_slices=100000,num_processes=1):
    """
    Parse a file in fixed-size word windows instead of by line.

    The file's text is split on whitespace and regrouped into windows of
    ``window_size`` words. Each window becomes one line of the string
    handed to ``parse_string``.

    Parameters
    ----------
    path_to_txt_or_xml_file : str
        File to read with ``read_file``.
    meter : str
        Meter name, forwarded to ``parse_string``.
    window_size : int
        Number of words per window.
    overlapping_windows : bool
        If true, use every run of ``window_size`` consecutive words
        (``ngram``); otherwise use back-to-back windows (``slice``).
    max_slices : int
        Cap on the number of windows. It is applied only to
        non-overlapping windows.
    num_processes : int
        Forwarded to ``parse_string``.

    Returns
    -------
    list of dict
        The result of ``parse_string`` on the windowed text.
    """
    # get txt
    txt=read_file(path_to_txt_or_xml_file)
    words=txt.split()

    if overlapping_windows:
        word_slices = ngram(words,n=window_size)
    else:
        word_slices = slice(words,slice_length=window_size)
        word_slices = word_slices[:max_slices]

    # one window per line, so the line-based parser treats each window as a unit
    txt = '\n'.join([' '.join(slicex) for slicex in word_slices])

    # forward meter and num_processes, which were previously accepted but ignored
    return parse_string(txt, meter=meter, num_processes=num_processes)

#### Testing

In [147]:
# import pandas as pd
# df_window1=pd.DataFrame(parse_by_window(eg_path1))
# df_window2=pd.DataFrame(parse_by_window(eg_path2))
# print(df_window1.mean()['score_viols'], df_window2.mean()['score_viols'])
# df_window2.sort_values('num_viols').head()

### By phrase

In [158]:
# Requires NLTK
def parse_by_phrase(path_to_txt_or_xml_file, meter='default_english', minword=5):
    """
    Parse a file phrase by phrase instead of by line.

    The text is split on ``? . , ; :`` and newlines. If ``minword`` is
    truthy, consecutive fragments are merged until the running phrase has
    at least ``minword`` words. A trailing remainder shorter than
    ``minword`` is not emitted. Each phrase becomes one line of the
    string handed to ``parse_string``.

    Parameters
    ----------
    path_to_txt_or_xml_file : str
        File to read with ``read_file``.
    meter : str
        Meter name, forwarded to ``parse_string``.
    minword : int
        Minimum number of words per phrase. A falsy value disables
        merging, and the raw fragments are used as they are.

    Returns
    -------
    list of dict
        The result of ``parse_string`` on the phrase-per-line text.
    """
    # get txt
    txt=read_file(path_to_txt_or_xml_file)

    # phrases
    import re
    phrases=re.split('[?.,;:\n]', txt)

    # recombine for minword
    if minword:
        merged=[]
        pending=[]
        for fragment in phrases:
            pending.extend(fragment.split())
            if len(pending)>=minword:
                merged.append(' '.join(pending))
                pending=[]
        phrases=merged

    # make txt
    txt = '\n'.join(phrases)

    # return parsed, forwarding meter, which was previously accepted but ignored
    return parse_string(txt, meter=meter)

#### Testing

In [159]:
import pandas as pd
df_window1=pd.DataFrame(parse_by_phrase(eg_path1))
df_window2=pd.DataFrame(parse_by_phrase(eg_path2))
# Average only the score column: a whole-frame .mean() raises on the
# string columns (parse, prosodic_line, ...) with pandas >= 2.0.
print(df_window1['score_viols'].mean(), df_window2['score_viols'].mean())
df_window2.sort_values('score_viols').head()

>> parsing complete in: 1.2151060104370117 seconds
>> parsing line # 100 of 127 lines [ 485.8 syllables/second ]
>> parsing complete in: 0.4264261722564697 seconds
1.4 0.9921259842519685


Unnamed: 0,[*High_Stress],[*High_Strong],[*Low_Unstress],[*Low_Weak],[*PEAKPROM],[*WSP],[*clash_across],[*clash_across_primary],[*clash_within],[*clash_within_primary],...,num_parses,num_sylls,num_viols,num_words,parse,prosodic_line,prosodic_sonority,prosodic_stress,prosodic_weight,score_viols
0,0,0,3,3,0,2,0,0,0,0,...,1,10,0,8,the|CUR|few|TOLLS|the|KNELL|of|PART|ing|DAY,The curfew tolls the knell of parting day,LLHMLLLLHM,UPUPUPUPUP,LHHHLHLHHH,0
40,1,1,2,2,0,2,0,0,0,0,...,1,10,0,8,BACK|to.its|MAN|sion|CALL|the|FLEET|ing|BREATH,Back to its mansion call the fleeting breath,LHHLLLLHHL,PUUPUPUPUP,HLLHHHLHHH,0
45,0,0,4,4,0,2,0,0,0,0,...,1,10,0,9,HANDS|that.the|ROD|of|EM|pire|MIGHT|have|SWAYED,Hands that the rod of empire might have swayed,LLLLLLMMLM,PUUPUPUPUP,HLLHLHHHHH,0
50,1,1,5,5,0,1,0,0,0,0,...,1,9,0,8,and|FROZE|the|GE|nial|CURRENT|of.the|SOUL,And froze the genial current of the soul,LMLHLLLLM,UPUPUPUUP,LHLHHHLLH,0
52,0,0,3,3,0,2,0,0,0,0,...,1,8,0,7,the|DARK|unfathomed|CAVES|of|OCEAN||BEAR,The dark unfathomed caves of ocean bear,LLMMLMLL,UPUPUPUP,LHHHLHHH,0
