# Developing text objects further

In [1]:
import sys; sys.path.insert(0,'/Users/ryan/github/cadence/')
from cadence.imports import *

In [51]:
s='Turning and turning in the widening gyre'
txt="""
Turning and turning in the widening gyre. the falcon cannot hear the falconer;
Things fall apart; the centre cannot hold;
Mere anarchy is loosed upon the world. 
The blood-dimmed tide is loosed, and everywhere the ceremony of innocence is drowned;
The best lack all conviction, while the worst   
Are full of passionate intensity.
"""

In [52]:
nltk.sent_tokenize(txt)

['\nTurning and turning in the widening gyre.',
 'the falcon cannot hear the falconer;\nThings fall apart; the centre cannot hold;\nMere anarchy is loosed upon the world.',
 'The blood-dimmed tide is loosed, and everywhere the ceremony of innocence is drowned;\nThe best lack all conviction, while the worst   \nAre full of passionate intensity.']

In [40]:

def to_stanzas_str(full_txt,sep=SEP_STANZA,**kwargs):
    return [st.strip() for st in full_txt.strip().split(sep) if st.strip()]

def to_lines_str(stanza_txt,sep=SEP_STANZA,**kwargs):
    return [st.strip() for st in stanza_txt.strip().split(sep) if st.strip()]

def to_sents_str(stanza_txt,**kwargs):
    return list(nltk.sent_tokenize(stanza_txt))

def limit_lineparts(linepart_toks,min_len=None,max_len=None):
    if not min_len and not max_len: return [linepart_toks]

    lp=[]
    o=[]
    for tok in reversed(linepart_toks):
        lp.insert(0,tok)
        if len(lp)>=max_len:
            o.insert(0,lp)
            lp=[]
    if lp: o.insert(0,lp)
    
    return o
    

def to_lineparts_str(line_str,seps=SEPS_PHRASE,**kwargs):
    lineparts=[]
    linepart=[]
    tokens=list(tokenize_nice(line_str))
    for token in tokens:
        pref,tok,suf = split_punct(token)        
        is_pref_stopper=set(pref)&set(seps)
        is_suf_stopper=set(suf)&set(seps)
        
        if is_pref_stopper:
            lineparts.append(linepart)
            linepart=[]
        
        linepart.append(token)
        
        if is_suf_stopper:
            lineparts.append(linepart)
            linepart=[]

    # add if remaining
    if linepart:
        lineparts.append(linepart)
        linepart=[]
        
    ## Further divide by max_len
    o=[''.join(lpstr2) for lp_toks in lineparts for lpstr2 in limit_lineparts(lp_toks,**kwargs)]        
    return o


In [41]:
nltk.sent_tokenize(txt)

['\nTurning and turning in the widening gyre.',
 'the falcon cannot hear the falconer;\nThings fall apart; the centre cannot hold;\nMere anarchy is loosed upon the world.',
 'The blood-dimmed tide is loosed, and everywhere the ceremony of innocence is drowned;\nThe best lack all conviction, while the worst   \nAre full of passionate intensity.']

In [42]:





def to_lineparts_ld(
        txt_or_fn_or_lpdf,
        lang=DEFAULT_LANG,
        progress=True,
        incl_alt=INCL_ALT,
        num_proc=DEFAULT_NUM_PROC,
        linebreaks=False,
        phrasebreaks=True,
        verse=None,
        prose=None,
        min_len=MIN_WORDS_IN_PHRASE,
        max_len=MAX_WORDS_IN_PHRASE,
        seps=SEPS_PHRASE,
        desc='Iterating over line scansions',
        **kwargs):
    
    if type(txt_or_fn_or_lpdf) == pd.DataFrame:
        odf=resetindex(txt_or_fn_or_lpdf)
        if 'linepart_str' in set(odf.columns):
            return odf
        else:
            raise Exception('Input is neither string or a linepart-df [result of lineparts()]')
    
    full_txt=to_txt(txt_or_fn_or_lpdf)
    if full_txt is None: return
    
    if verse==True or prose==False:
        linebreaks=True
        phrasebreaks=False
    elif prose==True or verse==False:
        linebreaks=False
        phrasebreaks=True

    df=pd.DataFrame()
    dfl=[]
    to_lines_now = to_lines_str if linebreaks else to_sents_str
    kwargs['lang']=lang
    kwargs['incl_alt']=incl_alt
    
        
    objs=[
        dict(
            stanza_i=stanza_i+1,
            line_i=line_i+1,
            linepart_i=linepart_i+1,
            linepart_str=linepart_txt
        )
        for stanza_i,stanza_txt in enumerate(to_stanzas_str(full_txt))
        for line_i,line_txt in enumerate(to_lines_now(stanza_txt))
        for linepart_i,linepart_txt in enumerate(
            to_lineparts_str(
                line_txt,
                seps=seps,
                min_len=min_len,
                max_len=max_len
            ) if phrasebreaks else [line_txt]
        )
    ]
    return objs

In [43]:
to_lineparts_ld('hello world')

[{'stanza_i': 1, 'line_i': 1, 'linepart_i': 1, 'linepart_str': 'hello world '}]

In [44]:
def scan_iter(df_or_txt_or_fn,num_proc=1,lim=None,progress=True,lineparts_ld=[],**kwargs):
    if not lineparts_ld:
        lineparts_ld=to_lineparts_ld(df_or_txt_or_fn,**kwargs)
    iterr_o=pmap_iter(
        do_scan_iter,
        lineparts_ld,
        progress=progress,
        num_proc=num_proc,
        desc='Scanning lines'
    )
    for i,odf in enumerate(iterr_o):
        if lim and i>=lim: break
        yield odf

        
def scan(txt_or_fn,**kwargs):
    o=list(scan_iter(txt_or_fn,**kwargs))
    return pd.concat(o) if len(o) else pd.DataFrame()

def do_scan_iter(rowd,**kwargs):
    lpstr=rowd['linepart_str']
    odf=get_scansion(lpstr,**kwargs)
    for k,v in rowd.items(): odf[k]=v
    return setindex(odf)


In [45]:
# for odf in scan_iter(txt,max_len=2): display(odf)

In [48]:
scan(txt,max_len=2,linebreaks=True,phrasebreaks=True)

Scanning lines [x1]: 100%|███████████████████████████████████████████| 29/29 [00:00<00:00, 86.50it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,is_funcword,is_heavy,is_light,is_peak,is_stressed,is_trough,is_unstressed,linepart_num_monosyll,linepart_num_syll,prom_strength,prom_stress,prom_weight
stanza_i,line_i,linepart_i,linepart_str,word_i,word_str,word_ipa_i,word_ipa,syll_i,syll_str,syll_ipa,syll_stress,syll_weight,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,1,1,Turning,1,Turning,1,'tɛː.nɪŋ,1,Tur,'tɛː,P,,0,0,0,1,1,0,0,0,2,1.0,1.0,
1,1,1,Turning,1,Turning,1,'tɛː.nɪŋ,2,ning,nɪŋ,U,,0,0,0,0,0,1,1,0,2,0.0,0.0,
1,1,2,and turning,1,and,1,ænd,1,and,ænd,U,,1,0,0,0,0,0,1,1,3,,0.0,
1,1,2,and turning,2,turning,1,'tɛː.nɪŋ,1,tur,'tɛː,P,,0,0,0,1,1,0,0,1,3,1.0,1.0,
1,1,2,and turning,2,turning,1,'tɛː.nɪŋ,2,ning,nɪŋ,U,,0,0,0,0,0,1,1,1,3,0.0,0.0,
1,1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1,29,passionate intensity.,1,passionate,1,'pæ.ʃə.nət,3,nate,nət,U,,0,0,0,0,0,0,1,0,7,,0.0,
1,1,29,passionate intensity.,2,intensity.,1,ɪn.'tɛn.sə.tiː,1,in,ɪn,U,,0,0,0,0,0,1,1,0,7,0.0,0.0,
1,1,29,passionate intensity.,2,intensity.,1,ɪn.'tɛn.sə.tiː,2,ten,'tɛn,P,,0,0,0,1,1,0,0,0,7,1.0,1.0,
1,1,29,passionate intensity.,2,intensity.,1,ɪn.'tɛn.sə.tiː,3,si,sə,U,,0,0,0,0,0,1,1,0,7,0.0,0.0,


In [11]:

def Verse(txt,**kwargs):
    kwargs={**dict(verse=True), **kwargs}
    return Text(txt,**kwargs)
def Poem(txt,**kwargs):
    kwargs={**dict(verse=True), **kwargs}
    return Text(txt,**kwargs)
def Prose(txt,**kwargs):
    kwargs={**dict(prose=True), **kwargs}
    return Text(txt,**kwargs)
def FreeVerse(txt,**kwargs):
    kwargs={**dict(linebreaks=True,phrasebreaks=True), **kwargs}
    return Text(txt,**kwargs)



# loading txt/strings
def to_fn_txt(txt_or_fn):
    # load txt
    if type(txt_or_fn)==str and not '\n' in txt_or_fn and os.path.exists(txt_or_fn):
        fn=txt_or_fn
        with open(fn,encoding='utf-8',errors='replace') as f:
            txt=f.read()
    else:
        fn=''
        txt=txt_or_fn
    return (fn,txt.strip())


### convenient objs
def kwargs_key(kwargs,bad_keys={'num_proc','progress','desc'}):
    return ', '.join(
        f'{k}={v}'
        for k,v in kwargs.items()
        if k not in bad_keys
    )
    
    
### texts
class Text(object):
    def __init__(self,txt_or_fn,**kwargs):
        
        self._scans={}
        self._parses={}
        self._lineparts={}
        self._num_lines=None
        self._num_stanzas=None
        self._infod={}
        self._kwargs=kwargs
        
        ## Load texts
        self.fn,self.txt=to_fn_txt(txt_or_fn)
        
    # def __repr__(self):
    #     o=self.txt.split('\n\n')[0] if self.txt is not None else ""
    #     o='\t' + '\n\t'.join(l for l in o.split('\n'))
    #     o=f'''<cadence.Text: {self.first_line} ({self.num_stanzas} stanza{"s" if self.num_stanzas>1 else ""}, {self.num_lines} line{"s" if self.num_lines>1 else ""})>'''.strip()
    #     #o='\n'.join(l.strip() for l in o.split('\n'))
    #     return o
    
    def kwargs(self,**kwargs):
        return {**self._kwargs, **kwargs}
    
    def get_kwargs_key(self,**kwargs):
        return kwargs_key(self.kwargs(**kwargs))


    
    ##################################################################
    ### Stanzas
    def stanzas(self,txt='',**kwargs):
        kwargs=self.kwargs(**kwargs)
        return to_stanzas_str(txt if txt else self.txt,**kwargs)    

    ##################################################################
    ### Lines
    
    def lines(self,txt='',linebreaks=False,prose=False,**kwargs):
        if not txt: txt=self.txt
        kwargs=self.kwargs(**kwargs)
        to_lines_now = to_lines_str if kwargs.get('linebreaks') or kwargs.get('verse') else to_sents_str
        return [
            l
            for stanza_str in self.stanzas(txt,**kwargs)
            for l in to_lines_now(stanza_str, **kwargs)
        ]
    
    def sentences(self,txt='',**kwargs):
        if not txt: txt=self.txt
        kwargs=self.kwargs(**kwargs)
        return [
            lp
            for line_str in self.lines(txt,**kwargs)
            for lp in to_sents_str(line_str, **kwargs)
        ]


    ##################################################################
    ### LINEPARTS
    
    ### Lineparts
    def lineparts(self, txt='', **kwargs):
        if not txt: txt=self.txt
        kwargs=self.kwargs(**kwargs)
        return [
            lp
            for line_str in self.lines(txt,**kwargs)
            for lp in to_lineparts_str(line_str, **kwargs)
        ]        
    
    ##################################################################
    ### SCANS
    
    def scan(self, force=False, **kwargs):
        key=self.get_kwargs_key(**kwargs)
        if force or not key in self._scans:
            self._scans[key]=scan(self.txt,**kwargs)
        return self._scans[key]

    
    
    ##################################################################
    ### PARSE
    
    def parse(self,
            force=False,
            verbose=True,
            only_best=False,
            only_unbounded=True,
            **kwargs):
        kwargs['verbose']=verbose
        kwargs_line={**self.kwargs, **kwargs, **{'by_syll':False}}
        kwargs_syll={**self.kwargs, **kwargs, **{'by_syll':True}}
        key_line=kwargs_key(kwargs_line)
        key_syll=kwargs_key(kwargs_syll)
        if force or not (key_syll in self._parses) or (not key_line in self._parses):
            self._parses[key_syll]=parse(self.txt, **kwargs_syll)
            self._parses[key_line]=to_lines(self._parses[key_syll])
        elif kwargs.get('verbose',True):
            for li,linedf in sorted(self._parses[key_syll].reset_index().groupby(['stanza_i','line_i'])):
                display(show_parse(linedf))

        self.infod=info_parses(self._parses[key_line])
        if verbose: printm(show_info_parses(self.infod))
                
        
    def parses(self,
            force=True,
            only_best=False,
            only_unbounded=True,
            **kwargs):
        kwargs={**self.kwargs, **kwargs}
        kwargs_line={**self.kwargs, **kwargs, **{'by_syll':False}}
        kwargs_syll={**self.kwargs, **kwargs, **{'by_syll':True}}
        key_line=kwargs_key(kwargs_line)
        key_syll=kwargs_key(kwargs_syll)
        key=key_syll if kwargs.get('by_syll') else key_line
        if not key in self._parses:
            self.parse(force=force, **kwargs)
        if not key in self._parses: return
        
        odf=self._parses[key]
        if only_unbounded and ('parse_is_bounded' in set(odf.index.names) or 'parse_is_bounded' in set(odf.columns)):
            #odf=odf[odf.parse_is_bounded==False]
            odf=odf.query('parse_is_bounded==False')
        if only_best and ('parse_rank' in set(odf.index.names) or 'parse_rank' in set(odf.columns)):
            odf=odf.query('parse_rank==1')
        
        return odf

    def best_parses(self, force=False, **kwargs):
        return self.parses(force=force,only_best=True,**kwargs)
    def all_parses(self, force=False,**kwargs):
        return self.parses(force=force,only_best=False,only_unbounded=False,**kwargs)
    def unbounded_parses(self, force=False,**kwargs):
        return self.parses(force=force,only_best=False,only_unbounded=True,**kwargs)

In [12]:
t=Text('saintsbury/txt/en.addison.prose.Saintsbury.psgs_quoted.txt', prose=True, test='hello')