# Developing text objects further

In [10]:
import sys; sys.path.insert(0,'/Users/ryan/github/cadence/')
from cadence.imports import *

In [11]:
# Sample line of verse kept around for ad-hoc experiments in this notebook.
s='Turning and turning in the widening gyre'

In [12]:

def to_stanzas_str(full_txt,sep=SEP_STANZA,**kwargs):
    """Split a full text into stanza strings.

    The text is stripped, cut on `sep`, and every piece is stripped again;
    pieces that are empty after stripping are dropped. Extra keyword
    arguments are accepted and ignored.
    """
    stripped_chunks = (chunk.strip() for chunk in full_txt.strip().split(sep))
    return [chunk for chunk in stripped_chunks if chunk]

def to_lines_str(stanza_txt,sep='\n',**kwargs):
    """Split a stanza into line strings.

    The stanza is stripped, cut on `sep`, and every piece is stripped again;
    pieces that are empty after stripping are dropped. Extra keyword
    arguments are accepted and ignored.

    The default separator is a single newline. It used to be the stanza
    separator, which meant a stanza (itself produced by splitting on that
    separator) was never broken into lines.
    """
    return [ln.strip() for ln in stanza_txt.strip().split(sep) if ln.strip()]

def to_sents_str(stanza_txt,**kwargs):
    """Return the sentences of `stanza_txt` as a list, using nltk's sentence tokenizer.

    Extra keyword arguments are accepted and ignored.
    """
    sentences = nltk.sent_tokenize(stanza_txt)
    return list(sentences)

def to_lineparts_str(line_str,seps=SEPS_PHRASE,min_len=1,max_len=15):
    """Break one line into phrase-like parts at separator punctuation.

    The line is tokenized with `tokenize_nice`; each token is split by
    `split_punct` into (prefix, word, suffix). A part is closed

    * before a token whose prefix contains a separator character, and
    * after a token whose suffix contains a separator character,

    in both cases only if the part already holds at least `min_len` tokens.
    A part is also closed as soon as it reaches `max_len` tokens.

    Parameters:
        line_str: the text to split.
        seps: iterable of separator characters.
        min_len: minimum tokens per part before a separator may close it
            (None means 1).
        max_len: hard cap on tokens per part (None means effectively no cap).

    Returns:
        list of strings, one per part, each the plain concatenation of its
        tokens (presumably the tokenizer keeps whitespace inside or between
        tokens -- TODO confirm against `tokenize_nice`).
    """
    if min_len is None: min_len=1
    if max_len is None: max_len=1000000000

    # Build the separator set once rather than on every token.
    sep_chars=set(seps)

    sentparts=[]
    sentpart=[]
    for token in tokenize_nice(line_str):
        pref,tok,suf = split_punct(token)

        # A separator in front of this token closes the part collected so far.
        if sentpart and sep_chars&set(pref) and len(sentpart)>=min_len:
            sentparts.append(sentpart)
            sentpart=[]

        # The token always joins the current part.
        sentpart.append(token)

        # A trailing separator, or hitting the length cap, closes the part.
        ends_on_sep = bool(sep_chars&set(suf)) and len(sentpart)>=min_len
        if ends_on_sep or len(sentpart)>=max_len:
            sentparts.append(sentpart)
            sentpart=[]

    # Whatever is left over forms the final part.
    if sentpart:
        sentparts.append(sentpart)
    return [''.join(x) for x in sentparts]






def linepart(
        txt_or_fn_or_lpdf,
        lang=DEFAULT_LANG,
        progress=True,
        incl_alt=INCL_ALT,
        num_proc=DEFAULT_NUM_PROC,
        linebreaks=False,
        phrasebreaks=True,
        verse=None,
        prose=None,
        min_len=MIN_WORDS_IN_PHRASE,
        max_len=MAX_WORDS_IN_PHRASE,
        seps=SEPS_PHRASE,
        desc='Iterating over line scansions',
        **kwargs):
    """Split a text into stanza / line / linepart records.

    Parameters:
        txt_or_fn_or_lpdf: anything `to_txt` accepts, or a DataFrame that
            already has a `linepart_str` column (returned after
            `resetindex`, without re-splitting).
        linebreaks: if true, stanzas are split into lines with
            `to_lines_str`; otherwise into sentences with `to_sents_str`.
        phrasebreaks: if true, each line is further split with
            `to_lineparts_str`; otherwise the line is its own single part.
        verse, prose: shortcuts that override the two flags above.
            `verse=True` / `prose=False` selects line breaks without phrase
            breaks; `prose=True` / `verse=False` selects the opposite.
        min_len, max_len, seps: forwarded to `to_lineparts_str`.
        lang, progress, incl_alt, num_proc, desc, **kwargs: accepted so
            callers can pass one shared option set; not used in this
            function.

    Returns:
        A list of dicts with keys `stanza_i`, `line_i`, `linepart_i` (all
        1-based; `line_i` restarts per stanza, `linepart_i` per line) and
        `linepart_str`. Returns the DataFrame itself for DataFrame input,
        and None when `to_txt` yields None.

    Raises:
        Exception: for a DataFrame without a `linepart_str` column.
    """
    # An already-computed linepart frame is passed straight through.
    if isinstance(txt_or_fn_or_lpdf, pd.DataFrame):
        odf=resetindex(txt_or_fn_or_lpdf)
        if 'linepart_str' in set(odf.columns):
            return odf
        raise Exception('Input is neither string or a linepart-df [result of lineparts()]')

    full_txt=to_txt(txt_or_fn_or_lpdf)
    if full_txt is None: return

    # verse/prose are convenience switches layered over the two break flags.
    # Equality (not identity) is kept so 1/0 keep working like True/False.
    if verse==True or prose==False:
        linebreaks=True
        phrasebreaks=False
    elif prose==True or verse==False:
        linebreaks=False
        phrasebreaks=True

    to_lines_now = to_lines_str if linebreaks else to_sents_str

    return [
        dict(
            stanza_i=stanza_i+1,
            line_i=line_i+1,
            linepart_i=linepart_i+1,
            linepart_str=linepart_txt
        )
        for stanza_i,stanza_txt in enumerate(to_stanzas_str(full_txt))
        for line_i,line_txt in enumerate(to_lines_now(stanza_txt))
        for linepart_i,linepart_txt in enumerate(
            to_lineparts_str(
                line_txt,
                seps=seps,
                min_len=min_len,
                max_len=max_len
            ) if phrasebreaks else [line_txt]
        )
    ]

In [27]:
def scan_iter(df_or_txt_or_fn,num_proc=1,lim=None,progress=True,lineparts_ld=None,**kwargs):
    """Yield scan results one linepart at a time.

    Parameters:
        df_or_txt_or_fn: input handed to `linepart` when no precomputed
            lineparts are supplied.
        num_proc, progress: forwarded to `pmap_iter`.
        lim: if truthy, stop after this many results (0 and None both mean
            "no limit").
        lineparts_ld: optional precomputed output of `linepart`; when empty
            or None it is computed here from `df_or_txt_or_fn` and `kwargs`.
        **kwargs: forwarded to `linepart`.

    Yields:
        Whatever `pmap_iter` produces for `do_scan_iter` on each linepart.
    """
    # None (not a shared list literal) is the "not supplied" default.
    if not lineparts_ld:
        lineparts_ld=linepart(df_or_txt_or_fn,**kwargs)
    iterr_o=pmap_iter(
        do_scan_iter,
        lineparts_ld,
        progress=progress,
        num_proc=num_proc,
        desc='Scanning lines'
    )
    for i,odf in enumerate(iterr_o):
        if lim and i>=lim: break
        yield odf


In [32]:

def Verse(txt,**kwargs):
    """Build a Text with `verse=True` unless the caller overrides it."""
    # Default goes in first so caller-supplied options win and key order is stable.
    opts=dict(verse=True)
    opts.update(kwargs)
    return Text(txt,**opts)
def Poem(txt,**kwargs):
    """Build a Text with `verse=True` unless the caller overrides it."""
    # Default goes in first so caller-supplied options win and key order is stable.
    opts=dict(verse=True)
    opts.update(kwargs)
    return Text(txt,**opts)
def Prose(txt,**kwargs):
    """Build a Text with `prose=True` unless the caller overrides it."""
    # Default goes in first so caller-supplied options win and key order is stable.
    opts=dict(prose=True)
    opts.update(kwargs)
    return Text(txt,**opts)
def FreeVerse(txt,**kwargs):
    """Build a Text that breaks on both line ends and phrases, unless overridden."""
    # Defaults go in first so caller-supplied options win and key order is stable.
    opts=dict(linebreaks=True,phrasebreaks=True)
    opts.update(kwargs)
    return Text(txt,**opts)



# loading txt/strings
def to_fn_txt(txt_or_fn):
    """Resolve a "text or filename" argument to a `(filename, text)` pair.

    A single-line string naming an existing regular file is read as UTF-8
    (undecodable bytes are replaced) and returned as `(path, contents)`.
    Anything else is taken to be the text itself and returned as
    `('', text)`. The text is stripped in both cases. `None` yields
    `('', '')`.
    """
    if txt_or_fn is None:
        return ('','')

    # Only a regular file is opened: a directory that happens to share the
    # string's name would make open() fail, so it is treated as plain text.
    if isinstance(txt_or_fn,str) and '\n' not in txt_or_fn and os.path.isfile(txt_or_fn):
        fn=txt_or_fn
        with open(fn,encoding='utf-8',errors='replace') as f:
            txt=f.read()
    else:
        fn=''
        txt=txt_or_fn
    return (fn,txt.strip())


### convenient objs
def kwargs_key(kwargs,bad_keys={'num_proc','progress','desc'}):
    """Render a kwargs dict as a `k=v, k=v` string, for use as a cache key.

    Entries whose key is in `bad_keys` are left out. Entries appear in the
    dict's own iteration order, so two dicts with the same items in a
    different order produce different keys.
    """
    parts=[]
    for name,value in kwargs.items():
        if name in bad_keys:
            continue
        parts.append(f'{name}={value}')
    return ', '.join(parts)
    
    
### texts
class Text(object):
    """A text plus per-option caches of its lineparts, scans and parses.

    Every keyword argument given at construction is kept in `self.kwargs`,
    set as an attribute of the same name, and merged (as defaults) into the
    options of the caching methods below. Caches are keyed by the string
    that `kwargs_key` builds from the effective options.
    """
    def __init__(self,txt_or_fn,**kwargs):
        
        self._scans={}
        self._parses={}
        self._lineparts={}
        self._num_lines=None
        self._num_stanzas=None
        self._infod={}
        
        ## Set kwargs
        self.kwargs=kwargs
        # NOTE(review): a kwarg named like a method (e.g. `lines`) would
        # shadow that method on this instance.
        for k,v in self.kwargs.items(): setattr(self,k,v)
        self.kwargs_key=kwargs_key(self.kwargs)
        
        ## Load texts
        self.fn,self.txt=to_fn_txt(txt_or_fn)
        
    # Disabled: relies on `first_line`, `num_stanzas` and `num_lines`, which this class does not define.
    # def __repr__(self):
    #     o=self.txt.split('\n\n')[0] if self.txt is not None else ""
    #     o='\t' + '\n\t'.join(l for l in o.split('\n'))
    #     o=f'''<cadence.Text: {self.first_line} ({self.num_stanzas} stanza{"s" if self.num_stanzas>1 else ""}, {self.num_lines} line{"s" if self.num_lines>1 else ""})>'''.strip()
    #     #o='\n'.join(l.strip() for l in o.split('\n'))
    #     return o
    
    def get_kwargs_key(self,**kwargs):
        """Return the cache key for the instance kwargs overridden by `kwargs`."""
        kwargs_both = {**self.kwargs, **kwargs}
        return kwargs_key(kwargs_both)

    def _parse_keys(self,**kwargs):
        """Return `(key_line, key_syll)`, the cache keys for line- and syllable-level parses.

        `verbose` only controls display, so it is left out of the keys;
        this keeps `parse` and `parses` looking in the same cache slots.
        """
        merged={k:v for k,v in {**self.kwargs, **kwargs}.items() if k!='verbose'}
        key_line=kwargs_key({**merged, 'by_syll':False})
        key_syll=kwargs_key({**merged, 'by_syll':True})
        return key_line,key_syll

    
    ##################################################################
    ### Stanzas
    def stanzas(self,txt=None,**kwargs):
        """Split `txt` (or this text, if `txt` is empty/None) into stanza strings."""
        return to_stanzas_str(self.txt if not txt else txt,**kwargs)    

    ##################################################################
    ### Lines
    
    def lines(self,txt=None,**kwargs):
        """Split `txt` (or this text, if `txt` is empty/None) into line strings."""
        return to_lines_str(txt if txt else self.txt, **kwargs)

    ##################################################################
    ### LINEPARTS
    
    ### Lineparts
    def lineparts(self, force=False, **kwargs):
        """Return the cached result of `linepart` for this text.

        The result is computed on first use for a given option set, or
        again when `force` is true. The instance kwargs are passed along
        with `kwargs`, matching the options the cache key is built from.
        """
        key=self.get_kwargs_key(**kwargs)
        if force or key not in self._lineparts:
            self._lineparts[key]=linepart(self.txt,**{**self.kwargs, **kwargs})
        return self._lineparts[key]
    
    
    ##################################################################
    ### SCANS
    
    def scan(self, force=False, **kwargs):
        """Return the cached result of the module-level `scan` for this text.

        NOTE(review): the cache key includes the instance kwargs but only
        `kwargs` is forwarded to `scan` -- confirm whether that is intended.
        """
        key=self.get_kwargs_key(**kwargs)
        if force or not key in self._scans:
            self._scans[key]=scan(self.txt,**kwargs)
        return self._scans[key]

    
    
    ##################################################################
    ### PARSE
    
    def parse(self,
            force=False,
            verbose=True,
            only_best=False,
            only_unbounded=True,
            **kwargs):
        """Parse the text, cache the result, and optionally display it.

        The syllable-level result of the module-level `parse` and its
        line-level form (`to_lines`) are cached together. On a cache hit
        with `verbose` true, the cached parses are displayed per
        (stanza, line). Summary info is stored on `self.infod` and printed
        when `verbose` is true. `only_best` and `only_unbounded` are
        accepted but not used here. Returns None.
        """
        kwargs['verbose']=verbose
        kwargs_syll={**self.kwargs, **kwargs, **{'by_syll':True}}
        key_line,key_syll=self._parse_keys(**kwargs)
        if force or key_syll not in self._parses or key_line not in self._parses:
            self._parses[key_syll]=parse(self.txt, **kwargs_syll)
            self._parses[key_line]=to_lines(self._parses[key_syll])
        elif verbose:
            for li,linedf in sorted(self._parses[key_syll].reset_index().groupby(['stanza_i','line_i'])):
                display(show_parse(linedf))

        # Both names are kept in sync: __init__ creates `_infod`, while
        # existing callers may read `infod`.
        self.infod=self._infod=info_parses(self._parses[key_line])
        if verbose: printm(show_info_parses(self.infod))
                
        
    def parses(self,
            force=True,
            only_best=False,
            only_unbounded=True,
            **kwargs):
        """Return the parse table, parsing first if nothing is cached.

        Parameters:
            force: passed to `parse` when a parse has to be run.
            only_best: keep only rows with `parse_rank==1`.
            only_unbounded: keep only rows with `parse_is_bounded==False`.
            **kwargs: parse options; `by_syll=True` selects the
                syllable-level table instead of the line-level one.

        Returns:
            The (possibly filtered) table, or None if parsing stored
            nothing under the expected key.
        """
        kwargs={**self.kwargs, **kwargs}
        key_line,key_syll=self._parse_keys(**kwargs)
        key=key_syll if kwargs.get('by_syll') else key_line
        if key not in self._parses:
            self.parse(force=force, **kwargs)
        if key not in self._parses: return
        
        odf=self._parses[key]
        # Filters apply only when the column/index level is actually present.
        if only_unbounded and ('parse_is_bounded' in set(odf.index.names) or 'parse_is_bounded' in set(odf.columns)):
            #odf=odf[odf.parse_is_bounded==False]
            odf=odf.query('parse_is_bounded==False')
        if only_best and ('parse_rank' in set(odf.index.names) or 'parse_rank' in set(odf.columns)):
            odf=odf.query('parse_rank==1')
        
        return odf

    def best_parses(self, force=False, **kwargs):
        """Return only the top-ranked, unbounded parses."""
        return self.parses(force=force,only_best=True,**kwargs)
    def all_parses(self, force=False,**kwargs):
        """Return every parse, bounded or not, at every rank."""
        return self.parses(force=force,only_best=False,only_unbounded=False,**kwargs)
    def unbounded_parses(self, force=False,**kwargs):
        """Return all unbounded parses at every rank."""
        return self.parses(force=force,only_best=False,only_unbounded=True,**kwargs)

In [33]:
# Build a Text flagged as prose. If the path does not exist, to_fn_txt
# treats the string itself as the text.
# `test='hello'` is an arbitrary extra keyword: Text.__init__ stores every
# kwarg as an attribute and includes it in `kwargs_key`.
t=Text('saintsbury/txt/en.addison.prose.Saintsbury.psgs_quoted.txt', prose=True, test='hello')

In [34]:
t.stanzas()

TypeError: to_stanzas_str() missing 1 required positional argument: 'full_txt'

In [18]:
t

AttributeError: 'Text' object has no attribute 'first_line'

['', 'This is a test']


In [None]:
t.kwargs_key