# Stanza integration

In [13]:
# set_start_method('spawn')

import sys; sys.path.insert(0,'/Users/ryan/github/prosodic/')
import sys; sys.path.insert(0,'/Users/ryan/github/cadence/')
from cadence.parsers.mtree import MetricalTree,DependencyTree,DependencyTreeParser,MetricalTreeParser
from cadence.imports import *

In [46]:
# Sample input used by the cells below: lines from W. B. Yeats's "The Second Coming".
# The blank-line runs, leading indentation and trailing spaces are kept as-is;
# presumably they exercise paragraph/line tokenization (the run further down
# reports 3 paragraphs for this text).
txt="""

Turning and turning in the widening gyre, 
the falcon cannot hear the falconer.
Things fall apart; the centre cannot hold.
   Mere anarchy is loosed upon the world. 

The blood-dimmed tide is loosed, and everywhere
The ceremony of innocence is drowned;



The best lack all conviction, while the worst   
Are full of passionate 
intensity.
"""

In [35]:
def iter_paras_df(txt_or_df,**kwargs):
    """Yield one token DataFrame per paragraph.

    txt_or_df: either a raw string or an already-tokenized DataFrame.
        * str: split into paragraph dicts with `tokenize_paras_ld` (which
          receives **kwargs) and tokenize each one with `tokenize_sentwords`.
        * DataFrame: yield its `para_i` groups in sorted key order.
    Any other input type yields nothing.
    """
    input_type = type(txt_or_df)

    if input_type == str:
        for para in tokenize_paras_ld(txt_or_df, **kwargs):
            yield tokenize_sentwords(para['para_str'], para_i=para['para_i'])

    elif input_type == pd.DataFrame:
        # group keys are unique, so sorting the (key, frame) pairs never compares frames
        yield from (group for _, group in sorted(txt_or_df.groupby('para_i')))
            
def get_num_paras(txt_or_df,**kwargs):
    """Return the number of paragraphs in a raw text or a token DataFrame.

    txt_or_df:
        * str: the text is split on SEP_PARA and only chunks that contain
          non-whitespace characters are counted. Leading/trailing separators
          or runs of blank lines therefore do not inflate the count (the
          previous version stripped each chunk but still counted empty ones).
        * DataFrame: the number of distinct `para_i` values.
    Returns None for any other input type.

    NOTE(review): this assumes the paragraph tokenizer also drops blank
    chunks, which matches the 3-paragraph run shown in this notebook —
    confirm against `tokenize_paras_ld`.
    """
    if type(txt_or_df)==str:
        return sum(1 for para_str in txt_or_df.split(SEP_PARA) if para_str.strip())
    elif type(txt_or_df)==pd.DataFrame:
        return len(set(getcol(txt_or_df,'para_i')))
    return None
    



In [36]:
def iter_parse_nlp_docs(
        docs,
        nlp=None,
        num_proc=1,
        **kwargs):
    """Lazily run an NLP callable over an iterable of documents.

    docs: iterable of inputs, each passed unchanged to `nlp`.
    nlp: callable applied to every doc. When None, docs are yielded
        untouched regardless of `num_proc` (previously the multi-process
        branch handed None to `Pool.imap`, which fails).
    num_proc: when greater than 1, map over a `multiprocessing` pool of
        that size; results come back in input order. Otherwise run
        serially in this process.
    **kwargs: accepted and ignored, so callers can forward shared options.

    Yields the result of `nlp(doc)` (or `doc` itself) for each input.
    """
    # Nothing to apply: pass documents straight through, never spawning workers.
    if nlp is None:
        yield from docs
        return

    if num_proc and num_proc>1:
        with mp.Pool(num_proc) as pool: # This is the fastest. joblib(thread, mp) experimented.
            yield from pool.imap(nlp, docs)
    else:
        for doc in docs:
            yield nlp(doc)



In [37]:
def tokenize_nlp_doc(tokdf,doc,**kwargs):
    """Merge per-token annotations from a parsed document into a token DataFrame.

    tokdf: DataFrame of tokens carrying `sent_i` and `word_i` (1-based) join keys.
    doc: parsed document exposing `.sentences`; each sentence exposes `.id`
        and `.tokens`, and each token a `.to_dict()` that returns a list of
        dicts (only the first entry is used).
    **kwargs: accepted for call-site compatibility; unused here.

    Every key of the token dict not listed in `badcols` becomes a
    `word_<key>` column. A `feats` value such as ``"A=x|B=y"`` is additionally
    exploded into one column per feature name. Only columns that `tokdf`
    does not already have are merged in (left join on sent_i/word_i).

    Returns a new DataFrame. If the document has no tokens, a copy of `tokdf`
    is returned unchanged.
    """
    rows=[]
    for sent in doc.sentences:
        for word_i,word in enumerate(sent.tokens):
            tokd=word.to_dict()[0]
            statd=dict((f'word_{k}',v) for k,v in tokd.items() if k not in badcols)
            # `feats` may be absent or None; entries without '=' are skipped
            for feat in (tokd.get('feats') or '').split('|'):
                if '=' not in feat: continue
                fk,fv=feat.split('=',1)
                statd[fk]=fv

            rows.append({
                # assumes sent.id is a zero-based integer — TODO confirm for the installed parser version
                'sent_i': sent.id+1,
                'word_i': word_i+1,
                **statd
            })

    # An empty frame has no join columns to select or merge on.
    if not rows: return tokdf.copy()

    df=pd.DataFrame(rows).fillna('')
    joiner=['sent_i','word_i']
    existing=set(tokdf.columns)
    # Select with a list in stable column order: pandas rejects set indexers.
    ocols=[col for col in df.columns if col in joiner or col not in existing]
    return tokdf.merge(df[ocols],on=joiner,how='left')


In [38]:
def scan_iter_nlp(
        txt,
        nlp=None,
        paras_lim=None,
        progress=True,
        num_proc=1,

        lang=DEFAULT_LANG,
        paras_shuffle=False,

        postag=False,
        constituency=False,
        depparse=False,
        syllabify=True,

        **kwargs):
    """Tokenize a text paragraph by paragraph, optionally annotate it with an
    NLP pipeline, and yield one indexed token DataFrame per paragraph.

    txt: raw text, split into paragraph dicts by `tokenize_paras_ld`.
    nlp: ready-made pipeline callable. When None and at least one processor
        is requested, one is built with `get_nlp(..., pretokenized=True)`.
    paras_lim: keep only the first N paragraphs (applied after shuffling);
        a falsy value means no limit.
    progress: show tqdm progress bars. When False all three bars are
        suppressed (previously only the last one was).
    num_proc: forwarded to `iter_parse_nlp_docs`.
    lang: language passed to `get_nlp`.
    paras_shuffle: shuffle the paragraph list in place with `random.shuffle`
        (no seed is set here).
    postag, constituency, depparse: processor switches handed to
        `get_processors`. `constituency` additionally triggers
        `tokenize_constituency`. `depparse` has no dedicated step here;
        presumably its columns arrive through `tokenize_nlp_doc` — confirm.
    syllabify: run `syllabify_df` on each paragraph frame.
    **kwargs: forwarded to `tokenize_nlp_doc`, `tokenize_constituency` and
        `syllabify_df`.

    Yields `setindex(...)` of each paragraph frame with `para_i` assigned.
    Tokenization of all paragraphs happens eagerly up front; NLP parsing and
    post-processing happen lazily as the generator is consumed.
    """

    ## prep documents
    paras_ld=tokenize_paras_ld(txt)
    if paras_shuffle: random.shuffle(paras_ld)
    if paras_lim: paras_ld=paras_ld[:paras_lim]

    para_dfs=[
        tokenize_sentwords(para_d['para_str'],para_i=para_d['para_i'])
        for para_d in tqdm(paras_ld,desc='Tokenizing paragraphs',disable=not progress)
    ]

    # one list of token lists (sentences -> words) per paragraph
    para_doclls=[
        tokenize_sentwords_ll(para_df)
        for para_df in tqdm(para_dfs,desc='Tokenizing sentences and words',disable=not progress)
    ]

    processors=get_processors(
        postag=postag,
        constituency=constituency,
        depparse=depparse,
    )

    # only build a pipeline when something was actually requested
    if processors and nlp is None:
        nlp = get_nlp(
            lang=lang,
            pretokenized=True,
            processors=processors
        )

    # iter docs (lazy)
    doc_iter = iter_parse_nlp_docs(
        para_doclls,
        nlp=nlp,
        lang=lang,
        constituency=constituency,
        depparse=depparse,
        num_proc=num_proc,
        progress=False
    )

    oiterr=zip(paras_ld,para_dfs,doc_iter)

    if progress:
        oiterr=tqdm(
            oiterr,
            total=len(para_doclls),
            desc=f'Tokenizing NLP [x{num_proc}]'
        )

    for para_d,para_tokdf,para_doc in oiterr:
        if processors:
            para_tokdf = tokenize_nlp_doc(para_tokdf, para_doc, **kwargs)

        if constituency:
            para_tokdf = tokenize_constituency(para_tokdf,para_doc,**kwargs)

        if syllabify:
            para_tokdf=syllabify_df(para_tokdf,**kwargs)

        yield setindex(para_tokdf.assign(para_i=para_d['para_i']))
    
    

In [39]:
# oiter=scan_iter_nlp(
#     txt,
#     paras_lim=100,
#     # syllabify=True,
#     postag=True,
#     depparse=True,
#     constituency=True,
#     syllabify=True,
#     num_proc=1
# )
# for scanned_para_df in oiter: pass
# scanned_para_df

In [40]:
def scan_iter(txt,groupby='para',**kwargs):
    """Scan a text and yield DataFrames at the requested granularity.

    txt: raw text handed to `scan_iter_nlp`.
    groupby: 'sent', 'sentpart', 'word' or 'syll' to split each paragraph
        frame into sorted groups at that level; any other value (including
        the default 'para') yields whole paragraph frames.
    **kwargs: forwarded to `scan_iter_nlp`; `syllabify` defaults to True
        when the caller does not supply it.
    """
    group_levels={
        'sent': 'sent_i',
        'sentpart': ['sent_i','sentpart_i'],
        'word': ['sent_i','word_i'],
        'syll': ['sent_i','word_i','word_ipa_i','syll_i'],
    }
    kwargs.setdefault('syllabify',True)
    # non-string selectors fall through to paragraph level, like unknown names
    levels=group_levels.get(groupby) if isinstance(groupby,str) else None

    for para_df in scan_iter_nlp(txt,**kwargs):
        if levels is None:
            yield para_df
            continue
        for _,group_df in sorted(para_df.groupby(levels)):
            yield group_df

In [45]:
# Smoke test: pull only the first sentence-part group from the scan and display it.
next(scan_iter(txt,groupby='sentpart'))

Tokenizing paragraphs: 100%|████████████████████████████████████████████████| 3/3 [00:00<00:00, 340.60it/s]
Tokenizing sentences and words: 100%|███████████████████████████████████████| 3/3 [00:00<00:00, 794.93it/s]
Tokenizing NLP [x1]:   0%|                                                           | 0/3 [00:00<?, ?it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,is_funcword,is_heavy,is_light,is_peak,is_stressed,is_trough,is_unstressed,prom_strength,prom_stress,prom_weight
para_i,sent_i,sentpart_i,line_i,word_i,word_pref,word_str,word_ipa_i,word_ipa,syll_i,syll_str,syll_ipa,syll_stress,syll_weight,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,1,1,1,,Turning,1,'tɛː.nɪŋ,1,Tur,'tɛː,P,,0,0,0,1,1,0,0,1.0,1.0,
1,1,1,1,1,,Turning,1,'tɛː.nɪŋ,2,ning,nɪŋ,U,,0,0,0,0,0,1,1,0.0,0.0,
1,1,1,1,2,,and,1,ænd,1,and,ænd,U,,1,0,0,1,0,0,1,1.0,0.0,
1,1,1,1,3,,turning,1,'tɛː.nɪŋ,1,tur,'tɛː,P,,0,0,0,1,1,0,0,1.0,1.0,
1,1,1,1,3,,turning,1,'tɛː.nɪŋ,2,ning,nɪŋ,U,,0,0,0,1,0,0,1,1.0,0.0,
1,1,1,1,4,,in,1,ɪn,1,in,ɪn,U,,1,0,0,1,0,0,1,1.0,0.0,
1,1,1,1,4,,in,2,'ɪn,1,in,'ɪn,P,,0,0,0,1,1,0,0,1.0,1.0,
1,1,1,1,5,,the,1,ðə,1,the,ðə,U,,1,0,0,0,0,1,1,0.0,0.0,
1,1,1,1,6,,widening,1,'waɪ.də.nɪŋ,1,wi,'waɪ,P,,0,0,0,0,1,0,0,,1.0,
1,1,1,1,6,,widening,1,'waɪ.də.nɪŋ,2,de,də,U,,0,0,0,0,0,0,1,,0.0,


In [None]:
# NOTE(review): `x` is not assigned in any visible cell (the only assignment,
# `x=list(scan_iter(...))`, is commented out further down). Unless the star
# import provides it, this raises NameError on a fresh kernel.
x[0]

In [None]:
# NOTE(review): `stop` is not defined in any visible cell; evaluating it
# presumably raises NameError to halt "Run All" before the scratch cells
# below — confirm this is intentional.
stop

In [None]:
%%timeit
    
ld=tokenize_paras_ld(txt)
ld

In [None]:
%%timeit
list(iter_paras_df(txt))

In [None]:
# Materialize every paragraph frame into one DataFrame; the timing cell below
# feeds it back into iter_paras_df to exercise the DataFrame branch.
txtdf=pd.concat(iter_paras_df(txt))

In [None]:
# txtdf

In [None]:
%%timeit
list(iter_paras_df(txtdf))

In [None]:
def tokenize_sentwords_ll(tokdf,**kwargs):
    """Convert a token DataFrame into a list of sentences, each a list of words.

    tokdf: token frame with `sent_i` and `word_str`; an empty input gives [].
    **kwargs: accepted for call-site compatibility; unused.

    Sentences are returned in sorted `sent_i` order; words keep their row order.
    """
    if len(tokdf) == 0:
        return []

    sentences = []
    for _, sent_df in sorted(tokdf.groupby('sent_i')):
        sentences.append(list(getcol(sent_df, 'word_str')))
    return sentences

In [None]:
def iter_paras_nlp(
        txt_or_tokdf,
        nlp=None,
        constituency=False,
        depparse=False,
        progress=True,
        syllabify=False,
        num_proc=1,
        lang=DEFAULT_LANG,
        **kwargs):
    """Parse a text (or token DataFrame) one paragraph at a time and yield an
    indexed, annotated token DataFrame per paragraph.

    txt_or_tokdf: raw string or token DataFrame, as accepted by `iter_paras_df`.
    nlp: ready-made pipeline callable; built via `get_nlp` when None.
    constituency, depparse: processor switches for `get_processors`;
        `constituency` also triggers `tokenize_constituency`.
    progress: wrap the paragraph iterator in a tqdm bar.
    syllabify: run `syllabify_df` on each paragraph frame.
    num_proc: accepted for signature parity; parsing here is serial.
    lang: language passed to `get_nlp` and `tokenize_nlp_doc`.
    **kwargs: forwarded to `iter_paras_df`, `tokenize_nlp_doc`,
        `tokenize_constituency` and `syllabify_df`.

    Fixes relative to the previous version: the `processors` keyword was
    misspelled when calling `get_nlp`; `tokenize_constituency` was called
    with `**lang` (unpacking a string raises TypeError) instead of
    `**kwargs`; a leftover debug print was removed.
    """

    # get nlp
    if nlp is None:
        processors=get_processors(constituency=constituency,depparse=depparse)
        # The pipeline is fed lists of token lists below, so request
        # pretokenized input, as scan_iter_nlp does.
        nlp=get_nlp(lang=lang, pretokenized=True, processors=processors)

    oiterr=iter_paras_df(txt_or_tokdf,**kwargs)
    if progress: oiterr=tqdm(oiterr,total=get_num_paras(txt_or_tokdf))
    for para_df in oiterr:
        sentwords_ll = tokenize_sentwords_ll(para_df)
        doc = nlp(sentwords_ll)
        para_tokdf = tokenize_nlp_doc(para_df, doc, lang=lang, **kwargs)
        if constituency: para_tokdf = tokenize_constituency(para_tokdf,doc,**kwargs)
        if syllabify: para_tokdf=syllabify_df(para_tokdf,**kwargs)
        yield setindex(para_tokdf)
    
    

In [None]:
# Exhaust the generator so every paragraph is parsed, then display the frame
# of the last paragraph (`pdf` is left bound to it by the loop).
for pdf in iter_paras_nlp(txt): pass
pdf

In [None]:


# def scan_iter_nlp(
#         txt_or_tokdf,
#         nlp=None,
#         constituency=False,
#         depparse=False,
#         syllabify=False,
#         num_proc=1,
#         custom_tokenize=True,
#         lang=DEFAULT_LANG,
#         **kwargs):
    
#     # make orig tokdf
#     tokdf=tokenize_parasentword(txt_or_tokdf,**kwargs) if type(txt_or_tokdf)==str else txt_or_tokdf    
    
#     # get nlp
#     if nlp is None:
#         processors=get_processors(constituency=constituency,depparse=depparse,**kwargs)
#         nlp=get_nlp(lang=lang, procesors=processors,custom_tokenize=custom_tokenize)
    
#     objs=[(paradf,nlp) for para_i,paradf in sorted(tokdf.groupby('para_i'))]
#     kwargs=dict(constituency=constituency,depparse=depparse,syllabify=syllabify,**kwargs)
#     oiter=pmap_iter(do_scan_iter_nlp, objs, num_proc=num_proc, kwargs=kwargs)
    
#     yield from oiter
    
    
    
# def do_scan_iter_nlp(
#         obj,
#         constituency=False,
#         depparse=False,
#         syllabify=False,                 
#         **kwargs):
    
#     tokdf,nlp=obj
#     sentwords=tokenize_sentwords_ll(tokdf)
#     doc=nlp(sentwords)
#     para_str,para_doc=para_row['para_str'], para_row['para_doc']

#     # add anno?
#     if constituency: tokdf=tokenize_constituency(tokdf,para_doc,**kwargs)
#     if depparse: tokdf=tokenize_deps(tokdf,para_doc,**kwargs)
#     if syllabify: tokdf=syllabify_df(tokdf,**kwargs)

#     for k in ['para_i']: tokdf[k]=para_row[k]

#     # done
#     odf=setindex(tokdf)
#     odf.attrs=dict(para_row)
#     return tokdf


In [None]:
# Smoke test: display the first paragraph frame produced with default options.
next(scan_iter_nlp(txt))

In [None]:
# next(scan_iter(txt))

In [None]:
# x=list(scan_iter(txt,num_proc=1,lim_paras=10))

In [None]:
# Exhaust the scan over all paragraphs; `para` ends up bound to the last frame.
# The limit/shuffle options must use scan_iter_nlp's parameter names
# (`paras_lim`, `paras_shuffle`): the former `lim_paras`/`shuffle` spellings
# fell into **kwargs and never limited or shuffled anything.
# NOTE(review): no random seed is set in the visible cells, so the shuffled
# order differs between runs.
for para in scan_iter(txt,
                      num_proc=1,
                      paras_lim=None,
                      paras_shuffle=True,
                      depparse=False,
                      constituency=False): pass

In [None]:
# para.tail(25)