# Minimal version of the cadence text model (work in progress)

In [1]:
import sys; sys.path.insert(0,'/Users/ryan/github/prosodic/')
import sys; sys.path.insert(0,'/Users/ryan/github/cadence/')
from cadence.imports import *

In [2]:
def Text(*args,**kwargs):
    """Convenience factory: build a ``TextModel`` from the given arguments.

    All positional and keyword arguments are forwarded unchanged to the
    ``TextModel`` constructor, and the new instance is returned.
    """
    model=TextModel(*args,**kwargs)
    return model

In [3]:
class TextModel(object):
    """Lazily tokenized and parsed view of a text, cached per paragraph.

    Each stage (paragraphs -> words -> syllables -> NLP docs -> syntax
    features) is computed on first request and memoized on the instance, so
    later stages and repeated calls reuse earlier work.
    """

    def __init__(self,txt_or_fn,**kwargs):
        """Store the source text and set up empty per-stage caches.

        Args:
            txt_or_fn: Handed to ``to_fn_txt``, whose ``(fn, txt)`` result is
                kept on the instance (presumably raw text or a filename --
                confirm against that helper).
            **kwargs: Arbitrary attributes, kept verbatim in ``self.attrs``.
        """
        self.fn,self.txt=to_fn_txt(txt_or_fn)
        self.attrs=kwargs

        # Number of paragraphs; filled in by gen_paras_d().
        self._numparas=0
        # Caches keyed by para_i (or by word token for _sylls_d).
        self._paras_d={}
        self._words_d={}
        self._sylls_d={}
        # Names handed to getcache_df by paras()/words()/sylls().
        self._paras_df={}
        self._words_df={}
        self._sylls_df={}
        self._nlp_doc={}
        self._syntax_d={}
        self._syntax_df={}

    ####################################
    ## PARAGRAPHS
    ####################################

    def gen_paras_d(self,**kwargs):
        """Tokenize the text into paragraphs once and cache them by ``para_i``.

        While the cache is non-empty this is a no-op, so ``kwargs`` (forwarded
        to ``tokenize_paras_ld``) only take effect on the call that fills it.
        """
        if not self._paras_d:
            for para_d in tokenize_paras_ld(self.txt, **kwargs):
                self._paras_d[para_d['para_i']] = para_d
            self._numparas=len(self._paras_d)

    def get_para_d(self,para_i,**kwargs):
        """Return the cached paragraph dict for ``para_i``, or None if absent."""
        self.gen_paras_d()
        return self._paras_d.get(para_i)

    def get_paras_ld(self,shuffle_paras=SHUFFLE_PARAS,lim_paras=LIM_PARAS,**kwargs):
        """Return paragraph dicts ordered by ``para_i``.

        Args:
            shuffle_paras: If truthy, shuffle the returned list with
                ``random.shuffle`` (the cache itself is not reordered).
            lim_paras: If truthy, keep only the first ``lim_paras`` entries
                (applied after any shuffling).
            **kwargs: Accepted and ignored so callers can pass shared options.
        """
        self.gen_paras_d()
        paras_ld=[v for k,v in sorted(self._paras_d.items())]
        if shuffle_paras: random.shuffle(paras_ld)
        if lim_paras: paras_ld=paras_ld[:lim_paras]
        return paras_ld

    def iter_paras_d(self,
            progress=True,
            desc='Iterating over paragraphs',
            **kwargs):
        """Yield paragraph dicts, optionally wrapped in a tqdm progress bar.

        Args:
            progress: If truthy, wrap the iteration in ``tqdm``.
            desc: Label shown on the progress bar.
            **kwargs: Forwarded to ``get_paras_ld``.
        """
        paras_ld=self.get_paras_ld(**kwargs)
        if progress: paras_ld=tqdm(paras_ld,desc=desc)
        yield from paras_ld

    def paras(self,**kwargs):
        """Return the paragraphs via ``getcache_df`` (called with cache=False)."""
        return getcache_df(self, '_paras_df', self.iter_paras_d, cache=False, **kwargs)



    ####################################
    ## WORDS
    ####################################

    def get_words_ld(self,para_i,**kwargs):
        """Return the list of word dicts for paragraph ``para_i``.

        The paragraph is tokenized with ``tokenize_sentwords_iter`` on first
        request and cached; ``kwargs`` are only used on that first call. An
        unknown ``para_i`` yields an empty list (not cached) and a warning.
        """
        if para_i not in self._words_d:
            para_d=self.get_para_d(para_i)
            if para_d is None:
                # Report the miss with context instead of printing a bare number.
                import warnings
                warnings.warn(
                    'No paragraph with para_i=%r; returning no words' % (para_i,),
                    stacklevel=2,
                )
                return []
            para_str=para_d['para_str']
            self._words_d[para_i]=list(tokenize_sentwords_iter(para_str,para_i=para_i,**kwargs))
        return self._words_d[para_i]

    def get_words_df(self,para_i,**kwargs):
        """Return the word dicts of paragraph ``para_i`` as a DataFrame."""
        return pd.DataFrame(self.get_words_ld(para_i,**kwargs))

    def iter_words_d(self,**kwargs):
        """Yield word dicts for every paragraph from ``iter_paras_d``.

        ``kwargs`` go to ``iter_paras_d``; a default progress label is supplied
        when the caller gave none.
        """
        kwargs.setdefault('desc','Tokenizing sentences and words')
        for para_d in self.iter_paras_d(**kwargs):
            yield from self.get_words_ld(para_d['para_i'])

    def words(self,**kwargs):
        """Return the words via ``getcache_df`` (called with cache=False)."""
        return getcache_df(self, '_words_df', self.iter_words_d, cache=False, **kwargs)

    ####################################
    ## SYLLABIFY
    ####################################

    def get_sylls_d(self,word_tok,**kwargs):
        """Yield the syllable dicts for ``word_tok`` (a generator).

        Results of ``get_syllable_ld`` are cached per token, so ``kwargs`` only
        matter the first time a given token is seen.
        """
        if word_tok not in self._sylls_d:
            self._sylls_d[word_tok]=get_syllable_ld(word_tok,**kwargs)
        yield from self._sylls_d[word_tok]

    def iter_sylls_d(self,**kwargs):
        """Yield one dict per syllable, tagged with its word's position.

        Each yielded dict carries the word's ``para_i``, ``sent_i`` and
        ``word_i`` followed by the syllable's own fields.
        """
        kwargs.setdefault('desc','Tokenizing syllables')
        for word_d in self.iter_words_d(**kwargs):
            sylls_d=self.get_sylls_d(word_d['word_tok'])
            for syll_d in sylls_d:
                yield dict(
                    para_i=word_d['para_i'],
                    sent_i=word_d['sent_i'],
                    word_i=word_d['word_i'],
                    **syll_d
                )

    def sylls(self,**kwargs):
        """Return the syllables via ``getcache_df`` (called with cache=False)."""
        return getcache_df(self, '_sylls_df', self.iter_sylls_d, cache=False, **kwargs)


    ####################################
    ## NLP DOCS
    ####################################

    def get_nlp_doc(self,para_i,**kwargs):
        """Return the parsed NLP document for paragraph ``para_i`` (cached).

        On first request the paragraph's word dicts are regrouped with
        ``tokenize_sentwords_ll`` and handed to the module-level
        ``get_nlp_doc`` helper; ``kwargs`` reach both the tokenizer and parser.
        """
        if para_i not in self._nlp_doc:
            para_words_ld=self.get_words_ld(para_i,**kwargs)
            para_words_ll=tokenize_sentwords_ll(para_words_ld)
            # Inside a method the bare name resolves to the module-level helper.
            self._nlp_doc[para_i]=get_nlp_doc(para_words_ll,para_i=para_i,**kwargs)
        return self._nlp_doc[para_i]

    def iter_nlp_docs(self,**kwargs):
        """Yield the parsed NLP document of each paragraph in turn."""
        kwargs.setdefault('desc','Parsing NLP documents')
        for para_d in self.iter_paras_d(**kwargs):
            yield self.get_nlp_doc(para_d['para_i'],**kwargs)

    def docs(self,**kwargs):
        """Return all parsed NLP documents as a list."""
        return list(self.iter_nlp_docs(**kwargs))

    ####################################
    ## NLP FEATS
    ####################################

    def get_syntax_df(self,para_i,sent_i=None,index=True,**kwargs):
        """Return word rows merged with NLP features for paragraph ``para_i``.

        Args:
            para_i: Paragraph to look up; the merged frame is cached per
                paragraph, so ``kwargs`` only matter on the first call.
            sent_i: If not None, keep only rows with this sentence number.
            index: If truthy and the result is non-empty, apply ``setindex``.
            **kwargs: Forwarded to ``get_nlp_feats_df``.

        Returns:
            A DataFrame with ``para_i`` set on every row.
        """
        if para_i not in self._syntax_d:
            para_doc=self.get_nlp_doc(para_i)
            dffeat=get_nlp_feats_df(para_doc, **kwargs)
            dfword=self.get_words_df(para_i)
            try:
                odf=dfword.merge(dffeat,on=['sent_i','word_i'])
            except KeyError:
                # Best effort: if the merge keys are missing, keep the word rows.
                odf=dfword
            self._syntax_d[para_i]=odf
        # assign() returns a copy, so the cached frame is never modified.
        odf=self._syntax_d[para_i].assign(para_i=para_i)
        if sent_i is not None: odf=odf[odf.sent_i==sent_i]
        if index and isinstance(odf,pd.DataFrame) and len(odf): odf=setindex(odf)
        return odf

    def iter_syntax_df(self,**kwargs):
        """Yield the syntax DataFrame of each paragraph in turn."""
        kwargs.setdefault('desc','Parsing NLP documents')
        for para_d in self.iter_paras_d(**kwargs):
            yield self.get_syntax_df(para_d['para_i'])

    def syntax(self,**kwargs):
        """Return one syntax DataFrame covering every paragraph.

        Per-paragraph frames are concatenated and missing values filled with
        ''. ``setindex`` is only applied to a non-empty result, matching the
        guard in ``get_syntax_df``; with no rows an unindexed empty frame is
        returned.
        """
        frames=[resetindex(xdf) for xdf in self.iter_syntax_df(**kwargs)]
        if not frames: return pd.DataFrame()
        odf=pd.concat(frames).fillna('')
        return setindex(odf) if len(odf) else odf

In [4]:
# Build the path to the sample text and wrap it in a text model.
# PATH_TXTS is not defined in this notebook (presumably it comes from the
# cadence star import and points at the sample-texts directory -- confirm).
joyce_path = os.path.join(PATH_TXTS,'joyce_oxen.txt')
# Alternative input kept for reference: a single inline sentence.
# s='Stately, plump Buck Mulligan came from the stairhead, bearing a bowl of lather on which a mirror and a razor lay crossed.'
joyce = Text(joyce_path)
# Scratch calls for inspecting individual stages; left disabled.
# joyce.paras()
# joyce.words()
# joyce.sylls()
# next(joyce.iter_nlp_docs())
# joyce.get_syntax_df(1)
# joyce.get_nlp_doc(1)
# joyce.get_syntax_df(1)
# joyce.get_words_df(1)

In [9]:
# Combined per-word syntax table across all paragraphs; the cell displays the returned frame.
joyce.syntax()

Parsing NLP documents: 100%|██████████████████████████████████████████████| 10/10 [00:00<00:00, 108.27it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,word_case,word_definite,word_degree,word_depth,word_gender,word_misc,word_mood,word_number,word_numtype,word_person,word_polarity,word_poss,word_prontype,word_tense,word_verbform,word_voice
para_i,sent_i,sent_depth1,sent_depth2,sent_depth3,sent_depth4,sent_depth5,sent_depth6,sent_depth7,sent_depth8,sent_depth9,word_i,sentpart_i,line_i,word_pref,word_str,word_tok,word_lemma,word_upos,word_xpos,word_deprel,word_head,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
1,1,001_0-ROOT(1-NP,001_0-ROOT(1-NP(2-NNP,,,,,,,,1,1,1,,Deshil,deshil,Deshil,PROPN,NNP,root,0,,,,3,,,,Sing,,,,,,,,
1,1,001_0-ROOT(1-NP,001_0-ROOT(1-NP(2-NNP,,,,,,,,2,1,1,,Holles,holles,Holles,PROPN,NNP,flat,1,,,,3,,,,Sing,,,,,,,,
1,1,001_0-ROOT(1-NP,001_0-ROOT(1-NP(2-NNP,,,,,,,,3,1,1,,Eamus.,eamus,Eamus.,PROPN,NNP,flat,1,,,,3,,,,Sing,,,,,,,,
1,2,001_0-ROOT(1-NP,001_0-ROOT(1-NP(2-NNP,,,,,,,,1,2,1,,Deshil,deshil,Deshil,PROPN,NNP,root,0,,,,3,,,,Sing,,,,,,,,
1,2,001_0-ROOT(1-NP,001_0-ROOT(1-NP(2-NNP,,,,,,,,2,2,1,,Holles,holles,Holles,PROPN,NNP,flat,1,,,,3,,,,Sing,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,5,001_0-ROOT(1-S,005_0-ROOT(1-S(2-VP,007_0-ROOT(1-S(2-VP(3-PP,007_0-ROOT(1-S(2-VP(3-PP(4-IN,,,,,,7,8,1,,of,of,of,ADP,IN,case,8,,,,5,,,,,,,,,,,,
10,5,001_0-ROOT(1-S,005_0-ROOT(1-S(2-VP,007_0-ROOT(1-S(2-VP(3-PP,008_0-ROOT(1-S(2-VP(3-PP(4-S,008_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP,008_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP(6-NP,008_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP(6-NP(7-JJ,,,8,8,1,,blushes,blushes,blush,NOUN,NNS,nmod,6,,,,8,,,,Plur,,,,,,,,
10,5,001_0-ROOT(1-S,005_0-ROOT(1-S(2-VP,007_0-ROOT(1-S(2-VP(3-PP,008_0-ROOT(1-S(2-VP(3-PP(4-S,008_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP,008_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP(6-NP,009_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP(6-NP(7-NN,,,9,8,1,,his,his,he,PRON,PRP$,nmod:poss,10,,,,8,Masc,,,Sing,,3,,Yes,Prs,,,
10,5,001_0-ROOT(1-S,005_0-ROOT(1-S(2-VP,007_0-ROOT(1-S(2-VP(3-PP,008_0-ROOT(1-S(2-VP(3-PP(4-S,008_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP,010_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP(6-PP,010_0-ROOT(1-S(2-VP(3-PP(4-S(5-NP(6-PP(7-IN,,,10,8,1,,word,word,word,NOUN,NN,compound,11,,,,8,,,,Sing,,,,,,,,


In [11]:
# Syllable table for the text: one row per syllable, tagged with para_i/sent_i/word_i.
joyce.sylls()

Tokenizing syllables: 100%|██████████████████████████████████████████████| 10/10 [00:00<00:00, 1183.29it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,is_funcword
para_i,sent_i,word_i,word_tok,word_ipa_i,word_ipa,syll_i,syll_str,syll_ipa,Unnamed: 9_level_1
1,1,1,deshil,1,'dɛs.hɪl,1,des,'dɛs,0
1,1,1,deshil,1,'dɛs.hɪl,2,hil,hɪl,0
1,1,2,holles,1,'hɑlz,1,holles,'hɑlz,0
1,1,3,eamus,1,'i.mʌs,1,ea,'i,0
1,1,3,eamus,1,'i.mʌs,2,mus,mʌs,0
...,...,...,...,...,...,...,...,...,...
10,5,8,blushes,1,'blʌ.ʃəz,2,hes,ʃəz,0
10,5,9,his,1,hɪz,1,his,hɪz,1
10,5,10,word,1,'wɛːd,1,word,'wɛːd,0
10,5,11,winning,1,'wɪ.nɪŋ,1,win,'wɪ,0


In [6]:
# NOTE(review): same call as the preceding sylls() cell. This cell's execution
# count (6) is lower than the earlier cells' (9, 11), so the notebook was run
# out of order; re-run top to bottom or drop this duplicate.
joyce.sylls()

Tokenizing syllables: 100%|████████████████████████████████████████████████| 10/10 [00:00<00:00, 20.46it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,is_funcword
para_i,sent_i,word_i,word_tok,word_ipa_i,word_ipa,syll_i,syll_str,syll_ipa,Unnamed: 9_level_1
1,1,1,deshil,1,'dɛs.hɪl,1,des,'dɛs,0
1,1,1,deshil,1,'dɛs.hɪl,2,hil,hɪl,0
1,1,2,holles,1,'hɑlz,1,holles,'hɑlz,0
1,1,3,eamus,1,'i.mʌs,1,ea,'i,0
1,1,3,eamus,1,'i.mʌs,2,mus,mʌs,0
...,...,...,...,...,...,...,...,...,...
10,5,8,blushes,1,'blʌ.ʃəz,2,hes,ʃəz,0
10,5,9,his,1,hɪz,1,his,hɪz,1
10,5,10,word,1,'wɛːd,1,word,'wɛːd,0
10,5,11,winning,1,'wɪ.nɪŋ,1,win,'wɪ,0
