In [8]:
import os,sys; sys.path.insert(0,os.path.abspath('../../..'))
from epistolary import *

In [9]:
class TextSection(BaseText):
    """A section of a parent ("source") text.

    Location-style attributes (corpus, path, addr) are derived from
    ``self.source``; raw content is read from ``self._txt`` / ``self._xml``.
    """
    _type = 'sections'

    @property
    def corpus(self):
        """The corpus object of the source text."""
        parent = self.source
        return parent.corpus

    @property
    def path(self):
        """Source path joined with this section type and this section's id."""
        parent_path = self.source.path
        return os.path.join(parent_path, self._type, self.id)

    @property
    def addr(self):
        """Source address joined with this section's id."""
        parent_addr = self.source.addr
        return os.path.join(parent_addr, self.id)

    @property
    def txt(self):
        """Stored plain text, or '' when none is set."""
        return self._txt or ''

    @property
    def xml(self):
        """Stored XML, or '' when none is set."""
        return self._xml or ''


class TextSectionLetter(TextSection):
    """A text section that is a single letter."""
    _type='letters'

    def deduce_recip(self, keys=('txt_front','txt_head')):
        """Fill in missing ``sender_tok`` / ``recip_tok`` entries of the letter metadata.

        The metadata fields named in ``keys`` are concatenated into one byline
        string and passed to ``get_ner_sentdf``. Rows of the resulting frame
        whose ``epistolary_role`` is ``'sender'`` / ``'recip'`` supply the
        tokens. Entries that already hold a truthy value are left alone.

        Args:
            keys: names of metadata fields to build the byline text from.

        Returns:
            ``self._meta`` (mutated in place), with ``sender_tok`` and
            ``recip_tok`` set; each is '' when the NER frame has no
            ``epistolary_role`` column.
        """
        ltr_meta=self._meta

        # (role value in the NER frame, metadata key it fills)
        wanted=[('sender','sender_tok'),('recip','recip_tok')]
        missing=[(role,metakey) for role,metakey in wanted if not ltr_meta.get(metakey)]
        if not missing: return ltr_meta

        # Imported only when NER is actually needed.
        from lltk.model.ner import get_ner_sentdf

        # `or ''` keeps a field stored as None from crashing on .replace
        txt = '     '.join(
            (ltr_meta.get(argname) or '').replace(' | ',' ')
            for argname in keys
        )

        # One NER pass serves both roles.
        byline_sentdf = get_ner_sentdf(txt)
        has_roles = 'epistolary_role' in set(byline_sentdf.columns)
        for role,metakey in missing:
            if has_roles:
                ltr_meta[metakey]=' '.join(byline_sentdf[byline_sentdf.epistolary_role==role].text)
            else:
                ltr_meta[metakey]=''

        return ltr_meta


class TextSectionLetterChadwyck(TextSectionLetter):
    """A letter section whose metadata and text are parsed from its XML/DOM."""
    sep_sents='\n'
    sep_paras='\n\n'
    sep_txt='\n\n------------\n\n'

    @property
    def meta(self):
        """Letter metadata, enriched from the XML on first access and then cached.

        Adds ``id_letter``, ``txt_front``, ``txt_head``, ``letter_i`` and ``id``
        to ``self._meta`` (in place) and stores the result in ``self._meta_``.
        """
        cached=self._meta_
        if cached: return cached

        record=self._meta
        raw_xml=self.xml
        dom=self.dom

        # metadata key -> tag (or list of tags) handed to grab_tag_text
        tag_sources=(
            ('id_letter','idref'),
            ('txt_front',['front','caption']),
        )
        for key,tag in tag_sources:
            record[key]=clean_text(grab_tag_text(dom, tag)) if tag else ''

        # Heading = raw text between the last '</collection>' and the next '<attbytes>'.
        heading=''
        if '</collection>' in raw_xml and '<attbytes>' in raw_xml:
            after_collection=raw_xml.split('</collection>')[-1]
            heading=after_collection.split('<attbytes>')[0].strip()
        # A heading that merely repeats the front matter is dropped.
        if heading==record['txt_front']: heading=''
        record['txt_head']=heading

        idx=self.letter_i
        record['letter_i']=idx
        record['id']=f'L{idx:03}' if idx else record['id_letter']

        self._meta_=record
        return record


    @property
    def txt(self,*x,**y):
        """Plain text of the letter(s) in this section.

        Sentences are joined with ``sep_sents``, paragraphs with ``sep_paras``
        and letters with ``sep_txt``; the result is passed through ``clean_text``.
        """
        dom = remove_bad_tags(self.dom, BAD_TAGS)

        # NOTE(review): LTR is not defined on this class in the visible source;
        # presumably it is resolved elsewhere (base class / source) — confirm.
        # With no letter elements found, the whole DOM is treated as one letter.
        letter_nodes = list(dom(self.LTR)) or [dom]

        def paragraph_text(p):
            # Use <s> elements when present, otherwise sentence-tokenize with nltk.
            tagged=p('s')
            if len(tagged):
                sents=[s.text.strip() for s in tagged]
            else:
                sents=nltk.sent_tokenize(p.text)
            # Line breaks inside a sentence become spaces; empty sentences are skipped.
            return self.sep_sents.join(s.replace('\n',' ') for s in sents if s)

        letter_texts=[]
        for node in letter_nodes:
            # With no <p> elements, the letter itself is the single paragraph.
            paras=list(node('p')) or [node]
            body=self.sep_paras.join(paragraph_text(p) for p in paras)
            letter_texts.append(body.strip())

        return clean_text(self.sep_txt.join(letter_texts).strip())


class TextEpistolaryChadwyck(BaseText):
    """A text whose XML is split into letter sections, one per DIV element containing a letter."""
    DIV='div3'
    LTR='letter'


    def letters(self,lim=None,progress=False,**kwargs):
        """Return this text's letters as ``TextSectionLetterChadwyck`` objects.

        The XML is cut at every closing DIV tag; a chunk is kept only if the
        part after its opening DIV tag contains a closing LTR tag. Letters are
        numbered from 1 in document order with ids ``L001``, ``L002``, ...
        The full list is cached on ``self._letters``.

        Args:
            lim: if given, return only the first ``lim`` letters. The cache
                always holds the full list.
            progress: show a tqdm progress bar while scanning.
            **kwargs: accepted for call compatibility; unused.

        Returns:
            The cached list itself when ``lim`` is None, else a sliced copy.
        """
        if getattr(self,'_letters',None) is None:
            open_tag,close_tag=f'<{self.DIV}>',f'</{self.DIV}>'
            ltr_close=f'</{self.LTR}>'

            # Whatever follows the last closing DIV tag is not a division, hence [:-1].
            div_strs=[]
            for chunk in self.xml.split(close_tag)[:-1]:
                inner=chunk.split(open_tag,1)[-1]
                if ltr_close in inner: div_strs.append(inner.strip())

            found=[]
            iterr=tqdm(div_strs, disable=not progress, desc='Scanning for letters')
            for letter_i,ltrxml in enumerate(iterr, start=1):
                ltr=TextSectionLetterChadwyck(f'L{letter_i:03}', _source=self,letter_i=letter_i)
                ltr._xml=ltrxml
                found.append(ltr)

            # Assigned only after a complete scan, so a failure midway cannot
            # leave a partial list cached.
            self._letters=found

        return self._letters if lim is None else self._letters[:lim]



In [10]:
# Load the text identified by CLAR_ID from the 'epistolary' corpus, then swap its class
# in place so the TextEpistolaryChadwyck.letters() defined above can be tried on it.
t = Text(CLAR_ID, corpus='epistolary')
t.__class__ = TextEpistolaryChadwyck
tl=t.letters()


In [11]:
# Spot-check one letter chosen at random: show its metadata, path and section type.
# NOTE(review): no random seed is set in the visible cells, so the letter shown changes per run.
ts=random.choice(tl)
display(ts.meta)
ts.path, ts._type

{'letter_i': 443,
 'id': 'L443',
 'id_letter': 'Z300044513',
 'txt_front': 'Mr. Belford, To Robert Lovelace, Esq; || CLARISSA HARLOWE. | APRIL X. | [Then the year] | Ætat. xix.',
 'txt_head': 'LETTER XXXII.'}

('/Users/ryan/lltk_data/corpora/epistolary/texts/_chadwyck/Eighteenth-Century_Fiction/richards.01/letters/L443',
 'letters')

In [None]:
class TextEpistolary(Text):
    """Text class for the Epistolary corpus: letter tables, character tables, letter networks.

    NOTE(review): ``compile`` operates on ``self`` as the text, while the other
    methods take the text as ``t`` and read ``self.path_chars`` /
    ``self.compile_text``, neither of which is defined in this class —
    confirm they are provided elsewhere.
    """

    def compile(self,lim=None,progress=True,force=False,*x,**y):
        """Build, or load from cache, the per-letter metadata table for this text.

        If the letters CSV already exists and ``force`` is false, it is read
        back and merged with any annotation columns from ``load_with_anno``
        (matched on ``id``; annotation columns starting with ``id_`` are
        ignored). Otherwise the letters are extracted with
        ``epistolarized_chadwyck_t``, each letter's xml/txt is written to its
        ``path_xml`` / ``path_txt``, and the metadata table is saved.

        Args:
            lim, progress, *x, **y: forwarded to ``epistolarized_chadwyck_t``.
            force: rebuild even if the letters CSV exists.

        Returns:
            A DataFrame of letter metadata (empty if no letters were found).
        """
        # NOTE(review): `C` is a notebook-level global, not an attribute of this class —
        # presumably the Epistolary corpus instance; confirm.
        # Paths are derived from `self`, not from whatever global `t` happens to be bound.
        ofn_ltrs = os.path.join(C.paths['path_letters'], self.corpus.id, self.id + '.csv')
        ofn_xml = os.path.join(C.paths['path_xml'], self.corpus.id, self.id + '.xml')
        ofn_txt = os.path.join(C.paths['path_txt'], self.corpus.id, self.id + '.txt')

        if not force and os.path.exists(ofn_ltrs):
            odf = read_df(ofn_ltrs)
            odf_anno=load_with_anno(ofn_ltrs)
            if len(odf_anno) and 'id' in set(odf_anno.columns):
                odf_anno=odf_anno.set_index('id')
                odf_anno=odf_anno[[col for col in odf_anno.columns if not col.startswith('id_')]]
                odx_anno=dict((idx, dict(row)) for idx,row in odf_anno.iterrows())

                # Every annotation column is copied onto the cached table; '' where a letter has none.
                newkeys=set(rowdk for idx,rowd in odx_anno.items() for rowdk in rowd)
                for nk in newkeys:
                    odf[nk]=[odx_anno.get(idx,{}).get(nk,'') for idx in odf.id]

            return fix_meta(odf)

        # Only the parent directories of these three paths are needed here.
        for ofn in (ofn_xml, ofn_txt, ofn_ltrs):
            os.makedirs(os.path.dirname(ofn), exist_ok=True)

        tdf=epistolarized_chadwyck_t(C,t=self,lim=lim,progress=progress,*x,**y)
        if not len(tdf): return pd.DataFrame()

        todf=fix_meta(tdf)

        # Write each letter's xml / txt to the path recorded on its own row.
        for pxml,txml in zip(tdf.path_xml,tdf.xml):
            if pxml and txml:
                with open(pxml,'w') as of: of.write(txml)
        for ptxt,ttxt in zip(tdf.path_txt,tdf.txt):
            if ptxt and ttxt:
                with open(ptxt,'w') as of: of.write(ttxt)

        for needcol in ('sender_tok','sender_id','recip_tok','recip_id','front','title_letter'):
            if needcol not in set(todf.columns): todf[needcol]=''

        # Keyword form: the positional axis argument of DataFrame.drop is rejected by pandas 2.x.
        badcols=[col for col in ('txt','xml','path_txt','path_xml') if col in set(todf.columns)]
        todf=todf.drop(columns=badcols)

        cols=[
            'id','id_corpus','id_text','id_letter','letter_i',
            'txt_head',
            'txt_front',
            'sender_tok',#'sender_id',
            'recip_tok',#'recip_id',
        ]
        odf_ltrs=todf[[c for c in cols if c in set(todf.columns)]].fillna('')
        save_df(fix_meta(odf_ltrs), ofn_ltrs, verbose=True)
        return odf_ltrs


    def compile_text_chars(self,t,tdf=None,force=False,force_inner=False,verbose=False,*x,**y):
        """Build, or load from cache, the two character tables for text ``t``.

        ``tok2id.csv`` holds ``char_tok``, ``char_id`` and ``char_tok_count``;
        ``id2meta.csv`` holds one row per ``char_id``. Both are then overlaid
        with annotations from ``load_with_anno``.

        Args:
            t: the text; ``t.corpus.id`` and ``t.id`` locate the files under ``self.path_chars``.
            tdf: letters table to derive characters from; compiled when None.
            force: rebuild both tables even if their files exist.
            force_inner: passed as ``force`` to ``self.compile_text``.
            verbose: display the frames and pass ``verbose`` to ``save_df``.

        Returns:
            ``(tok2id, id2meta)`` DataFrames with missing values replaced by ''.
        """
        id_corpus=t.corpus.id
        id_text=t.id

        ofn_tok2id = os.path.join(self.path_chars, id_corpus, id_text, 'tok2id.csv')
        ofn_id2meta = os.path.join(self.path_chars, id_corpus, id_text, 'id2meta.csv')

        if force or not os.path.exists(ofn_tok2id):
            # NOTE(review): `compile_text` is not defined in this class (only `compile` is) — confirm.
            if tdf is None: tdf=self.compile_text(t,force=force_inner,*x,**y)
            odf_tok2id=calculate_tok2id(tdf.fillna(''))
            save_df(odf_tok2id, ofn_tok2id, verbose=verbose, index=False)
            if verbose: display(odf_tok2id)
        else:
            odf_tok2id=read_df(ofn_tok2id)

        # Annotated token -> id pairs override the computed ids.
        odf_tok2id_anno=load_with_anno(ofn_tok2id)
        if len(odf_tok2id_anno):
            d_tok2id_anno=dict(zip(odf_tok2id_anno.char_tok, odf_tok2id_anno.char_id))
            odf_tok2id['char_id']=[d_tok2id_anno.get(ctok,cid) for ctok,cid in zip(odf_tok2id.char_tok,odf_tok2id.char_id)]

        ## id2meta
        if force or not os.path.exists(ofn_id2meta):
            # Sum token counts per character id (several tokens can map to one id).
            char_ids = Counter()
            for char_id,char_tok_count in zip(odf_tok2id.char_id, odf_tok2id.char_tok_count):
                char_ids[char_id]+=char_tok_count
            # One row per id, most frequent first, seeded with the initial metadata keys.
            id2meta_l=[]
            for char_id,char_id_count in char_ids.most_common():
                char_dx={'char_id':char_id, 'char_id_count':char_id_count, **chardata_metakeys_initial}
                id2meta_l.append(char_dx)
            odf_id2meta = pd.DataFrame(id2meta_l).fillna('')
            save_df(odf_id2meta, ofn_id2meta, verbose=verbose, index=False)
            if verbose: display(odf_id2meta)
        else:
            odf_id2meta=read_df(ofn_id2meta)

        # Overlay annotations, except the count column, which stays as computed.
        odf_id2meta=odf_id2meta.set_index('char_id')
        odf_id2meta_anno=load_with_anno(ofn_id2meta)
        if len(odf_id2meta_anno) and 'char_id' in set(odf_id2meta_anno.columns):
            odf_id2meta_anno=odf_id2meta_anno.fillna('').set_index('char_id')
            odf_id2meta_anno=odf_id2meta_anno[[col for col in odf_id2meta_anno if col not in {'char_id_count'}]]
            odf_id2meta.update(odf_id2meta_anno)
        odf_id2meta=odf_id2meta.reset_index()

        return odf_tok2id.fillna(''), odf_id2meta.fillna('')

    def get_letters(self,t,force=False,*x,**y):
        """Return the letters table of ``t`` with sender/recipient ids and metadata attached.

        ``sender_id`` / ``recip_id`` are looked up from the token via the
        tok2id table (falling back to the token itself). The matching id2meta
        rows are left-merged in with their ``char_`` column prefix replaced by
        ``sender_`` / ``recip_``.
        """
        # NOTE(review): `compile_text` is not defined in this class — confirm it exists elsewhere.
        df_letters=self.compile_text(t,force=force,*x,**y)
        df_tok2id,df_id2meta=self.compile_text_chars(t,force=force,*x,**y)
        d_tok2id=dict(zip(df_tok2id.char_tok, df_tok2id.char_id))
        # Drop id2meta columns that would collide with letter columns on merge.
        cols_id2meta_uniq=[col for col in df_id2meta if col not in set(df_letters.columns)]
        df_id2meta=df_id2meta[cols_id2meta_uniq]

        for sndr in ['sender','recip']:
            sidkey=f'{sndr}_id'
            df_letters[sidkey]=df_letters[f'{sndr}_tok'].apply(lambda stok: d_tok2id.get(stok,stok))
            senders = set(df_letters[sidkey])
            # rename() returns a new frame, so the filtered selection is never mutated.
            df_id2meta_now = df_id2meta[df_id2meta.char_id.isin(senders)].rename(
                columns=lambda col: col.replace('char_',f'{sndr}_')
            )
            if sidkey in set(df_id2meta_now.columns):
                df_letters = df_letters.merge(df_id2meta_now, how='left', on=sidkey)

        return df_letters


    def iter_letter_networks(self,t,dfletters=None,*x,**y):
        """Delegate to ``iter_letter_networks_from_dfletters``, computing the letters table if not given."""
        dfletters=self.get_letters(t,*x,**y) if dfletters is None else dfletters
        return iter_letter_networks_from_dfletters(dfletters,*x,**y)

    def get_letter_network(self,t,dfletters=None,*x,**y):
        """Delegate to ``get_letter_network_from_dfletters``, computing the letters table if not given."""
        dfletters=self.get_letters(t,*x,**y) if dfletters is None else dfletters
        return get_letter_network_from_dfletters(dfletters,*x,**y)







class Epistolary(Corpus):
    """Corpus subclass for the epistolary collection (display name 'Epistolary', id 'epistolary')."""
    NAME='Epistolary'
    ID='epistolary'
    # Text class tied to this corpus — presumably used by the Corpus base to build texts; TODO confirm.
    TEXT_CLASS=TextEpistolary
    # NOTE(review): nothing in the visible source reads this; presumably the source corpora to seed from — confirm.
    CORPORA_TO_START_WITH = ['Chadwyck']


# C = Epistolary()
# C.meta

In [None]:
# Instantiate the Epistolary corpus defined above and call its init().
C=Epistolary()
C.init()

In [None]:
# Take the last text of the corpus and show it next to its source.
# NOTE(review): this rebinds the global `t` that an earlier cell bound to Text(CLAR_ID, ...).
t=C.texts()[-1]
t, t.source

In [None]:
# Display everything C.texts() returns (the whole collection, not a sample).
C.texts()

In [None]:
# t = Text(id='_chadwyck/Eighteenth-Century_Fiction/burney.01')
# t.meta

In [None]:
# Chad=load('Chadwyck')
# Chad.init()
# Chad.meta

In [None]:
# `Chad` is otherwise only bound by a later cell (the earlier binding is commented out),
# so load it here to keep this cell runnable on a fresh kernel.
Chad=load('Chadwyck')
meta=Chad.load_metadata_file()


In [None]:


# def load_metadata_from_df_or_fn(idf,force=False,**attrs):
#     if type(idf)==str: idf=read_df(idf)
#     if idf is None or not len(idf): return pd.DataFrame()
#     #return df_requiring_id_and_corpus(idf,**attrs)
#     return df_requiring_id(idf,**attrs).fillna('')


# def df_requiring_id(df,idkey='id',fillna='',*x,**y):
#     if df is None or not len(df): return pd.DataFrame(columns=[],index=[]).rename_axis(idkey)
#     if df.index.name==idkey and not idkey in set(df.columns): df=df.reset_index()
#     if not idkey in set(df.columns): df[idkey]=''
#     df[idkey]=df[idkey].fillna('')
#     df[idkey]=[(idx if idx else f'X{i+1:04}') for i,idx in enumerate(df[idkey])]
#     df=df.fillna(fillna) if fillna is not None else df
#     df=df.set_index(idkey)
#     return df

In [None]:
# Rebind `t` to the second text of the corpus and display it.
t=C.texts()[1]
t

In [None]:
# Show the metadata of t's source text.
t.source.meta

In [None]:
# !echo $PATH

In [None]:
# A text id kept at hand for experiments.
# NOTE(review): no later cell visible here reads `idx`.
idx='_chadwyck/Eighteenth-Century_Fiction/richards.04'

In [None]:
# Rebuild the corpus object, re-run init(), and show the metadata of C.t.
# NOTE(review): `C.t` is not defined in this notebook — presumably an example text exposed by Corpus; confirm.
C = Epistolary()
C.init()
C.t.meta

In [None]:
# Rebind `t` to C.t and show its raw `_meta` attribute (as opposed to the `meta` shown above).
t=C.t
t._meta

In [None]:
# Call init_metadata() on the corpus that t's source text belongs to.
t.source.corpus.init_metadata()

In [None]:
# t.id, t.source.id

In [None]:
# Show t's metadata after the init_metadata() call above.
t.meta

In [None]:
# Call the text's own source_text() method.
t.source_text()

In [None]:
# Same call through a module-level source_text() function.
# NOTE(review): this function is not defined in the notebook — presumably it comes from the star import; confirm.
source_text(t)

In [None]:
# Load the 'Chadwyck' corpus and show its metadata.
# NOTE(review): an earlier cell already uses `Chad`; on a top-to-bottom run this is its first live binding.
Chad=load('Chadwyck')
Chad.meta

In [None]:
# Earlier attempt, kept for reference:
# load_metadata_from_df_or_fn(meta, id_corpus_default='chad')
# Build a one-row metadata frame by hand and pass it through load_metadata_from_df_or_fn.
# NOTE(review): the only definition of that function visible in this notebook is commented out —
# presumably a live one comes from the star import; confirm.
meta_final=pd.DataFrame([
    {'id_text':'Eighteenth-Century_Fiction/richards.01', 'final':'!', 'id_corpus':'chadwyck'},
])
load_metadata_from_df_or_fn(meta_final)
# meta_final

In [None]:
# Fresh corpus object; init() is not called here, unlike the earlier cells.
C = Epistolary()


In [None]:
# load_metadata_from_df_or_fn(meta)

In [None]:
# Display `meta` — presumably still the value returned by Chad.load_metadata_file() in an earlier cell.
meta

In [None]:
# meta.update??

In [None]:
# Call the module-level init_metadata() on a fresh Epistolary corpus, passing `meta` as meta_init
# and the Chadwyck corpus as other_meta.
# NOTE(review): `init_metadata` and `lltk` are not bound in any visible cell — presumably via the star import; confirm.
init_metadata(Epistolary(),meta_init=meta,other_meta=[lltk.load('Chadwyck')])

In [None]:
# Same step through the corpus's own init_metadata() method, called without arguments.
C=Epistolary()
C.init_metadata()

In [None]:
# Show the corpus's path_metadata_init attribute.
C.path_metadata_init

In [None]:
# Call load_metadata() on the corpus and display what it returns.
C.load_metadata()

In [None]:
# Fresh corpus object again, discarding the state of the previous `C`.
C=Epistolary()

In [None]:
# NOTE(review): earlier cells call C.texts(); without parentheses this displays the attribute
# itself rather than the texts — confirm whether the call was intended.
C.texts

In [None]:
# Fetch an object via get_clarissa() and wrap it in SourceTextEpistolary with an extra keyword.
# NOTE(review): neither name is defined in the visible cells (presumably from the star import),
# and `clary` is rebound from the fetched object to the wrapper on the second line.
clary = get_clarissa()
clary = SourceTextEpistolary(clary, hello='x!?')