In [1]:
import os,sys; sys.path.insert(0,os.path.abspath('../../..'))
from epistolary import *

In [2]:

class TextSectionLetter(TextSection):
    """A text section representing a single letter.

    Adds `deduce_recip`, which guesses the letter's sender and recipient from
    the text stored in its metadata.
    """

    def deduce_recip(self, meta=None,keys=('txt_front','txt_head')):
        """Guess the sender and recipient of the letter from its byline text.

        Args:
            meta: Mapping of letter metadata (must support ``.get``). When
                ``None``, ``self._meta`` is used instead.
            keys: Names of the metadata fields whose text is concatenated and
                scanned. An immutable tuple is used as the default so the
                default can never be mutated between calls.

        Returns:
            A ``(sender, recip)`` tuple of strings. A value is ``''`` when the
            metadata already holds a truthy ``sender_tok`` / ``recip_tok``
            (nothing is deduced for that role) or when no entity carrying that
            role was found in the text.
        """
        ltr_meta=meta if meta is not None else self._meta

        need_sender = not ltr_meta.get('sender_tok')
        need_recip = not ltr_meta.get('recip_tok')
        if not (need_sender or need_recip):
            # Both tokens are already known: skip the NER import and tagging entirely.
            return '',''

        # Imported lazily so the dependency is only required when tagging is needed.
        from lltk.model.ner import get_ner_sentdf

        # `or ''` guards against fields that are present but set to None.
        txt = '     '.join(
            (ltr_meta.get(argname) or '').replace(' | ',' ')
            for argname in keys
        )

        # NOTE: the bare name `deduce_recip` below is looked up at module level,
        # not on this class -- it is not this method. Presumably it is provided
        # by the notebook's `from epistolary import *`; confirm.
        byline_sentdf = deduce_recip(get_ner_sentdf(txt))
        has_roles = 'epistolary_role' in set(byline_sentdf.columns)

        def _tokens_for(role):
            # Join the text of every row tagged with the given epistolary role.
            if not has_roles: return ''
            return ' '.join(byline_sentdf[byline_sentdf.epistolary_role==role].text)

        sender = _tokens_for('sender') if need_sender else ''
        recip = _tokens_for('recip') if need_recip else ''
        return sender,recip


class TextSectionLetterChadwyck(TextSectionLetter):
    """Letter section whose metadata and plain text are read from its XML markup."""

    # Separators used by `txt` when joining sentences, paragraphs and letters.
    sep_sents='\n'
    sep_paras='\n\n'
    sep_txt='\n\n------------\n\n'

    @property
    def meta(self):
        """Return this letter's metadata, computing and caching it on first use.

        Starts from ``self._meta`` and adds, on that same object:
        ``id_letter``, ``txt_front``, ``txt_head``, ``letter_i``, ``id``,
        ``sender_tok`` and ``recip_tok``. The result is stored in
        ``self._meta_`` and returned directly on later accesses (as long as it
        is truthy).
        """
        if self._meta_: return self._meta_
        meta=self._meta
        ltr_xml=self.xml
        ltr_dom=self.dom

        # new metadata key -> tag (or list of tags) handed to grab_tag_text
        meta_map={
            'id_letter':'idref',
            'txt_front':['front','caption']
        }
        for newtag,xtag in meta_map.items():
            meta[newtag]=clean_text(grab_tag_text(ltr_dom, xtag)) if xtag else ''

        # The heading is the raw text sitting between '</collection>' and
        # '<attbytes>'; it is only extracted when both markers are present.
        ltrtitle=''
        if '</collection>' in ltr_xml and '<attbytes>' in ltr_xml:
            ltrtitle=ltr_xml.split('</collection>')[-1].split('<attbytes>')[0].strip()
        # Drop the heading when it merely repeats the front-matter text.
        meta['txt_head']=ltrtitle if ltrtitle!=meta['txt_front'] else ''
        meta['letter_i']=self.letter_i
        meta['id']=f'L{self.letter_i:03}' if self.letter_i else meta['id_letter']

        ## deduce recips?
        # deduce_recip returns '' for any role whose token is already present in
        # `meta`, so assigning its result unconditionally would erase existing
        # values. Keep what is there and only fill in the gaps.
        sender, recip = self.deduce_recip(meta)
        meta['sender_tok'] = meta.get('sender_tok') or sender
        meta['recip_tok'] = meta.get('recip_tok') or recip

        self._meta_=meta
        return meta


    @property
    def txt(self,*x,**y):
        """Return the plain text of the letter(s) contained in this section.

        Sentences are joined with ``sep_sents``, paragraphs with ``sep_paras``
        and letters with ``sep_txt``; the joined text is passed through
        ``clean_text`` before being returned.

        The ``*x`` / ``**y`` parameters are never populated when this is
        accessed as a property.
        """
        ltr_dom = remove_bad_tags(self.dom, BAD_TAGS)
        # NOTE(review): LTR is not defined on this class in the visible code --
        # presumably it is supplied by a base class or the parent text; confirm.
        letters = list(ltr_dom(self.LTR))
        # No letter elements found: treat the whole DOM as a single letter.
        if not len(letters): letters=[ltr_dom]
        ltxts=[]
        for ltr in letters:
            ptxts=[]
            paras=list(ltr('p'))
            # No <p> elements: treat the whole letter as a single paragraph.
            if not len(paras): paras=[ltr]
            for p in paras:
                sents = p('s')
                if not len(sents):
                    # No <s> markup: fall back to NLTK's sentence tokenizer.
                    sents=nltk.sent_tokenize(p.text)
                else:
                    sents=[s.text.strip() for s in sents]
                # Flatten newlines inside each sentence (sep_sents is itself a
                # newline) and skip empty sentences.
                ptxt=self.sep_sents.join([x.replace('\n',' ') for x in sents if x])
                ptxts.append(ptxt)
            ltrtxt=self.sep_paras.join(ptxts).strip()
            ltxts.append(ltrtxt)
        otxt=self.sep_txt.join(ltxts).strip()
        return clean_text(otxt)


class TextEpistolaryChadwyck(BaseText):
    """Text whose letters are cut out of its XML, one per DIV element that holds an LTR element."""
    DIV='div3'
    LTR='letter'
    SECTION_CLASS=TextSectionLetterChadwyck

    @property
    def letters(self,lim=None,progress=False,**kwargs):
        """Return the letter sections of this text, building them on first access.

        The XML is split on the closing DIV tag; for each piece (except the
        trailing remainder) the text after the first opening DIV tag is kept
        if it contains a closing LTR tag. Every kept piece becomes a section
        with id ``L001``, ``L002``, ... whose ``_xml`` is that piece. The list
        is cached in ``self._letters``.
        """
        # Already built: hand back the cached list.
        if self._letters is not None:
            return self._letters

        self._letters=[]
        div_open=f'<{self.DIV}>'
        div_close=f'</{self.DIV}>'
        ltr_close=f'</{self.LTR}>'

        # Collect the body of every DIV that contains at least one letter.
        bodies=[]
        for piece in self.xml.split(div_close)[:-1]:
            body=piece.split(div_open,1)[-1]
            if ltr_close in body:
                bodies.append(body.strip())

        scanner=tqdm(bodies, disable=not progress, desc='Scanning for letters')
        for number,body in enumerate(scanner, start=1):
            section=self.init_section(f'L{number:03}', letter_i=number)
            section._xml=body
            self._letters.append(section)
        return self._letters


In [14]:
# Load one text from the 'epistolary' corpus (CLAR_ID is defined outside this cell).
t = Text(CLAR_ID, corpus='epistolary')
# Swap the instance's class so it uses the `letters` property of TextEpistolaryChadwyck.
t.__class__ = TextEpistolaryChadwyck
# First access builds the letter sections and caches them on the text.
tl=t.letters
t

[TextEpistolaryChadwyck](_epistolary/_chadwyck/Eighteenth-Century_Fiction/richards.01)

In [15]:
# Spot-check one randomly chosen letter; no seed is set in this cell, so the pick can differ between runs.
ts=random.choice(tl)
print(ts)
# Accessing `meta` computes (and caches) the letter's metadata.
ts.meta

[TextSectionLetterChadwyck](_epistolary/_chadwyck/Eighteenth-Century_Fiction/richards.01/L497)


{'letter_i': 497,
 'id': 'L497',
 'id_letter': 'Z300044567',
 'txt_front': 'Mr. Belford, To James Harlowe, jun. Esq; || The WILL; || To my Executor.',
 'txt_head': 'LETTER [LXXXVI.]',
 'sender_tok': 'Mr. Belford',
 'recip_tok': 'James Harlowe'}

In [16]:
# Show the sampled letter's id, address and path.
ts.id, ts.addr, ts.path

('_chadwyck/Eighteenth-Century_Fiction/richards.01/L497',
 '_epistolary/_chadwyck/Eighteenth-Century_Fiction/richards.01/L497',
 '/Users/ryan/lltk_data/corpora/epistolary/texts/_chadwyck/Eighteenth-Century_Fiction/richards.01/L497')

In [17]:

class TextEpistolary(BaseText):
    """Default text class of the Epistolary corpus: declares no markup tags and yields no letters."""

    DIV = ''
    LTR = ''

    @property
    def letters(self):
        """Return the letters of this text -- always a new, empty list for this class."""
        return []


class Epistolary(BaseCorpus):
    """Corpus of epistolary texts; texts sourced from the 'chadwyck' corpus get a specialised class."""

    NAME = 'Epistolary'
    ID = 'epistolary'
    TEXT_CLASS = TextEpistolary
    CORPORA_TO_START_WITH = ['Chadwyck']

    def init(self,*x,**y):
        """Run the parent initialisation, then recast texts according to their source corpus.

        Every text in ``self._textd`` that has a source whose corpus id is
        ``'chadwyck'`` has its class switched to ``TextEpistolaryChadwyck``.
        All arguments are forwarded unchanged to the parent ``init``.
        """
        super().init(*x,**y)

        for _key, text in self._textd.items():
            # Skip texts without a source or with a non-Chadwyck source.
            if not (text.source and text.source.corpus.id == 'chadwyck'):
                continue
            text.__class__ = TextEpistolaryChadwyck

In [18]:
# Build the corpus and initialise it; init() recasts Chadwyck-sourced texts to TextEpistolaryChadwyck.
C=Epistolary()
C.init()
# C._textd  # uncomment to inspect the collection of texts that init() iterates over

In [19]:
# Print the number of letters found in each text of the corpus, left-aligned in a 4-character column.
# NOTE: the loop variable `t` stays bound to the last text afterwards and is read again by a later cell.
for t in C.texts():
    print(f'{len(t.letters):<4} {t}')

0    [TextEpistolary](_epistolary/Eighteenth-Century_Fiction/richards.02)
155  [TextEpistolaryChadwyck](_epistolary/_chadwyck/Eighteenth-Century_Fiction/richards.04)
84   [TextEpistolaryChadwyck](_epistolary/_chadwyck/Eighteenth-Century_Fiction/burney.01)


In [20]:
# When the notebook is run top to bottom, `t` is the last text left over from the earlier loop over C.texts().
ts=random.choice(t.letters)
ts

[TextSectionLetterChadwyck](_epistolary/_chadwyck/Eighteenth-Century_Fiction/burney.01/L080)

'/Users/ryan/lltk_data/corpora/epistolary/texts/_chadwyck/Eighteenth-Century_Fiction/burney.01'