# New nlpfuncs for hathi/digitizedwork

In [1]:
%run ppa_init.ipynb

In [2]:
digiworks=[t for t in DigitizedWork.objects.all() if t.hathi]
len(digiworks)

INFO:parasolr.django.solrclient:Connecting to default Solr http://localhost:8983/solr/ppanlp


5124

## Getting full text

### Hathi Trust: via page data

In [3]:
# Adapted from hathi_page_index_data

def iter_pages(self, page_span=None, **meta):
    """Yield one dictionary of page data per page of this Hathi volume.

    The METS record at ``self.metsfile_path()`` supplies the page list
    (``structmap_pages``); the text of each page is read directly out of
    the zip archive at ``self.zipfile_path()`` without extracting it.

    Args:
        page_span: optional container of 1-based page positions. When it
            is truthy, pages whose position is not in it are skipped.
        **meta: accepted but not used by this function.

    Yields:
        dict with the keys ``source_id``, ``page_id``, ``page_i``,
        ``content``, ``order``, ``label``, ``tags`` and ``item_type``.

    If the pairtree lookup for the METS file raises
    ``ObjectNotFoundException`` the generator ends without yielding.
    """
    from zipfile import ZipFile
    from eulxml.xmlmap import load_xmlobject_from_file
    from ppa.archive.hathi import MinimalMETS
    from pairtree import storage_exceptions

    # the METS record describes the pages; without it there is nothing to yield
    try:
        mets_record = load_xmlobject_from_file(self.metsfile_path(), MinimalMETS)
    except storage_exceptions.ObjectNotFoundException:
        return

    # page text is read from inside the archive, which is never unpacked
    with ZipFile(self.zipfile_path()) as volume_zip:
        # positions are counted from 1, following the METS structmap order
        for position, mets_page in enumerate(mets_record.structmap_pages, 1):
            if page_span and position not in page_span:
                continue

            # zip member names always use "/" whatever the host OS
            member_name = f'{self.content_dir}/{mets_page.text_file_location}'

            with volume_zip.open(member_name) as page_fh:
                try:
                    record = {
                        "source_id": self.hathi_id,
                        "page_id": mets_page.text_file.sequence,
                        "page_i": position,
                        "content": page_fh.read().decode("utf-8"),
                        "order": mets_page.order,
                        "label": mets_page.display_label,
                        # label holds comma-separated tags; empty/None means no tags
                        "tags": mets_page.label.split(", ") if mets_page.label else [],
                        "item_type": "page",
                    }
                    yield record
                except StopIteration:
                    return

                
HathiObject.iter_pages = iter_pages

In [4]:
work = random.choice(digiworks)
hathiobj = work.hathi
df_pages = pd.DataFrame(hathiobj.iter_pages())
df_pages

Unnamed: 0,source_id,page_id,page_i,content,order,label,tags,item_type
0,mdp.39015059898422,1,1,\nMftT\n3 9015 00394 266 4\nUniversity of Mich...,1,1,"[FRONT_COVER, IMAGE_ON_PAGE, IMPLICIT_PAGE_NUM...",page
1,mdp.39015059898422,2,2,\n,2,2,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]",page
2,mdp.39015059898422,3,3,,3,3,[IMPLICIT_PAGE_NUMBER],page
3,mdp.39015059898422,4,4,,4,4,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
4,mdp.39015059898422,5,5,A\nSYSTEM.\nOF\nPHONIC WRITING.\nBY\nCHARLES M...,5,1,"[TITLE, FIRST_CONTENT_CHAPTER_START, UNTYPICAL...",page
5,mdp.39015059898422,6,6,"N f 7'\n<\nEntered, according to Act of Congre...",6,2,"[UNTYPICAL_PAGE, IMPLICIT_PAGE_NUMBER]",page
6,mdp.39015059898422,7,7,PREFACE.\nPhonic Writing consists of three kin...,7,3,"[CHAPTER_START, UNTYPICAL_PAGE, IMPLICIT_PAGE_...",page
7,mdp.39015059898422,8,8,,8,4,[IMPLICIT_PAGE_NUMBER],page
8,mdp.39015059898422,9,9,A SYSTEM OF PHONIC WRITING.\nPHONOSCRIPT AND P...,9,5,"[CHAPTER_START, IMPLICIT_PAGE_NUMBER]",page
9,mdp.39015059898422,10,10,G\nA SYSTEM OF PHONIC WRITING.\nown. That is t...,10,6,[],page


### From pages to strings

In [5]:
def get_txt_hathi(self,page_sep='\n',line_sep='\r'):
    """Return the text of every page of this volume as one string.

    Each page's ``content`` (empty string when the key is missing) is
    passed through ``standardize_newlines(..., correct=line_sep)`` and the
    results are joined with ``page_sep``.

    Args:
        page_sep: string placed between consecutive pages.
        line_sep: value handed to ``standardize_newlines`` as ``correct``
            (presumably the newline form used inside a page -- confirm
            against that helper).
    """
    page_texts = []
    for page in self.iter_pages():
        raw_text = page.get('content', '')
        page_texts.append(standardize_newlines(raw_text, correct=line_sep))
    return page_sep.join(page_texts)

HathiObject.get_txt = get_txt_hathi

In [6]:
def get_txt_digiwork(self, *args, **kwargs):
    """Return the full text of this work by delegating to ``self.hathi``.

    All positional and keyword arguments are forwarded unchanged to
    ``self.hathi.get_txt``.

    Raises:
        NotImplementedError: when ``self.hathi`` is falsy, i.e. there is
            no Hathi object to read text from.
    """
    hathi_obj = self.hathi
    if not hathi_obj:
        raise NotImplementedError
    return hathi_obj.get_txt(*args, **kwargs)

DigitizedWork.get_txt = get_txt_digiwork

In [7]:
work = random.choice(digiworks)
work.get_txt()[:100]

'\rUC-NRLF\r$B 257 874\rHANDBOON\rYA 00396\r\nJACOB VOORSANGER MEMORIAL\r\rSITATIS,\rBASARAKAT\rVIVERSI\rLIRIKRA'

In [8]:
work.hathi.get_txt()[:100]

'\rUC-NRLF\r$B 257 874\rHANDBOON\rYA 00396\r\nJACOB VOORSANGER MEMORIAL\r\rSITATIS,\rBASARAKAT\rVIVERSI\rLIRIKRA'

## Tokenizing

In [9]:
tokenize("This—iswefwe®qa test")

['this', 'iswefwe', 'qa', 'test']

In [10]:
def get_tokens(self):
    """Return the result of running ``tokenize`` on this work's full text.

    The text comes from ``self.get_txt()`` called with its default
    arguments.
    """
    full_text = self.get_txt()
    return tokenize(full_text)

DigitizedWork.get_tokens = get_tokens

In [11]:
work.get_tokens()[:5]

['uc', 'nrlf', 'b', 'handboon', 'ya']

## Counting

In [12]:
def calc_counts(self):
    """Count the tokens of this work and return them as a plain dict.

    Returns:
        dict mapping each token from ``self.get_tokens()`` to the number
        of times it occurs, with keys inserted in ``most_common()`` order.
    """
    tallies = Counter(self.get_tokens())
    # most_common() with no argument returns every (token, count) pair
    return {token: total for token, total in tallies.most_common()}

DigitizedWork.calc_counts = calc_counts

In [13]:
work.calc_counts().get('ballad',0)

0

### Involving the db

In [14]:
def get_or_create_counts(self):
    """Return this work's word counts, using ``WorkWordCounts`` as a cache.

    If a ``WorkWordCounts`` row exists for this work, its ``data`` is
    returned. Otherwise the counts are computed with ``self.calc_counts()``,
    an attempt is made to store them, and the freshly computed counts are
    returned.

    Storing is best-effort: any exception raised while saving is logged at
    debug level and does not prevent the counts from being returned.
    """
    try:
        return WorkWordCounts.objects.get(work=self).data
    except WorkWordCounts.DoesNotExist:
        pass

    countd = self.calc_counts()
    try:
        # The payload goes in ``defaults`` so the lookup is by work only.
        # Passing ``data`` as a lookup kwarg makes the database match on
        # the whole counts dictionary, and a row written meanwhile by
        # another worker would not be found.
        WorkWordCounts.objects.get_or_create(work=self, defaults={"data": countd})
    except Exception as e:
        log.debug(e)
    return countd

def get_counts(self):
    """Return this work's word counts ordered from most to least frequent.

    The counts come from ``self.get_or_create_counts()`` and are re-sorted
    through ``Counter.most_common()`` before being returned as a dict.
    """
    stored_counts = self.get_or_create_counts()
    ranked_pairs = Counter(stored_counts).most_common()
    return dict(ranked_pairs)

DigitizedWork.get_or_create_counts = get_or_create_counts
DigitizedWork.get_counts = get_counts

In [15]:
work=random.choice(digiworks)
work

<DigitizedWork: inu.30000035051881>

In [16]:
def cache_all(works=None, num_proc=8):
    """Run ``get_or_create_counts`` over a collection of works via ``pmap_run``.

    Args:
        works: the works to process. Defaults to the notebook-level
            ``digiworks`` list, looked up when the function is called.
        num_proc: value passed to ``pmap_run`` as ``num_proc``.

    Calling ``cache_all()`` with no arguments behaves as before: every work
    in ``digiworks`` is processed with ``num_proc=8``.
    """
    if works is None:
        works = digiworks
    pmap_run(
        get_or_create_counts,
        works,
        num_proc=num_proc,
    )

# cache_all()

### Involving the DB in another way: WorkWordCount

In [17]:
def get_or_create_counts2(self):
    """Return this work's word counts, using ``WorkWordCount`` as a cache.

    If a ``WorkWordCount`` row exists for this work, its ``data`` is
    returned. Otherwise the counts are computed with ``self.calc_counts()``,
    an attempt is made to store them, and the freshly computed counts are
    returned.

    Storing is best-effort: any exception raised while saving is logged at
    debug level and does not prevent the counts from being returned.
    """
    # NOTE(review): do_store_counts in this notebook creates WorkWordCount
    # rows with ``word`` and ``count`` fields (one row per word). Confirm
    # that the model still has a ``data`` field and a single row per work
    # before relying on this function.
    try:
        return WorkWordCount.objects.get(work=self).data
    except WorkWordCount.DoesNotExist:
        pass

    countd = self.calc_counts()
    try:
        # The payload goes in ``defaults`` so the lookup is by work only
        # rather than by the whole counts dictionary.
        WorkWordCount.objects.get_or_create(work=self, defaults={"data": countd})
    except Exception as e:
        log.debug(e)
    return countd

def get_counts2(self):
    """Return this work's word counts ordered from most to least frequent.

    The counts come from ``self.get_or_create_counts2()`` (the
    ``WorkWordCount``-backed variant) and are re-sorted through
    ``Counter.most_common()`` before being returned as a dict.
    """
    # Previously this called get_or_create_counts(), so the "2" variant
    # silently read from the other cache table.
    stored_counts = self.get_or_create_counts2()
    return dict(Counter(stored_counts).most_common())

DigitizedWork.get_or_create_counts2 = get_or_create_counts2
DigitizedWork.get_counts2 = get_counts2

In [28]:
# countd = work.calc_counts()
# countd

In [18]:
def store_counts(self, **opts):
    """Compute this work's word counts and store them one word at a time.

    Builds one ``(work, word, count)`` tuple per entry of
    ``self.calc_counts()`` and hands the list to ``pmap_run`` together
    with ``do_store_counts``.

    Args:
        **opts: forwarded unchanged to ``pmap_run``.
    """
    tallies = self.calc_counts()
    jobs = []
    for token, total in tallies.items():
        jobs.append((self, token, total))
    pmap_run(do_store_counts, jobs, **opts)


def do_store_counts(workwordcount):
    """Store the count of a single word in a single work.

    Args:
        workwordcount: a ``(work, word, count)`` tuple, as built by
            ``store_counts``.

    The ``Word`` row for the token is fetched or created, then a
    ``WorkWordCount`` row for the (work, word) pair is fetched or created.
    Any exception raised by the database calls is logged at debug level
    and swallowed, so one failing word does not stop the others.
    """
    work, word, count = workwordcount
    try:
        wordobj, _ = Word.objects.get_or_create(token=word)
        # ``count`` belongs in ``defaults``: the row is identified by
        # (work, word) alone. With ``count`` as a lookup kwarg, a re-run
        # with a different count would miss the existing row and try to
        # insert a second one. An existing row's count is left as stored.
        WorkWordCount.objects.get_or_create(
            work=work,
            word=wordobj,
            defaults={"count": count},
        )
    except Exception as e:
        logging.debug(e)


DigitizedWork.store_counts = store_counts

In [20]:
work.store_counts(lim=None, num_proc=8)

Mapping do_store_counts() [x8]: 100%|██████████| 13429/13429 [00:19<00:00, 703.19it/s]


In [21]:
# qset=WorkWordCount.objects.filter(work=work)
