# New NLP helper functions for HathiObject and DigitizedWork

In [1]:
%run ppa_init.ipynb

In [2]:
# Keep only the digitized works that have an associated Hathi object,
# then display how many there are.
digiworks = [
    digwork
    for digwork in DigitizedWork.objects.all()
    if digwork.hathi
]
len(digiworks)

INFO:parasolr.django.solrclient:Connecting to default Solr http://localhost:8983/solr/ppanlp


5124

## Getting full text

### HathiTrust: via page data

In [3]:
# Adapted from hathi_page_index_data

def iter_pages(self, page_span=None, **meta):
    """Yield one dict of page data for each page of this Hathi work.

    The page list comes from the structmap of the work's METS record and
    the page text is read directly out of the work's zip file, without
    unzipping it to disk.

    Args:
        page_span: optional container of 1-based page indices (anything
            supporting ``in``). When given and non-empty, only pages whose
            index is in it are yielded; when falsy, every page is yielded.
        **meta: accepted for call compatibility; not used here.

    Yields:
        dict with the keys ``source_id``, ``page_id``, ``page_i``,
        ``content``, ``order``, ``label``, ``tags`` and ``item_type``.
        Nothing is yielded when the pairtree object for the work is missing.
    """
    from zipfile import ZipFile
    from eulxml.xmlmap import load_xmlobject_from_file
    from ppa.archive.hathi import MinimalMETS
    from pairtree import storage_exceptions

    # Load the METS record; it lists the pages and their text files.
    try:
        mmets = load_xmlobject_from_file(self.metsfile_path(), MinimalMETS)
    except storage_exceptions.ObjectNotFoundException:
        # No pairtree data for this work: there is nothing to iterate over.
        return

    # Read zipfile contents in place, without unzipping.
    with ZipFile(self.zipfile_path()) as ht_zip:
        # Page numbers are 1-based positions in the METS structmap.
        for i, page in enumerate(mmets.structmap_pages, 1):
            # If a page range is defined, skip any pages outside it.
            if page_span and i not in page_span:
                continue

            # The zipfile spec uses "/" as path separator regardless of OS.
            pagefilename = f'{self.content_dir}/{page.text_file_location}'

            # Read and decode before yielding so the per-page file handle is
            # already closed while the generator is suspended.
            with ht_zip.open(pagefilename) as pagefile:
                content = pagefile.read().decode("utf-8")

            yield {
                "source_id": self.hathi_id,
                "page_id": page.text_file.sequence,
                "page_i": i,
                "content": content,
                "order": page.order,
                "label": page.display_label,
                "tags": page.label.split(", ") if page.label else [],
                "item_type": "page",
            }

                
# Attach the function above to HathiObject as a method (monkey-patch).
HathiObject.iter_pages = iter_pages

In [4]:
# Try it out: pick a random Hathi-backed work and load the page dicts
# produced by iter_pages into a DataFrame (one row per page).
work = random.choice(digiworks)
hathiobj = work.hathi
df_pages = pd.DataFrame(hathiobj.iter_pages())
df_pages

Unnamed: 0,source_id,page_id,page_i,content,order,label,tags,item_type
0,mdp.39015030930179,00000001,1,A history of English poetryWilliam John Courth...,1,1,"[FRONT_COVER, IMAGE_ON_PAGE, FRONT_COVER_IMAGE...",page
1,mdp.39015030930179,00000002,2,\nE\nHAIR BULUU\n1817\nARTES\nhamuMWILINIMIN\n...,2,2,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]",page
2,mdp.39015030930179,00000003,3,\n,3,3,[IMPLICIT_PAGE_NUMBER],page
3,mdp.39015030930179,00000004,4,\n,4,4,[IMPLICIT_PAGE_NUMBER],page
4,mdp.39015030930179,00000005,5,"\nA HISTORY\nOF\nGLISH POETRY\nBY\nCOURTHOPE, ...",5,5,"[TITLE, IMAGE_ON_PAGE, UNTYPICAL_PAGE, IMPLICI...",page
...,...,...,...,...,...,...,...,...
475,mdp.39015030930179,00000476,476,,476,476,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
476,mdp.39015030930179,00000477,477,\n,477,477,[IMPLICIT_PAGE_NUMBER],page
477,mdp.39015030930179,00000478,478,\nTHE UNIVERSITY OF MICHIGAN\nGRADUATE LIBRARY...,478,478,"[CHECKOUT_PAGE, UNTYPICAL_PAGE, IMPLICIT_PAGE_...",page
478,mdp.39015030930179,00000479,479,\nUNIVERSITY OF MICHIGAN\n3 9015 03093 0179\nB...,479,479,"[IMAGE_ON_PAGE, CHECKOUT_PAGE, UNTYPICAL_PAGE,...",page


### From pages to strings

In [5]:
def get_txt(self,page_sep='\n',line_sep='\r'):
    """
    Build the plain text of a HathiObject from its pages.

    The ``content`` of every page from ``iter_pages`` (empty string when the
    key is missing) is passed through ``standardize_newlines`` with
    ``correct=line_sep``, and the results are joined with ``page_sep``.
    """
    page_texts = []
    for page in self.iter_pages():
        raw_content = page.get('content', '')
        page_texts.append(standardize_newlines(raw_content, correct=line_sep))
    return page_sep.join(page_texts)

# Attach the function above to HathiObject as a method (monkey-patch).
HathiObject.get_txt = get_txt

In [6]:
def get_txt(self, *args, **kwargs):
    """
    Return the plain text of a DigitizedWork.

    Delegates to ``self.hathi.get_txt`` with all arguments forwarded.
    Raises NotImplementedError when the work has no Hathi object.
    """
    if not self.hathi:
        raise NotImplementedError
    return self.hathi.get_txt(*args, **kwargs)

# Attach the function above to DigitizedWork as a method (monkey-patch).
DigitizedWork.get_txt = get_txt

In [7]:
# Spot-check DigitizedWork.get_txt on a randomly chosen work
# (first 100 characters only).
work = random.choice(digiworks)
work.get_txt()[:100]

'\rᎢᎸᎬ ANᎸLYTIC ᎯᎠ PᎡᎯCTICAL GᎡᎯᏓᎯᏒ\rCONCISE MANUAL\rENGLISH GRAMMAR,\rARRANGED ON THE PRINCIPLE OF ANALY'

In [8]:
# Same slice taken directly from the work's Hathi object, for comparison
# with the previous cell.
work.hathi.get_txt()[:100]

'\rᎢᎸᎬ ANᎸLYTIC ᎯᎠ PᎡᎯCTICAL GᎡᎯᏓᎯᏒ\rCONCISE MANUAL\rENGLISH GRAMMAR,\rARRANGED ON THE PRINCIPLE OF ANALY'

## Tokenizing

In [9]:
# Quick look at how tokenize treats a dash and a symbol inside a string.
tokenize("This—iswefwe®qa test")

['this', 'iswefwe', 'qa', 'test']

In [10]:
def get_tokens(self):
    """Return the result of running ``tokenize`` on this work's plain text."""
    full_text = self.get_txt()
    return tokenize(full_text)

# Attach the function above to DigitizedWork as a method (monkey-patch).
DigitizedWork.get_tokens = get_tokens

In [11]:
# Preview the first ten tokens of the currently selected work.
work.get_tokens()[:10]

['ꭲꮈꭼ',
 'anꮈlytic',
 'ꭿꭰ',
 'pꭱꭿctical',
 'gꭱꭿꮣꭿꮢ',
 'concise',
 'manual',
 'english',
 'grammar',
 'arranged']

## Counting

In [12]:
def calc_counts(self):
    """Count this work's tokens.

    Returns a plain dict mapping token -> count, ordered from most to least
    frequent (the order produced by ``Counter.most_common``).
    """
    token_counter = Counter(self.get_tokens())
    ranked_pairs = token_counter.most_common()
    return dict(ranked_pairs)

# Attach the function above to DigitizedWork as a method (monkey-patch).
DigitizedWork.calc_counts = calc_counts

In [13]:
# Look up the count of a single word, falling back to 0 when it is absent.
work.calc_counts().get('ballad',0)

0

### Caching counts in the database

In [14]:
def get_or_create_counts(self):
    """Return this work's word counts, using WorkWordCounts as a cache.

    If a WorkWordCounts row already exists for the work, its ``data`` is
    returned. Otherwise the counts are computed with ``calc_counts`` and a
    best-effort attempt is made to store them before returning them.

    Returns:
        The stored ``data`` of the existing row, or the freshly computed
        counts when no row existed.
    """
    try:
        return WorkWordCounts.objects.get(work=self).data
    except WorkWordCounts.DoesNotExist:
        pass

    countd = self.calc_counts()
    try:
        # Look the row up by work only and pass the payload via ``defaults``.
        # Otherwise the whole counts mapping becomes part of the lookup query.
        WorkWordCounts.objects.get_or_create(work=self, defaults={"data": countd})
    except Exception as e:
        # Storing is best-effort: a failed save must not prevent the caller
        # from getting the computed counts.
        log.debug(e)

    return countd

def get_counts(self):
    """Return the (cached or computed) counts as a dict ordered by
    descending count, as produced by ``Counter.most_common``."""
    ranked_pairs = Counter(self.get_or_create_counts()).most_common()
    return {token: count for token, count in ranked_pairs}

def get_counts_total(self):
    """Return the total number of tokens in this work.

    This is the sum of all per-token counts from ``get_counts``; it is 0
    when there are no counts.
    """
    # Sum the counts (dict values). Iterating the dict itself would yield the
    # token strings and make ``sum`` raise TypeError.
    return sum(self.get_counts().values())

# Attach the three functions above to DigitizedWork as methods (monkey-patch).
DigitizedWork.get_or_create_counts = get_or_create_counts
DigitizedWork.get_counts = get_counts
DigitizedWork.get_counts_total = get_counts_total

In [15]:
# Select another random Hathi-backed work and display it.
work=random.choice(digiworks)
work

<DigitizedWork: mdp.39015004300599>

In [19]:
def cache_all(num_proc=8):
    """Run ``get_or_create_counts`` over every work in ``digiworks``.

    The work is handed to ``pmap_run``, which receives the function, the
    list of works and the requested number of processes.

    Args:
        num_proc: value forwarded to ``pmap_run`` as ``num_proc``
            (defaults to 8, the previously hard-coded value).
    """
    pmap_run(
        get_or_create_counts,
        digiworks,
        num_proc=num_proc,
    )

# cache_all()