# New nlpfuncs for hathi/digitizedwork

In [5]:
# !pip install pandas
# NOTE(review): names used below without a visible import in this notebook
# (DigitizedWork, HathiObject, pd, random, Counter, tokenize,
# standardize_newlines, storage_exceptions) presumably come from this %run —
# confirm in ppa_init.ipynb.
%run ppa_init.ipynb

In [6]:
# Keep only the digitized works whose `.hathi` attribute is truthy,
# then show how many there are.
digiworks = [
    candidate
    for candidate in DigitizedWork.objects.all()
    if candidate.hathi
]
len(digiworks)

5124

## Getting full text

### Hathi Trust: via page data

In [15]:
# Adapted from hathi_page_index_data

def iter_pages(self, page_span=None, **meta):
    """Lazily yield one dict per page of this Hathi volume's text.

    The METS file at ``self.metsfile_path()`` is loaded to enumerate the
    pages, and each page's text is read straight out of the zip archive at
    ``self.zipfile_path()`` without extracting it.

    Parameters
    ----------
    page_span : container of int, optional
        When truthy, only pages whose 1-based position in the METS
        structmap is ``in page_span`` are yielded.
    **meta
        Accepted but not used by this function.

    Yields
    ------
    dict
        Keys, in order: ``source_id``, ``page_id``, ``page_i``, ``content``,
        ``order``, ``label``, ``tags``, ``item_type``.

    Notes
    -----
    If loading the METS file raises
    ``storage_exceptions.ObjectNotFoundException`` the generator finishes
    without yielding anything.  ``storage_exceptions`` is not imported here
    and must already exist in the notebook's global namespace.
    """
    from zipfile import ZipFile
    from eulxml.xmlmap import load_xmlobject_from_file
    from ppa.archive.hathi import MinimalMETS

    # The METS record supplies the ordered list of pages and their metadata.
    try:
        mets_record = load_xmlobject_from_file(self.metsfile_path(), MinimalMETS)
    except storage_exceptions.ObjectNotFoundException:
        # No METS data available for this volume: produce no pages.
        return

    # Page text is read in place from the archive; nothing is unzipped to disk.
    with ZipFile(self.zipfile_path()) as archive:
        for position, mets_page in enumerate(mets_record.structmap_pages, 1):
            # With a page span given, only positions inside it are emitted.
            in_scope = not page_span or position in page_span
            if in_scope:
                # Zip member names always use "/" regardless of OS.
                member_name = f'{self.content_dir}/{mets_page.text_file_location}'

                with archive.open(member_name) as member:
                    try:
                        record = {
                            "source_id": self.hathi_id,
                            "page_id": mets_page.text_file.sequence,
                            "page_i": position,
                            "content": member.read().decode("utf-8"),
                            "order": mets_page.order,
                            "label": mets_page.display_label,
                            "tags": mets_page.label.split(", ") if mets_page.label else [],
                            "item_type": "page",
                        }
                        yield record
                    except StopIteration:
                        # Kept from the code this was adapted from: end the
                        # generator quietly if StopIteration surfaces here.
                        return

                
# Monkey-patch: expose the function defined in this cell as HathiObject.iter_pages.
HathiObject.iter_pages = iter_pages

In [16]:
# Pick an arbitrary work. No seed is set in this notebook's visible cells, so a
# re-run will generally select a different volume than the output shown below.
work = random.choice(digiworks)
hathiobj = work.hathi
# Consume the page generator into a DataFrame: one row per yielded page dict.
df_pages = pd.DataFrame(hathiobj.iter_pages())
df_pages

Unnamed: 0,source_id,page_id,page_i,content,order,label,tags,item_type
0,hvd.hnqbsv,00000001,1,\nWIDENER\nHN QBSV 7\nDAY BOOK\nOT SUBJECT TO\...,1,1,"[FRONT_COVER, IMAGE_ON_PAGE, UNTYPICAL_PAGE, I...",page
1,hvd.hnqbsv,00000002,2,KSF 1367\nPris\nChi. 1 .3\n\nADEA\nVET\nHRIST\...,2,2,"[IMAGE_ON_PAGE, UNTYPICAL_PAGE, IMPLICIT_PAGE_...",page
2,hvd.hnqbsv,00000003,3,,3,3,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
3,hvd.hnqbsv,00000004,4,,4,4,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
4,hvd.hnqbsv,00000005,5,,5,5,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
...,...,...,...,...,...,...,...,...
679,hvd.hnqbsv,00000680,680,,680,680,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
680,hvd.hnqbsv,00000681,681,,681,681,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
681,hvd.hnqbsv,00000682,682,"j --""V\n3""'\n*t.'\n",682,682,[IMPLICIT_PAGE_NUMBER],page
682,hvd.hnqbsv,00000683,683,This book should be returned to\nthe Library o...,683,683,"[IMAGE_ON_PAGE, UNTYPICAL_PAGE, IMPLICIT_PAGE_...",page


### From pages to strings

In [17]:
def get_txt(self,page_sep='\n',line_sep='\r'):
    """
    Return the whole volume as a single string.

    Every page dict produced by ``self.iter_pages()`` contributes its
    ``'content'`` value (or ``''`` when the key is absent), which is passed
    through ``standardize_newlines(..., correct=line_sep)``.  The per-page
    results are then joined with ``page_sep``.

    NOTE(review): ``line_sep`` is presumably the character that line breaks
    are normalised to — confirm against ``standardize_newlines``.
    """
    page_texts = []
    for page in self.iter_pages():
        raw_text = page.get('content','')
        page_texts.append(standardize_newlines(raw_text, correct=line_sep))
    return page_sep.join(page_texts)

# Monkey-patch: expose the function defined in this cell as HathiObject.get_txt.
HathiObject.get_txt = get_txt

In [18]:
def get_txt(self, *args, **kwargs):
    """
    Return the plain text of this DigitizedWork.

    All positional and keyword arguments are forwarded unchanged to
    ``self.hathi.get_txt``.

    Raises
    ------
    NotImplementedError
        When ``self.hathi`` is falsy; no other text source is handled here.
    """
    # Guard first: without a Hathi object there is nothing to delegate to.
    if not self.hathi:
        raise NotImplementedError
    return self.hathi.get_txt(*args, **kwargs)

# Monkey-patch: expose the function defined in this cell as DigitizedWork.get_txt.
# The module-level name `get_txt` was rebound by this cell's def; the earlier
# assignment to HathiObject.get_txt still points at the earlier function object.
DigitizedWork.get_txt = get_txt

In [19]:
# Draw another random work (this rebinds `work`, which later cells reuse)
# and preview the first 100 characters of its text.
work = random.choice(digiworks)
work.get_txt()[:100]

'\rNYPL RESEARCH LIBRARIES\r3 3433 08252818 7\r\n\r\nWAN\rProter\r\n---\r\ni\r\n\n25\rNANV\rPorker\r\n\n\n\r.HAO\rPROPER PO'

In [20]:
# Same preview, called on the Hathi object directly; DigitizedWork.get_txt
# delegates to this method, so the output matches the previous cell.
work.hathi.get_txt()[:100]

'\rNYPL RESEARCH LIBRARIES\r3 3433 08252818 7\r\n\r\nWAN\rProter\r\n---\r\ni\r\n\n25\rNANV\rPorker\r\n\n\n\r.HAO\rPROPER PO'

## Tokenizing

In [21]:
# Quick check of tokenize: in the output below the dash, the ® sign and the
# space all act as token boundaries, and tokens come back lower-cased.
tokenize("This—iswefwe®qa test")

['this', 'iswefwe', 'qa', 'test']

In [26]:
def get_tokens(self):
    """Return the result of running ``tokenize`` over ``self.get_txt()``."""
    full_text = self.get_txt()
    return tokenize(full_text)

# Monkey-patch: expose the function defined in this cell as DigitizedWork.get_tokens.
DigitizedWork.get_tokens = get_tokens

In [29]:
# Preview the first ten tokens of the work selected above.
work.get_tokens()[:10]

['nypl',
 'research',
 'libraries',
 'wan',
 'proter',
 'i',
 'nanv',
 'porker',
 'hao',
 'proper']

## Counting

In [38]:
def get_counts(self):
    """Return a plain ``dict`` mapping each token to its number of occurrences.

    Entries are inserted in the order given by ``Counter.most_common()``,
    i.e. from the most frequent token to the least frequent.
    """
    tallies = Counter(self.get_tokens())
    return {token: occurrences for token, occurrences in tallies.most_common()}

# Monkey-patch: expose the function defined in this cell as DigitizedWork.get_counts.
DigitizedWork.get_counts = get_counts

In [None]:
%%timeit
# Each timed run rebuilds the full count dict (get_txt -> tokenize -> Counter)
# before looking up a single word.
work.get_counts().get('ballad')