# New NLP functions for HathiObject / DigitizedWork

In [1]:
%run ppa_init.ipynb

In [2]:
# Keep only the digitized works whose `hathi` attribute is truthy.
digiworks = [digiwork for digiwork in DigitizedWork.objects.all() if digiwork.hathi]
len(digiworks)

INFO:parasolr.django.solrclient:Connecting to default Solr http://localhost:8983/solr/ppanlp


5124

## Getting full text

### Hathi Trust: via page data

In [3]:
# Adapted from hathi_page_index_data

def iter_pages(self, page_span=None, **meta):
    """Yield one dict of page data for each page of this Hathi volume.

    The METS record is loaded to enumerate the pages of the structmap,
    and each page's text is read straight out of the volume's zip file
    (no unzipping to disk).

    Args:
        page_span: optional container of 1-based page positions (anything
            supporting ``in``, e.g. a ``range``).  When given and
            non-empty, pages whose position is not in it are skipped.
            ``None`` or an empty container selects every page.
        **meta: accepted but currently unused.

    Yields:
        dict with the keys ``source_id``, ``page_id``, ``page_i``
        (1-based position in the structmap), ``content`` (page text
        decoded as UTF-8), ``order``, ``label``, ``tags`` (the page
        label split on ``", "``; empty list when there is no label) and
        ``item_type`` (always ``"page"``).

    If the METS file cannot be found in pairtree storage, nothing is
    yielded and no exception is raised.
    """

    from zipfile import ZipFile
    from eulxml.xmlmap import load_xmlobject_from_file
    from ppa.archive.hathi import MinimalMETS
    from pairtree import storage_exceptions

    # load mets record to pull metadata about the images
    try:
        mmets = load_xmlobject_from_file(self.metsfile_path(), MinimalMETS)
    except storage_exceptions.ObjectNotFoundException:
        # no pairtree data for this volume: produce an empty iteration
        return

    # read zipfile contents in place, without unzipping
    with ZipFile(self.zipfile_path()) as ht_zip:
        # iterate over pages in METS structmap, numbering them from 1
        for i, page in enumerate(mmets.structmap_pages, 1):
            # if a page range is defined, skip any pages not in range
            if page_span and i not in page_span:
                continue

            # zipfile spec uses / for path regardless of OS
            pagefilename = f'{self.content_dir}/{page.text_file_location}'

            # Read and close the zip member *before* yielding, so that no
            # member handle is left open while the generator is suspended.
            with ht_zip.open(pagefilename) as pagefile:
                content = pagefile.read().decode("utf-8")

            yield {
                "source_id": self.hathi_id,
                "page_id": page.text_file.sequence,
                "page_i": i,
                "content": content,
                "order": page.order,
                "label": page.display_label,
                "tags": page.label.split(", ") if page.label else [],
                "item_type": "page",
            }

                
HathiObject.iter_pages = iter_pages

In [5]:
work = random.choice(digiworks)
hathiobj = work.hathi
work.title, work.author, work.pub_date

('Sapphics', 'Swinburne, Algernon Charles.', 1866)

In [6]:
df_pages = pd.DataFrame(hathiobj.iter_pages())
df_pages

Unnamed: 0,source_id,page_id,page_i,content,order,label,tags,item_type
0,uiug.30112001676896,00000001,1,\n,1,1,"[FRONT_COVER, IMAGE_ON_PAGE, IMPLICIT_PAGE_NUM...",page
1,uiug.30112001676896,00000002,2,\n,2,2,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]",page
2,uiug.30112001676896,00000003,3,\n,3,3,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]",page
3,uiug.30112001676896,00000004,4,LIBRARY OF THE\nUNIVERSITY OF ILLINOIS\nAT URB...,4,4,"[UNTYPICAL_PAGE, IMPLICIT_PAGE_NUMBER]",page
4,uiug.30112001676896,00000005,5,The person charging this material is re-\nspon...,5,5,"[IMAGE_ON_PAGE, UNTYPICAL_PAGE, IMPLICIT_PAGE_...",page
...,...,...,...,...,...,...,...,...
365,uiug.30112001676896,00000366,366,344 DEDICATION.\nEre time that breaks all men ...,366,344,[UNTYPICAL_PAGE],page
366,uiug.30112001676896,00000367,367,,367,345,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
367,uiug.30112001676896,00000368,368,,368,346,"[BLANK, IMPLICIT_PAGE_NUMBER]",page
368,uiug.30112001676896,00000369,369,,369,347,"[BLANK, IMPLICIT_PAGE_NUMBER]",page


In [7]:
df_pages.sample(n=10)

Unnamed: 0,source_id,page_id,page_i,content,order,label,tags,item_type
101,uiug.30112001676896,102,102,80 HYMN TO PROSERPINE.\nThough all men abase t...,102,80,[],page
25,uiug.30112001676896,26,26,4 A BALLAD OF LIFE.\nAnd bosom carved to kiss....,26,4,[UNTYPICAL_PAGE],page
318,uiug.30112001676896,319,319,THE TWO DREAMS. 297\n*-*-*-\nHer body's balanc...,319,297,[IMPLICIT_PAGE_NUMBER],page
352,uiug.30112001676896,353,353,THE BLOODY SON. 331\n“And what will ye leave y...,353,331,[UNTYPICAL_PAGE],page
167,uiug.30112001676896,168,168,146 A BALL AD OF BURDENS.\nThy times and ways ...,168,146,[UNTYPICAL_PAGE],page
323,uiug.30112001676896,324,324,302 THE TWO DREAMS.\nThen a cool naked sense b...,324,302,[],page
291,uiug.30112001676896,292,292,"270\nTHE MASQUE OF QUEEN BERSABE.\nBehold, Lor...",292,270,[],page
87,uiug.30112001676896,88,88,66 ANACTORIA.\n------\nI charge thee keep thy ...,88,66,[IMPLICIT_PAGE_NUMBER],page
347,uiug.30112001676896,348,348,326\nAFTER DEATH.\nHave they boiled my maid in...,348,326,[UNTYPICAL_PAGE],page
140,uiug.30112001676896,141,141,"A MATCH.\nIF love were what the rose is,\nAnd ...",141,119,"[UNTYPICAL_PAGE, IMPLICIT_PAGE_NUMBER]",page


### From pages to strings

In [8]:
def get_txt_hathi(self, page_sep='\n', line_sep='\r'):
    """Return the volume's full text as a single string.

    The ``content`` of every page dict produced by ``self.iter_pages()``
    is collected in order (a page dict without that key contributes an
    empty string) and the pieces are joined with ``page_sep``.

    ``line_sep`` is accepted but is not used by this function.
    """
    page_texts = []
    for page in self.iter_pages():
        page_texts.append(page.get('content', ''))
    return page_sep.join(page_texts)

HathiObject.get_txt = get_txt_hathi

In [9]:
def get_txt_digiwork(self, *args, **kwargs):
    """Return the full text of this work by delegating to its Hathi object.

    All positional and keyword arguments are forwarded unchanged to
    ``self.hathi.get_txt``.

    Raises:
        NotImplementedError: when ``self.hathi`` is falsy, i.e. there is
            no Hathi object to read text from.
    """
    if not self.hathi:
        raise NotImplementedError
    return self.hathi.get_txt(*args, **kwargs)

DigitizedWork.get_txt = get_txt_digiwork

In [10]:
# print(work.get_txt())

In [11]:
# work.hathi.get_txt()[:100]

## Tokenizing

In [12]:
tokenize("This—iswefwe®qa test")

['this', 'iswefwe', 'qa', 'test']

In [13]:
tokenize("""most common and the most general, yet upon examination I find this

Part 1.)
EMOTIONS AND PASSIONS.
27
single part so extensive, as to require a subdivision into several sec.
tions. Human nature is a complicated machine, and is unavoidably
so, in order to answer its various purposes. The public indeed have
been entertained with many systems of human nature that flatter the
mind by their simplicity. According to some writers, man is entirely
""")

['most',
 'common',
 'and',
 'the',
 'most',
 'general',
 'yet',
 'upon',
 'examination',
 'i',
 'find',
 'this',
 'part',
 'emotions',
 'and',
 'passions',
 'single',
 'part',
 'so',
 'extensive',
 'as',
 'to',
 'require',
 'a',
 'subdivision',
 'into',
 'several',
 'sec',
 'tions',
 'human',
 'nature',
 'is',
 'a',
 'complicated',
 'machine',
 'and',
 'is',
 'unavoidably',
 'so',
 'in',
 'order',
 'to',
 'answer',
 'its',
 'various',
 'purposes',
 'the',
 'public',
 'indeed',
 'have',
 'been',
 'entertained',
 'with',
 'many',
 'systems',
 'of',
 'human',
 'nature',
 'that',
 'flatter',
 'the',
 'mind',
 'by',
 'their',
 'simplicity',
 'according',
 'to',
 'some',
 'writers',
 'man',
 'is',
 'entirely']

In [40]:
def get_tokens(self):
    """Tokenize the full text of this work.

    Fetches the text with ``self.get_txt()`` and returns whatever the
    notebook's ``tokenize`` helper produces for it.
    """
    full_text = self.get_txt()
    return tokenize(full_text)

DigitizedWork.get_tokens = get_tokens

In [41]:
work.get_tokens()[:5]

['andover', 'harvard', 'library', 'ah', 's']

## Counting

In [42]:
def calc_counts(self):
    """Count how often each token occurs in this work.

    Returns:
        A plain ``dict`` mapping token -> number of occurrences, with
        keys inserted in ``Counter.most_common()`` order (most frequent
        first).
    """
    token_freqs = Counter(self.get_tokens())
    ranked = token_freqs.most_common()
    return dict(ranked)

DigitizedWork.calc_counts = calc_counts

In [47]:
work.calc_counts().get('ballad',0)

3

In [50]:
work.calc_counts().get('personification',0)

48

In [45]:
work.title, work.author, work.pub_date

('Elements of criticism', 'Kames, Henry Home, Lord, 1696-1782', 1842)