# New NLP functions for HathiObject / DigitizedWork

In [1]:
%run ppa_init.ipynb

In [2]:
# Keep only the DigitizedWork objects whose `hathi` attribute is truthy.
# The filter runs in Python over everything returned by `objects.all()`.
digiworks=[t for t in DigitizedWork.objects.all() if t.hathi]
len(digiworks)

INFO:parasolr.django.solrclient:Connecting to default Solr http://localhost:8983/solr/ppanlp


5124

## Getting full text

### HathiTrust: via page data

In [3]:
# Adapted from hathi_page_index_data

def iter_pages(self, page_span=None, **meta):
    """Yield one dict of page data for each page of this Hathi volume.

    Adapted from ``hathi_page_index_data``. The volume's METS record is
    loaded to enumerate its pages, and each page's text is read directly
    out of the volume's zip file (in place, without unzipping).

    :param page_span: optional container of 1-based page positions. When
        truthy, pages whose position is not in it are skipped; when
        omitted or empty, every page is yielded.
    :param meta: accepted for interface compatibility; currently unused.
    :returns: generator of dicts with the keys ``source_id``, ``page_id``,
        ``page_i``, ``content``, ``order``, ``label``, ``tags`` and
        ``item_type``. Nothing is yielded when the pairtree object for
        this volume cannot be found.
    """

    from zipfile import ZipFile
    from eulxml.xmlmap import load_xmlobject_from_file
    from ppa.archive.hathi import MinimalMETS
    from pairtree import storage_exceptions

    # load mets record to pull metadata about the images
    try:
        mmets = load_xmlobject_from_file(self.metsfile_path(), MinimalMETS)
    except storage_exceptions.ObjectNotFoundException:
        # no pairtree data for this volume, so there is nothing to yield
        return

    # read zipfile contents in place, without unzipping
    with ZipFile(self.zipfile_path()) as ht_zip:
        # iterate over pages in METS structmap, numbering them from 1
        for i, page in enumerate(mmets.structmap_pages, 1):
            # if a page range is defined, skip any pages not in range
            if page_span and i not in page_span:
                continue

            # zipfile spec uses / for path regardless of OS
            pagefilename = f'{self.content_dir}/{page.text_file_location}'

            # Read and decode before yielding so the zip member handle is
            # already closed while the consumer holds the generator paused.
            with ht_zip.open(pagefilename) as pagefile:
                content = pagefile.read().decode("utf-8")

            yield {
                "source_id": self.hathi_id,
                "page_id": page.text_file.sequence,
                "page_i": i,
                "content": content,
                "order": page.order,
                "label": page.display_label,
                "tags": page.label.split(", ") if page.label else [],
                "item_type": "page",
            }

                
HathiObject.iter_pages = iter_pages

In [4]:
# Pick one work at random and tabulate its pages: one DataFrame row per dict
# yielded by iter_pages().
# NOTE(review): no random seed is set in the cells visible here, so the work
# displayed presumably changes from run to run — confirm in ppa_init.ipynb.
work = random.choice(digiworks)
hathiobj = work.hathi
df_pages = pd.DataFrame(hathiobj.iter_pages())
df_pages

Unnamed: 0,source_id,page_id,page_i,content,order,label,tags,item_type
0,hvd.hw235l,00000001,1,\nHW 235L 5\n,1,1,"[FRONT_COVER, IMAGE_ON_PAGE, IMPLICIT_PAGE_NUM...",page
1,hvd.hw235l,00000002,2,KE11226\n,2,2,"[UNTYPICAL_PAGE, IMPLICIT_PAGE_NUMBER]",page
2,hvd.hw235l,00000003,3,hot\n,3,3,[IMPLICIT_PAGE_NUMBER],page
3,hvd.hw235l,00000004,4,,4,4,[IMPLICIT_PAGE_NUMBER],page
4,hvd.hw235l,00000005,5,,5,1,[IMPLICIT_PAGE_NUMBER],page
...,...,...,...,...,...,...,...,...
365,hvd.hw235l,00000366,366,,366,366,[IMPLICIT_PAGE_NUMBER],page
366,hvd.hw235l,00000367,367,\n,367,367,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]",page
367,hvd.hw235l,00000368,368,,368,368,[IMPLICIT_PAGE_NUMBER],page
368,hvd.hw235l,00000369,369,HW 235L 5\n,369,369,"[IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER]",page


### From pages to strings

In [5]:
def get_txt(self,page_sep='\n',line_sep='\r'):
    """
    Return the whole HathiObject as a single plain-text string.

    Each page's ``content`` (empty string when the key is missing) is passed
    through ``standardize_newlines`` with ``correct=line_sep``, and the
    resulting page strings are joined with ``page_sep``.
    """
    page_texts = []
    for page in self.iter_pages():
        raw_content = page.get('content','')
        page_texts.append(standardize_newlines(raw_content, correct=line_sep))
    return page_sep.join(page_texts)

HathiObject.get_txt = get_txt

In [6]:
def get_txt(self, *args, **kwargs):
    """
    Return the plain text of a DigitizedWork.

    Delegates to ``self.hathi.get_txt`` (forwarding all arguments) when the
    work has a truthy ``hathi`` attribute; otherwise raises
    ``NotImplementedError``.
    """
    # Guard clause: only works backed by a Hathi object are supported.
    if not self.hathi:
        raise NotImplementedError
    return self.hathi.get_txt(*args, **kwargs)

DigitizedWork.get_txt = get_txt

In [7]:
work = random.choice(digiworks)
work.get_txt()[:100]

'\r\n- -\r-\r- -\r-\r-\r- - - -\r-\r-\r-\r\r\n\r\n\n\n**\r\n>\rIECTURES\ron THE\rENGLISH POETS,\rand the\rENGLISH COMIC WBITE'

In [8]:
work.hathi.get_txt()[:100]

'\r\n- -\r-\r- -\r-\r-\r- - - -\r-\r-\r-\r\r\n\r\n\n\n**\r\n>\rIECTURES\ron THE\rENGLISH POETS,\rand the\rENGLISH COMIC WBITE'

## Tokenizing

In [9]:
tokenize("This—iswefwe®qa test")

['this', 'iswefwe', 'qa', 'test']

In [10]:
def get_tokens(self):
    """Return the result of running ``tokenize`` over this work's full text."""
    full_text = self.get_txt()
    return tokenize(full_text)

DigitizedWork.get_tokens = get_tokens

In [11]:
work.get_tokens()[:10]

['iectures',
 'on',
 'the',
 'english',
 'poets',
 'and',
 'the',
 'english',
 'comic',
 'wbiters']

## Counting

In [12]:
def calc_counts(self):
    """Count this work's tokens and return a plain dict of token -> count.

    The dict is built from ``Counter.most_common()``, so its insertion order
    runs from the most frequent token to the least frequent.
    """
    tally = Counter(self.get_tokens())
    ranked = tally.most_common()
    return dict(ranked)

DigitizedWork.calc_counts = calc_counts

In [13]:
work.calc_counts().get('ballad',0)

12

### Involving the db

In [14]:
def get_or_create_counts(self):
    """Return this work's word counts, computing and storing them if absent.

    If a ``WorkWordCounts`` row already exists for the work, its ``data`` is
    returned. Otherwise the counts are computed with ``calc_counts()``, a
    best-effort attempt is made to store them, and the freshly computed
    counts are returned (even when storing fails).
    """
    try:
        return WorkWordCounts.objects.get(work=self).data
    except WorkWordCounts.DoesNotExist:
        countd = self.calc_counts()

    try:
        # Look the row up by work only; the counts go in ``defaults`` so they
        # are used when creating the row, not as part of the lookup filter.
        WorkWordCounts.objects.get_or_create(work=self, defaults={"data": countd})
    except Exception as e:
        # Saving is best-effort: log the failure and still return the counts.
        log.debug(e)

    return countd

def get_counts(self):
    """Return the work's word counts as a dict ordered by descending count.

    The counts come from ``get_or_create_counts()`` and are re-ordered with
    ``Counter.most_common()``.
    """
    tally = Counter(self.get_or_create_counts())
    ranked = tally.most_common()
    return dict(ranked)

def get_counts_total(self):
    """Return the total number of tokens in the work.

    This is the sum of every per-word count returned by ``get_counts()``.
    """
    # Sum the counts (the dict's values); iterating the dict itself would
    # yield the word strings and make sum() raise TypeError.
    return sum(self.get_counts().values())

DigitizedWork.get_or_create_counts = get_or_create_counts
DigitizedWork.get_counts = get_counts
DigitizedWork.get_counts_total = get_counts_total

In [15]:
work=random.choice(digiworks)
work

<DigitizedWork: uc1.$b276143>

In [16]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [17]:
# !pip uninstall IProgress -y

In [27]:
# Apply get_or_create_counts to every work in `digiworks`, passing num_proc=8.
# NOTE(review): the saved output of this cell is
# "NameError: name 'pmap_run' is not defined", so pmap_run has to be imported
# or defined before this cell can run — TODO confirm where it comes from.
# NOTE(review): this passes the module-level function get_or_create_counts
# (not the bound method), so each item of `digiworks` is its `self` argument.
pmap_run(
    get_or_create_counts,
    digiworks,
    num_proc=8,
)

NameError: name 'pmap_run' is not defined

## Querying for matches

In [32]:
class WordQuery:
    """Query stored word counts for one or more words.

    :param word: a single word to look up (ignored when empty).
    :param words: optional iterable of words to look up; ``word``, if given,
        is appended after them.
    """

    def __init__(self, word='', words=None):
        # Always build a fresh list: a shared default list (or the caller's
        # own list) must never be mutated, or words leak between queries.
        self.words = list(words) if words else []
        if word:
            self.words.append(word)

    def get_queryset(self):
        """Return the WorkWordCounts rows whose data has any of the words as a key."""
        return WorkWordCounts.objects.filter(data__has_any_keys=self.words)

    def get_works(self):
        """Return the list of works attached to the matching WorkWordCounts rows."""
        return [wwc.work for wwc in self.get_queryset()]

    def get_words(self):
        """Return the list of words being queried."""
        return self.words

    def get_counts(self):
        """Return a DataFrame with one row per (matching work, queried word).

        Columns: ``work`` (the work's id), ``word``, ``count`` (occurrences
        of the word in the work, 0 if absent), ``total`` (sum of all counts
        for the work) and ``fpm`` (frequency per million: count / total * 10**6,
        or 0.0 when the work's total is 0).
        """
        rows = []
        for work in tqdm(self.get_works()):
            countd = work.get_counts()
            total = sum(countd.values())
            for word in self.get_words():
                count = countd.get(word, 0)
                rows.append(dict(
                    work=work.id,
                    word=word,
                    count=count,
                    total=total,
                    # guard against a work whose counts sum to zero
                    fpm=count/total*10**6 if total else 0.0,
                ))
        return pd.DataFrame(rows)
    

In [33]:
# Build a query for the single word 'ballad' and show its per-work counts table.
wq = WordQuery('ballad')
# Intermediate steps, left here for inspection while debugging:
# wq.get_queryset()
# wq.get_works()
wq.get_counts()

100%|██████████| 9/9 [00:00<00:00, 75.88it/s]


Unnamed: 0,work,word,count,total,fpm
0,1133,ballad,1,88802,11.261008
1,4593,ballad,1,151318,6.608599
2,4665,ballad,2,78066,25.619348
3,1996,ballad,9,103884,86.635093
4,2058,ballad,3,198563,15.108555
5,1501,ballad,2,44455,44.989315
6,2504,ballad,2,136648,14.636145
7,1903,ballad,2,202143,9.893986
8,3199,ballad,2,233791,8.554649
