## INSTALLATION NOTES
LINUX

`sudo apt install tesseract-ocr`

MAC - Requires Homebrew

`brew install tesseract`

### Not all these are necessary for the following code.

In [1]:
#! /bin/python3

import os, os.path
import spacy
from spacy.lang.en import English
import en_core_web_lg
import json, string
import datetime
import warnings
import re
import sqlitedict
from sqlitedict import SqliteDict
import math
warnings.filterwarnings('ignore')

nlp = en_core_web_lg.load()

for word in nlp.Defaults.stop_words:
    lex = nlp.vocab[word]
    lex.is_stop = True

#### *SPECIAL FUNCTION TO CHECK WORD PROPERTIES*

In [2]:
def check_words(doc):
    '''
    Helper function to check the properties of all words 
    in the passed portion of text. For data exploration.
    '''
    for sent in doc.sents:
        for word in sent:
            if word.like_url:
                word.tag_, word.dep_ = 'URL', 'URL' # this
#             if word.dep_ != 'punct' and word.pos_ != 'SPACE':
            print('"{}"\n LEMMA: {}\n POS: {}\n TAG: {}\n DEP: {}\n STOP: {}\n'.format(word,
                                                                                word.lemma_,
                                                                                word.pos_, 
                                                                                word.tag_, 
                                                                                word.dep_,
                                                                                word.is_stop))

In [3]:
def count_words(doc):
    '''
    Returns a list of tuples including words of interest and their
    count in the document. Using stop words to narrow search.
    '''
    counts = {}
    restricted = {'punct', 'PUNCT', 'PART'}
    for word in doc:
        if word.is_stop is False and word.pos_ not in restricted:
            if str(word) not in counts:
                counts[str(word)] = 1
            else:
                counts[str(word)] += 1
    return sorted([(v, k) for (k, v) in counts.items()], reverse=True)

In [4]:
def count_entities(doc):
    '''
    Returns a list of tuples including entities and their
    count in the document.
    '''
    counts = {}
    targets = {'NOUN', 'PROPN'}
    restricted = {'punct', 'PUNCT', 'PART'}
    for ent in doc.ents:
        # all named entities are of interest
        if str(ent) in counts:
            counts[str(ent)] += 1
        else:
            counts[str(ent)] = 1
    for word in doc:
        if word.pos_ in targets and word.pos_ not in restricted:
            if str(word) in counts:
                counts[str(word)] += 1
            else:
                counts[str(word)] = 1
    return sorted([(v, k) for (k, v) in counts.items()], reverse=True)

## FUNCTIONS TO EXTRACT TEXT FROM SOURCE DOCUMENTS
### Unnecessary since documents will be passed as string from front end

### TXT

In [5]:
def remove_non_ascii(text):
    text = ''.join([i if ord(i) < 128 else ' ' for i in text])
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    text = re.sub(' . ', '', text)
    return text

def open_txt(path):
    # Note the encoding here. Needed to remove BOM.
    with open(path, 'r', encoding='utf-8-sig', errors='replace') as txt:
        text = txt.read()
        # spacy did not separate words with a \n between
        # strange punctuation character (‘) breaking spacy
        text = remove_non_ascii(text)
        text = text.strip()
    return text

## Functions

In [6]:
def match_acronyms(doc):
    acronyms = {}
    chunks = set()
    matches = {}
    for word in doc:
        if word.is_upper is True:
            acronyms[str(word)] = word.text.lower()
    for chunk in doc.noun_chunks:
        text = chunk.text.lower()
        text = re.sub('[^A-Za-z0-9 -]+', '', text)
        words = text.split()
        chunks.add((text, ''.join([word[0] for word in words])))
    for i, j in acronyms.items():
        for k, l in chunks:
            if j in l:
                matches[j.upper()] = k
    return matches

In [7]:
def concepts(paragraph):
    '''
    SCHEMA CERTIFIED
    Returns a list of dictionary objects containing an id number and the concept phrase.
    '''
    par = nlp(paragraph)
    concepts = set()
    
    temp = set()
    
    watch = {'NOUN', 'PROPN'}
    
    for ent in par.ents:
        temp.add(ent.text)
    for noun in par:
        # make sure noun is a noun
        if not noun.like_url and not noun.like_email and noun.pos_ in watch and noun.pos_ != 'PRON':
            temp.add(noun.text)
    for chunk in par.noun_chunks:
        for word in temp:
            if word in chunk.text:
                text = re.sub('[^A-Za-z0-9 -]+', '', chunk.text)
                # manually modify to catch more mundane concepts
                if len(text.split()) > 3:
                    # ensure each concept is unique
                    concepts.add(text.lower())
                    
    # catches and adds acronym full meanings based on local context
    matches = match_acronyms(par)
    for key, value in matches.items():
        concepts.add(value)
    # convert to list of dictionary objects per schema
    
    output = []
    idx = 1
    for con in concepts:
        text = re.sub('a ', '', con)
        text = re.sub('the ', '', text)
        text = re.sub('an ', '', text)
        text = re.sub('this ', '', text)
        text = re.sub('that ', '', text)
        # different from schema
        # has to be done this way because keys must be unique
        output.append({'id_{}'.format(idx): text})
        idx += 1
    concepts = output
    
    return concepts

In [8]:
def sentence(paragraph):
    sentences = []
    par = nlp(paragraph)
    idx = 1
    for sent in par.sents:
        # my prefered way
        if len(sent.text) > 5: # crude filter to eliminate non-sentences
            sentences.append({'id_{}'.format(idx): sent.text, 'vector': []})
        idx += 1
    return sentences

In [9]:
def set_paragraph_boundary(doc):
    '''
    Rules based paragraph delimiter. Helper function for SpaCy pipeline.
    This will truncate paragraphs in which a hard new line has been used
    in the middle of text.
    '''
    start = 0
    seen_newline = False
    # this is to make sure crazy white space separations are caught
    targets = '\n\n\n\n\n\n\n\n\n\n'
    for word in doc:
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text in targets:
            seen_newline = True
    if start < len(doc):
        yield doc[start:len(doc)]

def paragraphs(text):
    '''
    Resolves document down to paragraphs of substance just based on punctuation
    and white space. No model so it is very fast.
    '''
    parags = English()
    sbd = spacy.pipeline.SentenceSegmenter(parags.vocab, strategy=set_paragraph_boundary)
    parags.add_pipe(sbd)
    pars = parags(text)
    paragraphs = [' '.join(sent.text.split()).strip() for sent in pars.sents]
    
    # this checks to see if paragraph is actually a paragraph
    for par in paragraphs.copy():
        if len(par.split('. ')) < 3: # this is only a check
            paragraphs.remove(par) # this removes the string if it is not a paragraph
    
    output = []
    for par in paragraphs:
        par = nlp(par)
        temp = []
        for sent in par.sents:
            if len(sent.text) > 5:
                temp.append(sent.text)
        output.append(' '.join(temp))
    return output

In [10]:
def knowledge(text):
    '''
    Assembly function to combine concepts and sentences using the
    paragraph function.
    '''
    knowledge = []
    idx = 1
    for par in paragraphs(text):
        knowledge.append({
            # I do not recommend keeping both paragraph and sentence text
            'id_{}'.format(idx): 'snippet',
            'concepts': concepts(par),
            'elements': {
                'id': 'paragraph_{}'.format(idx),
                'label': [chunk.text for chunk in nlp(par).noun_chunks][0].upper(),
                'sentences': sentence(par)
                
            },
            'weighted_queries': {},
            'snippet_vector': []
        })
        idx += 1
    return knowledge

In [11]:
def schema(text, path=''):
    '''
    Final schema assembly function.
    '''
    doc = nlp(text)
    schema = {}
    # this makes the assumption that the first chunk in the document will
    # contain the title concept of the entire document.This may not be true.
    schema.update({'id': [chunk.text for chunk in doc.noun_chunks][0].upper()})
    schema.update({'url': path})
    schema.update({'knowledge': knowledge(text)})
    return schema

## Batch Schema Encoding

In [18]:
def encode_directory(path):
    db = SqliteDict('./data/knowledge.sqlite', autocommit=True)
    textfiles = [path+f for f in os.listdir(path)]
    for file in textfiles:
        print(len(open_txt(file)), file)
        db.update({file: schema(open_txt(file), file)})

In [19]:
encode_directory('data/txt/')

176283 data/txt/1409.2544v1.pdf.txt
20773 data/txt/0505016v1.pdf.txt
26917 data/txt/1612.00712v1.pdf.txt
40825 data/txt/0608073v1.pdf.txt
29413 data/txt/0504056v1.pdf.txt
31183 data/txt/1404.5997v2.pdf.txt
32173 data/txt/1605.07333v1.pdf.txt
16170 data/txt/1009.4495v1.pdf.txt
49503 data/txt/1701.05549v1.pdf.txt


In [20]:
with SqliteDict('./data/knowledge.sqlite') as db:
    print('Library has {} items.\n'.format(len(db)))
    for i,j in db.items():
        print(j)

Library has 9 items.

{'id': 'THEEURALING', 'url': 'data/txt/1409.2544v1.pdf.txt', 'knowledge': [{'id_1': 'snippet', 'concepts': [], 'elements': {'id': 'paragraph_1', 'label': 'EXISTENCEFHISELATIONNDICATESHATUPPHILEHEREOESXISTNUCHHATHISONTRADICTSHESSUMPTIONHATUPPSIMPLICIALOMPLEX.EONCLUDEHATCASOYPEELATIONS', 'sentences': [{'id_1': 'existencefhiselationndicateshatupphilehereoesxistnuchhathisontradictshessumptionhatuppsimplicialomplex.eoncludehatCasoypeelations.', 'vector': []}, {'id_2': '40heanonicalormfChusnablessommediatelyeadff,iaheypeelations,heinimalorbiddenacesfheimplicialomplexCssociatedoheode,ndlsoheinimaleviationsfromeingimplicialomplex,hichreapturedyheypendypeelations.', 'vector': []}]}, 'weighted_queries': {}, 'snippet_vector': []}, {'id_2': 'snippet', 'concepts': [], 'elements': {'id': 'paragraph_2', 'label': "[1].'KEEFEND.OSTROVSKY.HEIPPOCAMPUSSPATIALAP.RELIMINARYVIDENCEROMNITCTIVITYNHEREELY-MOVINGAT.RAINESEARCH,4(1):171175,971.2]..CNAUGHTON,..ATTAGLIA,.ENSEN,..OSER", 'sente

## TF-IDF Functions

In [None]:
text = open_txt('sample_doc.txt')
text

In [None]:
jsn = schema(text, '')

In [None]:
doc = json.dumps(jsn).lower()
doc = re.sub('"|\{|\}|\[|\]|\(|\)|:|,|\.|^\s+|\d+|\@\w+', '', doc)
doc = re.sub('\s+', ' ', doc).strip()
doc = nlp(doc)
L = [w for w in doc if (not w.is_stop and len(w.text) > 1)]

In [None]:
def normalize_json_nlp(jsn):
    doc = json.dumps(jsn).lower()
    doc = re.sub('"|\{|\}|\[|\]|\(|\)|:|,|\.|^\s+|\d+|\@\w+', '', doc)
    doc = re.sub('\s+', ' ', doc).strip()
    doc = nlp(doc)
    return [str(w) for w in doc if (not w.is_stop and len(w.text) > 1)]

In [None]:
def normalize_json_fast(jsn):
    doc = json.dumps(jsn).lower()
    doc = re.sub('"|\{|\}|\[|\]|\(|\)|:|,|\.|^\s+|\d+|\@\w+', '', doc)
    doc = re.sub('\s+', ' ', doc).strip()
    return doc.split()

In [None]:
def term_frequency(query, doc):
    normalized_doc = normalize_json(doc)
    return normalized_doc.count(query.lower()) / float(len(normalized_doc))

In [None]:
term_frequency('networks', text)

In [None]:
def tf_idf(query, text):
    query = ' '.join([t.lemma_ for t in nlp(query)])
    hits = 0
    with SqliteDict('./data/knowledge.sqlite') as db:
        for doc in db:
            if query in normalize_json_fast(json.dumps(db[doc])):
                hits += 1
        if hits > 0:
            return (1.0 + math.log(float(len(db))) / hits) * term_frequency(query, text)
        else:
            return 1.0 * term_frequency(query, doc)

In [None]:
tf_idf('neural', text)

In [None]:
def query(term):
    L = []
    with SqliteDict('./data/knowledge.sqlite') as db:
        for doc in db:
            L.append((tf_idf(term, normalize_json_fast(json.dumps(db[doc]))),
                      db[doc]['@id'],
                      db[doc]['@url']))
    return sorted(L, reverse=True)

In [None]:
query('neural')

# Note that while this works its pretty slow on only 9 documents. It needs to be able to search the entire database of 5000 documents. I plan on using a combination of binary search to find those documents the term exists in and add them to a list, then do the tf-idf analysis with arrays to vectorize the computation.