Created on January 3rd 2021 by Patrick Rotzetter

https://www.linkedin.com/in/rotzetter/

# Small experiment of document mining with various techniques Part 4

This notebook will check elasticsearch capabilities for our given use case. The notebook is largely inspired by the book http://www.practicalnlp.ai/
This notebook is only going to work if an instance of elasticsearch is reachable at localhost:9200 ( or you can change the parameters in cell 2 below) 

## Load the files

In [1]:
# Import require libraries
import numpy as np
import texthero as hero
import pdftotext
import pandas as pd
from elasticsearch import Elasticsearch 
from datetime import datetime

In [3]:

#elastic search instance has to be running on the machine. Default port is 9200. 

#Call the Elastic Search instance, and delete any pre-existing index
es=Elasticsearch([{'host':'localhost','port':9200}])
if es.indices.exists(index="myindex"):
    es.indices.delete(index='myindex', ignore=[400, 404]) #Deleting existing index for now



In [4]:
# function to read PDF files using pdftotext
def readPdfFile(filename):
    text=""
    with open(filename, "rb") as f:
        pdf = pdftotext.PDF(f)
        for page in pdf:
            text=text+page
    return text

In [5]:
# function to read PPT files
def readPPTFile(filename):
    text=""  
    prs = Presentation(filename)
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text=text+shape.text
    text=remove_special_characters(text)
    return text

In [6]:
#path of first input test file
path='./sampledocs/'

In [7]:
# let us scan the full directory, read PDF and PPT documents, clean them and process them with spacy

docName=[]
docType=[]
docText=[]
docNLP=[]
import glob
list_of_files = glob.glob(path+'*.pdf')           # create the list of file
fileNames=[]
for file_name in list_of_files:
    fileText=readPdfFile(file_name)
    docName.append(file_name)
    docType.append('pdf')
    docText.append(fileText)
list_of_files = glob.glob(path+'*.pptx')           # create the list of file
for file_name in list_of_files:
    fileText=readPPTFile(file_name)
    docName.append(file_name)
    docType.append('ppt')
    docText.append(fileText)
fullDocs = pd.DataFrame({'Name':docName,'Type':docType,'Text':docText})
fullDocs['cleanText']=hero.clean(fullDocs['Text'])
#fullDocs['NLP']=fullDocs['cleanText'].apply(processDoc)

In [8]:
 print ("Average length of text:" + str((np.mean(fullDocs['Text'].str.len()))))
 print ("Min length of text:" + str((np.min(fullDocs['Text'].str.len()))))
 print ("Max length of text:" + str((np.max(fullDocs['Text'].str.len()))))

Average length of text:197501.375
Min length of text:17987
Max length of text:464271


In [9]:
fullDocs['text_word_count'] = fullDocs['Text'].apply(lambda x: len(x.strip().split()))  # word count
fullDocs['text_unique_words']=fullDocs['Text'].apply(lambda x:len(set(str(x).split())))  # number of unique words
fullDocs.head()

Unnamed: 0,Name,Type,Text,cleanText,text_word_count,text_unique_words
0,./sampledocs/Module-1-Lecture-Slides.pdf,pdf,"Application of AI, Insurtech and Real Estate\n...",application ai insurtech real estate technolog...,3732,1509
1,./sampledocs/Technology-and-innovation-in-the-...,pdf,Technology and\ninnovation in the\ninsurance s...,technology innovation insurance sector technol...,16763,4237
2,./sampledocs/sigma-5-2020-en.pdf,pdf,No 5 /2020\n\n\n\n\n...,machine intelligence executive summary machine...,14512,4342
3,./sampledocs/Issues_Paper_on_Increasing_Digita...,pdf,Issues Paper on Increasing Digitalisatio...,issues paper increasing digitalisation insuran...,15390,3685
4,./sampledocs/Digital-disruption-in-Insurance.pdf,pdf,Digital disruption\nin insurance:\nCutting thr...,digital disruption insurance cutting noise con...,34531,7067


In [10]:
fullDocs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               8 non-null      object
 1   Type               8 non-null      object
 2   Text               8 non-null      object
 3   cleanText          8 non-null      object
 4   text_word_count    8 non-null      int64 
 5   text_unique_words  8 non-null      int64 
dtypes: int64(2), object(4)
memory usage: 512.0+ bytes


In [11]:
fullDocs.describe()

Unnamed: 0,text_word_count,text_unique_words
count,8.0,8.0
mean,19507.5,4453.5
std,15720.754226,2531.36073
min,2502.0,1006.0
25%,11817.0,3141.0
50%,16076.5,4289.5
75%,22771.0,5756.75
max,49779.0,8462.0


## Elasticsearch

In [12]:
#Build an index from booksummaries dataset. I am using only 500 documents for now.
for index, row in fullDocs.iterrows():
    doc = {'id' : index,
            'name': row['Name'],
            'text': row['Text']
          }

    res = es.index(index="myindex", id=index, body=doc)


In [13]:
#Check to see how big is the index
res = es.search(index="myindex", body={"query": {"match_all": {}}})
print("Your index has %d entries" % res['hits']['total']['value'])

Your index has 8 entries


In [14]:
#Try a test query. The query searches "summary" field which contains the text
#and does a full text query on that field.
res = es.search(index="myindex", body={"query": {"match": {"text": "innovation"}}})
print("Your search returned %d results." % res['hits']['total']['value'])

Your search returned 7 results.


In [15]:
#match query considers both exact matches, and fuzzy matches and works as a OR query. 
#match_phrase looks for exact matches.
while True:
    query = input("Enter your search query: ")
    if query == "STOP":
        break
    res = es.search(index="myindex", body={"query": {"match_phrase": {"text": query}}})
    print("Your search returned %d results:" % res['hits']['total']['value'])
    for hit in res["hits"]["hits"]:
        print(hit["_source"]["name"])
        #to get a snippet 100 characters before and after the match
        loc = hit["_source"]["text"].lower().index(query)
        print(hit["_source"]["text"][:100])
        print(hit["_source"]["text"][loc-100:loc+100])

Enter your search query: insurance innovation
Your search returned 2 results:
./sampledocs/Technology-and-innovation-in-the-insurance-sector.pdf


ValueError: substring not found