* initially experienced significant package dependency issues
* created conda env 'pdf_process' <- use this

## Preliminaries

In [4]:
import pandas as pd
import numpy as np
import PyPDF2
import textract
import re

## Reading Text

- converted PDF file to txt format for better pre-processing

In [5]:
filename ='../src/Songbird Migration 2018_Field Note_1.pdf'

pdfFileObj = open(filename,'rb')               #open allows you to read the file
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)   #The pdfReader variable is a readable object that will be parsed
num_pages = pdfReader.numPages                 #discerning the number of pages will allow us to parse through all the pages


count = 0
text = ""
                                                            
while count < num_pages:                       #The while loop will read each page
    pageObj = pdfReader.getPage(count)
    count +=1
    text += pageObj.extractText()
    
#Below if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned files.

if text != "":
    text = text
    
#If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text

else:
    text = textract.process('http://bit.ly/epo_keyword_extraction_document', method='tesseract', language='eng')

    # Now we have a text variable which contains all the text derived from our PDF file.

In [6]:
text = text.encode('ascii','ignore').lower() #Lowercasing each word

## Extracting Keywords

In [7]:
keywords = re.findall(r'[a-zA-Z]\w+',text)
len(keywords)                               #Total keywords in document

798

In [8]:
df = pd.DataFrame(list(set(keywords)),columns=['keywords'])  #Dataframe with unique keywords to avoid repetition in rows

## Calculating Weightage

 - In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. 

- __TF: Term Frequency__, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization: 

__TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).__

- __IDF: Inverse Document Frequency__, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following: 

__IDF(t) = log_e(Total number of documents / Number of documents with term t in it).__

In [9]:
def weightage(word,text,number_of_documents=1):
    word_list = re.findall(word,text)
    number_of_times_word_appeared =len(word_list)
    tf = number_of_times_word_appeared/float(len(text))
    idf = np.log((number_of_documents)/float(number_of_times_word_appeared))
    tf_idf = tf*idf
    return number_of_times_word_appeared,tf,idf ,tf_idf    

In [10]:
df['number_of_times_word_appeared'] = df['keywords'].apply(lambda x: weightage(x,text)[0])
df['tf'] = df['keywords'].apply(lambda x: weightage(x,text)[1])
df['idf'] = df['keywords'].apply(lambda x: weightage(x,text)[2])
df['tf_idf'] = df['keywords'].apply(lambda x: weightage(x,text)[3])

In [11]:
df = df.sort_values('tf_idf',ascending=True)
# df.to_csv('Keywords.csv')
df.head(25)

Unnamed: 0,keywords,number_of_times_word_appeared,tf,idf,tf_idf
272,in,90,0.016435,-4.49981,-0.073956
263,at,62,0.011322,-4.127134,-0.046728
339,on,59,0.010774,-4.077537,-0.043933
292,the,49,0.008948,-3.89182,-0.034825
377,an,48,0.008766,-3.871201,-0.033933
269,it,43,0.007852,-3.7612,-0.029535
346,or,36,0.006574,-3.583519,-0.023559
232,we,29,0.005296,-3.367296,-0.017833
268,is,26,0.004748,-3.258097,-0.015469
4,to,25,0.004565,-3.218876,-0.014695


## Second Method - Using Gensim library

In [12]:
from gensim.summarization import keywords
import warnings
warnings.filterwarnings("ignore")

No handlers could be found for logger "smart_open.ssh"


In [13]:
values = keywords(text=text,split='\n',scores=True)

In [14]:
data = pd.DataFrame(values,columns=['keyword','score'])
data = data.sort_values('score',ascending=False)
data.head(10)

Unnamed: 0,keyword,score
0,sparrow,0.265833
1,captured,0.263599
2,captures,0.263599
3,capture,0.263599
4,site,0.183963
5,sites,0.183963
6,banding,0.157433
7,banded,0.157433
8,bird,0.140004
9,birds,0.140004


### Third Approach - Using RAKE (Rapid Automatic Keyword Extraction)

In [15]:
from rake_nltk import Rake

In [16]:
r = Rake()
r.extract_keywords_from_text(text)

In [17]:
phrases = r.get_ranked_phrases_with_scores()

In [18]:
table = pd.DataFrame(phrases,columns=['score','Phrase'])
table = table.sort_values('score',ascending=False)


In [19]:
table.head(10)

Unnamed: 0,score,Phrase
0,274.038636,warbler 46common yellowthroat 4song sparrow 34...
1,139.523485,thrush 109unidentified empidonax flycatcher 7y...
2,96.0,eared owl 1spotted towhee 9ovenbird 1american ...
3,80.0,headed grosbeak 20northern waterthrush 2lazuli...
4,68.723485,crowned warbler 19ruffed grouse 2warbling vire...
5,68.133333,pewee 26downy woodpecker 2dusky flycatcher 22m...
6,46.0,breasted nuthatch 2evening grosbeak 13wild tur...
7,45.133333,flycatcher 10hermit thrush 1calliope hummingbi...
8,39.208333,warbler 16varied thrush 2american redstart 13w...
9,29.333333,eyed vireo 1grand total 1336with help
