In [1]:
import collections
import logging
import os

import numpy as np
import pandas as pd


#pdfConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

#tokenizing and cleaning the words in file
import nltk
import heapq
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from openpyxl import Workbook
import re

#Amazon S3
import boto3
from botocore.exceptions import ClientError

# Convert the PDFs into a single text file

In [2]:
class PdfConverter:
    """Extract the text of a PDF and score its most frequent terms with TF-IDF.

    Typical flow: ``save_convert_pdf_to_txt`` writes the extracted text under
    ``<cwd>/data/covertedfiles``, ``tfIDF`` reads that text back and scores the
    most frequent unigrams and bigrams, and ``uploadToS3`` publishes the
    resulting keyword table as a CSV object.
    """

    # Text files that converted PDFs are written to and later read from. The
    # position in this tuple is the ``count`` given to ``save_convert_pdf_to_txt``.
    _TEXT_FILES = ('pdfToText1.txt',)  # ,'pdfToText2.txt'

    # Number of most frequent unigrams (and, separately, bigrams) that get scored.
    _TOP_N = 500

    def __init__(self, file_path):
        """Remember the path of the PDF this converter works on."""
        self.file_path = file_path

    @staticmethod
    def _converted_path(filename):
        """Return the path of a converted text file below the working directory.

        The directory name ``covertedfiles`` (sic) is kept as-is because it is
        the on-disk location the rest of the project already uses.
        """
        return os.path.join(os.getcwd(), "data", "covertedfiles", filename)

    @staticmethod
    def _content_words(words, stop_words):
        """Drop stop words, single-character tokens and all-digit tokens."""
        return [w for w in words
                if w not in stop_words and len(w) > 1 and not w.isdigit()]

    @staticmethod
    def _joined_bigrams(words):
        """Return every adjacent word pair concatenated without a separator."""
        return [''.join(pair).lower().strip() for pair in ngrams(words, 2)]

    def _most_frequent(self, tokens):
        """Return up to ``_TOP_N`` distinct tokens, most frequent first."""
        freq = collections.Counter(tokens)
        return heapq.nlargest(self._TOP_N, freq, key=freq.get)

    def convert_pdf_to_txt(self):
        """Extract the text of ``self.file_path`` with pdfminer.

        Returns:
            str: the text of all pages, laid out with default ``LAParams``.
        """
        rsrcmgr = PDFResourceManager()
        with StringIO() as retstr:
            device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
            try:
                # ``with`` guarantees the PDF handle is released even when a
                # page fails to parse.
                with open(self.file_path, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    # Empty page set and maxpages=0 are passed through unchanged
                    # from the original call.
                    for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                                  password="", caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)
            finally:
                device.close()
            return retstr.getvalue()

    def save_convert_pdf_to_txt(self, count):
        """Convert the PDF and store its text as UTF-8 in the ``count``-th text file.

        Args:
            count: index into the list of known text files. When it matches no
                entry, the PDF is still converted but nothing is written.
        """
        content = self.convert_pdf_to_txt()
        for indx, filename in enumerate(self._TEXT_FILES):
            if count == indx:
                target = self._converted_path(filename)
                # Create the output folder on first use instead of failing.
                os.makedirs(os.path.dirname(target), exist_ok=True)
                with open(target, 'wb') as txt_pdf:
                    txt_pdf.write(content.encode('utf-8'))

    def dataPreprocessing(self):
        """Load the converted text files and split them into cleaned sentences.

        Returns:
            list[str]: one entry per sentence found by ``nltk.sent_tokenize``,
            lower-cased, with every non-word character replaced by a space and
            runs of whitespace collapsed to a single space.
        """
        pieces = []
        for filename in self._TEXT_FILES:
            with open(self._converted_path(filename), "r", encoding="utf-8") as handle:
                pieces.append(handle.read())
        text = ''.join(pieces)

        cleaned = []
        for sentence in nltk.sent_tokenize(text):
            sentence = re.sub(r'\W', ' ', sentence.lower())
            cleaned.append(re.sub(r'\s+', ' ', sentence))
        return cleaned

    def tokenizedWords(self, corpus):
        """Tokenize the sentences into unigrams and concatenated bigrams.

        Stop words, one-character tokens and all-digit tokens are removed
        before both lists are built; bigrams are formed from the remaining
        words of each sentence and joined without a separator.

        Args:
            corpus: iterable of sentence strings.

        Returns:
            tuple[list[str], list[str]]: ``(unigrams, bigrams)`` in corpus order.
        """
        stop_words = set(stopwords.words('english'))
        unigrams = []
        bigrams = []
        for para in corpus:
            kept = self._content_words(nltk.word_tokenize(para), stop_words)
            unigrams.extend(w.lower().strip() for w in kept)
            bigrams.extend(self._joined_bigrams(kept))
        return unigrams, bigrams

    def tfIDF(self):
        """Score the most frequent unigrams and bigrams of the corpus.

        For each term and each non-empty sentence, ``tf`` is the number of
        occurrences divided by the sentence's token count, and ``idf`` is
        ``log(N / (1 + df))`` where ``N`` is the number of sentences and ``df``
        the number of non-empty sentences containing the term. The reported
        weight is the largest ``|tf * idf|`` over all sentences.

        Returns:
            tuple[list[str], list[float]]: the scored terms and their weights,
            aligned by position.
        """
        corpus = self.dataPreprocessing()
        unigrams, bigrams = self.tokenizedWords(corpus)
        most_freq = self._most_frequent(unigrams) + self._most_frequent(bigrams)

        # Tokenize every sentence once and keep its term counts; doing this per
        # scored term repeated the same tokenization thousands of times.
        stop_words = set(stopwords.words('english'))
        doc_terms = []
        for document in corpus:
            if len(document) == 0:
                continue
            words = nltk.word_tokenize(document)
            counts = collections.Counter(words)
            # Bigram terms are concatenated word pairs, so they can only be
            # found if the sentence's own pairs are counted in the same form.
            # A unigram that happens to equal a joined pair is counted under
            # the same key.
            counts.update(self._joined_bigrams(self._content_words(words, stop_words)))
            doc_terms.append((counts, len(words) or 1))

        total_docs = len(corpus)
        scores = {}
        for token in most_freq:
            if token in scores:
                # The same string can appear in both the unigram and bigram lists.
                continue
            containing = sum(1 for counts, _ in doc_terms if token in counts)
            idf = np.log(total_docs / (1 + containing))
            # idf is negative when the term occurs in every sentence, hence abs().
            scores[token] = max(
                (abs((counts[token] / length) * idf) for counts, length in doc_terms),
                default=0.0,
            )

        return list(scores), list(scores.values())

    def uploadToS3(self, finalwords, tfidfValues,
                   bucket='hiringtrendanalysis', key='keywords.csv'):
        """Upload the keyword table to S3 as a CSV object.

        Args:
            finalwords: terms, written to the ``words`` column.
            tfidfValues: weights aligned with ``finalwords``, written to the
                ``weights`` column.
            bucket: name of an already existing S3 bucket.
            key: object key the CSV is stored under.

        Returns:
            bool: ``True`` on success, ``False`` when boto3 raised a
            ``ClientError`` (the error is logged).
        """
        tf_idf_keywords = pd.DataFrame()
        tf_idf_keywords['words'] = finalwords
        tf_idf_keywords['weights'] = tfidfValues

        csv_buffer = StringIO()
        tf_idf_keywords.to_csv(csv_buffer)
        try:
            s3_resource = boto3.resource('s3')
            s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())
        except ClientError as e:
            logging.error(e)
            return False
        return True

In [3]:
if __name__ == '__main__':
    # Convert every listed PDF (looked up under <cwd>/data/pdfs), then score
    # the converted text and publish the keyword table.
    pdfs = ['techglossary.pdf']#,'roiguidebook.pdf']
    pdfConverter = None

    for count, p in enumerate(pdfs):
        path = os.path.join(os.getcwd(), "data", "pdfs", p)
        pdfConverter = PdfConverter(file_path=path)
        # save_convert_pdf_to_txt runs the conversion itself, so calling
        # convert_pdf_to_txt separately would only parse the PDF a second time.
        pdfConverter.save_convert_pdf_to_txt(count)
        print("Converted", p)

    # Nothing to score when the PDF list is empty.
    if pdfConverter is not None:
        finalwords, tfidfValues = pdfConverter.tfIDF()
        pdfConverter.uploadToS3(finalwords, tfidfValues)


Converted techglossary.pdf
