In [52]:
import PyPDF2
import os
import math
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import OrderedDict

In [116]:
# initialization local dokumen
def allFile(location):
    document = []
    for doc in os.walk(location):
        document = doc[2]
    return document

def extractPDF(location):
    documents = allFile(location)
    allText = []
    for doc in documents:
        file = open(location+'/'+doc, 'rb')
        fileReader = PyPDF2.PdfFileReader(file)
        
        docs = ''
        pages = fileReader.numPages
        for page in range(pages):
            obj = fileReader.getPage(page)
            docs = docs + obj.extractText()
        allText.append(docs)
    return allText

def generateDocNumber(filename):
    docNum = []
    for file in filename:
        docNum.append(str(filename.index(file)))
    return docNum

# PREPROCESSING
def removePunctuation(textList):
    for i in range(len(textList)):
        for punct in string.punctuation:
            textList[i] = textList[i].replace(punct, " ")
        textList[i] = re.sub(r'^https?:\/\/.*[\r\n]*', '', textList[i], flags=re.MULTILINE)
    return textList

def caseFolding(textList):
    text = []
    for i in range(len(textList)):
        text.append(textList[i].lower())
    return text

def token(sentence):
    token = []
    for word in CountVectorizer().build_tokenizer()(sentence):
        token.append(word)
    return token

def tokenize(textList):
    tokens = []
    for i in range(len(textList)):
        tokens.append(token(textList[i]))
    return tokens

def checkStopword(sentence, stop_words):
    sentence = [w for w in sentence if not w in stop_words]
    return sentence
    
def stopwordRemove(textList):
    stop_words = set(stopwords.words('english'))
    text = []
    for i in range(len(textList)):
        text.append(checkStopword(textList[i], stop_words))
    return text

def numberRemove(textList):
    text = []
    for i in range(len(textList)):
        text.append([w for w in textList[i] if not any(j.isdigit() for j in w)])
    return text

def stemming(textList):
    stemmer = PorterStemmer()
    text = textList
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            text[i][j] = stemmer.stem(text[i][j])
    return text

def sorting(textList):
    for i in range(len(textList)):
        textList[i] = sorted(textList[i])
    return textList

def getAllTerms(textList):
    terms = []
    for i in range(len(textList)):
        for j in range(len(textList[i])):
            terms.append(textList[i][j])
    return sorted(set(terms))

# INDEXING 
def createIndex(textList, docno):
    terms = getAllTerms(textList)
    proximity = {}
    for term in terms:
        position = {}
        for n in range(len(textList)):
            if(term in textList[n]):
                position[docno[n]] = []
                for i in range(len(textList[n])):
                    if(term == textList[n][i]):
                        position[docno[n]].append(i)
        proximity[str(term)] = position
    return proximity

def exportIndex(index, filename):
    file = open(filename,'w')
    for n in index:
        file.write(str(n)+'\n')
        for o in index[n]:
            file.write('\t'+o+': ')
            for p in range(len(index[n][o])):
                file.write(str(index[n][o][p]))
                if(p<len(index[n][o])-1):
                    file.write(', ')
                else:
                    file.write('\n')
    file.close()
    return "file index berhasil dibuat."    

# Ranking document
def queryInIndex(query, index):
    result = []
    for word in query:
        if word in index:
            result.append(word)
    return result

def df(query, index):
    docFreq = {}
    for word in query:
        if word in index:
            docFreq[word] = len(index[word])
    return docFreq

def idf(df, N):
    inv = {}
    for word in df:
        inv[word] = math.log10(N/df[word])
    return inv

def tf(query, index):
    termFreq = {}
    for word in query:
        freq = {}
        if word in index:
            for i in index[word]:
                freq[i] = len(index[word][i])
        termFreq[word] = freq
    return termFreq

def tfidf(tf, idf):
    w = {}
    for word in tf:
        wtd = {}
        for doc in tf[word]:
            wtd[doc] = (1+(math.log10(tf[word][doc])))*idf[word]
        w[word] = wtd
    return w
    
def score(TFIDF):
    res = {}
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = 0
    for i in TFIDF:
        for j in TFIDF[i]:
            res[j] = res[j]+TFIDF[i][j]
    sorted_dict = sorted(res, key=res.get, reverse=True)
    return sorted_dict

In [117]:
location = 'pdf_collection'
filename = allFile(location) 
extracted= extractPDF(location)
totalDoc = len(filename)
documentNumber = generateDocNumber(filename)

for i in range(len(filename)):
    extracted[i] = str(extracted[i].encode("utf-8"))
    
# PREPROCESSING
text = removePunctuation(extracted)
text = caseFolding(text)
text = tokenize(text)
text = stopwordRemove(text)
text = numberRemove(text)
text = stemming(text)

# GET ALL TERMS IN COLLECTION
terms = getAllTerms(text)

# INDEXING

# index = createIndex(text,documentNumber, terms)
index = createIndex(text,documentNumber)

# CREATE INDEX FILE
exportIndex(index, 'Index_docs.txt')

'file index berhasil dibuat.'

In [138]:
# QUERY

raw_query = ["database"]

query = removePunctuation(raw_query)
query = caseFolding(query)
query = tokenize(query)
query = stopwordRemove(query)
query = numberRemove(query)
query = stemming(query)
query = query[0]

In [142]:
# Processing
query = queryInIndex(query, index)
N               = totalDoc
tfidf_list      = []

docFrequency    = df(query, index)
invDocFrequency = idf(docFrequency, N)
termFrequency   = tf(query, index)
TFIDF           = tfidf(termFrequency, invDocFrequency)
sc              = score(TFIDF)

In [144]:
# Result

print('Query: ', raw_query,'\n\n')
print('Result: \n')
# for i in range(5):
#     a = documentNumber.index(sc[i])
#     print('Document Number: ',sc[i])
#     print(filename[a])
#     print('-------------------------------------------\n')

count = 0
for i in range(len(sc)):
    a = documentNumber.index(sc[i])
    print('==========================================================================\n')
    print('| Filename: ',filename[a],' | Document ID: ',documentNumber[a],'|','\n')
    print(extracted[a][0:1000])
    print('\n==========================================================================')
    print('\n\n\n')
    count = count + 1
    if(count >= 5):
        break

Query:  ['database'] 


Result: 


| Filename:  Database Auditing.pdf  | Document ID:  3 | 

b 1 Database Auditing  Jungha Woo  Sael Lee  and Carla Zoltowski  n wooj  lee399  cbz  purdue edu  n Abstract  n nGovernment regulations and increased awareness of  nsecurity issues have  nincreased the auditing  nrequirements of information technology systems   In  n nthis paper  we will discuss three government  nregulations and how they have impacted  ninformation techno nlogy systems   We classify  ndatabase auditing systems by considering features  nof the basic components of an auditing system as  n nproposed by Bishop  the logger  analyzer  and  nnotifier   In addition  we will consider possible  npolicy models that could be implemented   Finally  n nwe will survey three commerci nal database and third  nparty auditing products according to the  nclassification features  and discuss how they address  n nthe government regulation ns and general security  nneeds       n 1  Introduction  n 