In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public IPython.display module;
# `!important` has to precede the semicolon to be part of the declaration.
from IPython.display import HTML, display
display(HTML('<style>.container { width:100% !important; } </style>'))

In [None]:
from tika import parser
import magic
import math
import os
import string
import platform
import re
import operator
from nltk.tokenize import RegexpTokenizer

In [None]:
class Document:
    """One indexed file together with its token list.

    Attributes:
        url: Location of the file, stored exactly as passed in.
        length: Length value supplied by the caller.
        id: Identifier supplied by the caller.
        textList: Token list supplied by the caller.
        score: Score of the document; always starts at 0.0.
    """

    def __init__(self, url, length, id, textList):
        # A new document has not been scored yet.
        self.score = 0.0
        self.textList = textList
        self.id, self.length, self.url = id, length, url

In [None]:
class Index:
    """Container for the term index and the documents it refers to.

    All three attributes are defined on the class itself, so the two
    dictionaries are shared by every instance that does not rebind them.
    """

    # document id -> Document
    docHashmap = dict()
    # dictionary: term -> document ids
    hashmap = dict()
    # integer, total number of all files found
    fileCount = 0

In [None]:
def buildIndex(self):
    """Walk all start directories and index every PDF that yields text.

    For each file whose MIME type is ``application/pdf`` the text is
    extracted with tika, tokenised with ``_preprocessText``, wrapped in a
    ``Document`` and registered in ``docHashmap`` and in the term index.

    ``fileCount`` is only advanced after a document has been processed
    successfully, so it equals the number of indexed documents and the
    document ids stay consecutive. Files that cannot be read or parsed are
    skipped (best effort); ``KeyboardInterrupt`` is not swallowed, so the
    crawl can be aborted.
    """
    # Fetch all start directories.
    start = self._getStartDirectories()
    # Create the magic instance used to determine each file's type.
    mime = magic.Magic(mime=True)

    for s in start:
        for root, _dirs, files in os.walk(s):
            for f in files:
                path = os.path.abspath(os.path.join(root, f))
                try:
                    if mime.from_file(path) != "application/pdf":
                        continue

                    # Convert to text and tokenise.
                    rawText = parser.from_file(path)['content']
                    if rawText is None:
                        # The parser reported no text content for this file.
                        continue
                    processedText = self._preprocessText(rawText)

                    # Only now is it certain that the document gets indexed.
                    documentID = self.fileCount + 1
                    document = Document(path, len(processedText), documentID, processedText)
                    self.docHashmap[documentID] = document
                    self._addToIndex(documentID, processedText)
                    self.fileCount = documentID
                except Exception:
                    # Unreadable or unparsable file: skip it and carry on.
                    continue

# Attach the function above to Index so it is callable as a method.
Index.buildIndex = buildIndex

In [None]:
def _getStartDirectories(self):
    start = []
    
    if platform.system() == "Linux":
        start.append("/")
    elif platform.system() == "Windows":
        start = ['%s:\\' % d for d in string.ascii_uppercase if os.path.exists('%s:' % d)]
    else:
        raise EnvironmentError
        
    return start

# Attach the function above to Index so it is callable as a method.
Index._getStartDirectories = _getStartDirectories

In [None]:
def _addToIndex(self, documentID, terms):
    for t in terms:
        try:
            docs = self.hashmap[t]
            docs.add(documentID)
            self.hashmap.update({t : docs})
        except KeyError:
            docs = {documentID}
            self.hashmap.update({t : docs})
    
# Attach the function above to Index so it is callable as a method.
Index._addToIndex = _addToIndex

In [None]:
def _preprocessText(self, txt):
    # lower all:
    txt = txt.lower()
    
    # remove digits
    txt = re.sub(r'\d+', '', txt)
            
    # tokenize the text
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+-$|\w+')
    result = tokenizer.tokenize(txt)
    
    # concatenate divided words
    for word in result:
        if word[-1] == '-':
            ind = result.index(word)
            corrected = word[:-1]+result[ind+1]
            result[ind] = corrected
            del result[ind+1]
    return result
    
# Attach the function above to Index so it is callable as a method.
Index._preprocessText = _preprocessText

In [None]:
def retrieve(self, searchString):
    """Return the URLs of all documents matching ``searchString``, best first.

    The query is pre-processed like the indexed texts. Every document that
    contains at least one query term is scored through its ``tf_idf``
    method and the documents are returned in descending score order.

    Only this instance's ``hashmap`` and ``docHashmap`` are consulted; the
    method does not depend on any notebook-level variable.

    Args:
        searchString: The raw query text.

    Returns:
        A list of document URLs, highest score first. Empty when no query
        term occurs in the index.
    """
    # pre-processing
    queryTerms = self._preprocessText(searchString)

    df = {}             # term -> number of documents containing it
    candidates = set()  # ids of all documents containing any query term
    for term in queryTerms:
        postings = self.hashmap.get(term)
        if postings is None:
            # Term does not occur in any indexed document.
            continue
        df[term] = len(postings)
        candidates.update(postings)

    scores = {}
    for documentID in candidates:
        doc = self.docHashmap[documentID]
        doc.tf_idf(queryTerms, df)
        scores[doc.id] = doc.score

    # Ascending sort followed by a reversal puts the highest score first.
    ranked = sorted(scores.items(), key=operator.itemgetter(1))
    return [self.docHashmap[key].url for key, _ in reversed(ranked)]

# Attach the function above to Index so it is callable as a method.
Index.retrieve = retrieve

In [None]:
# Create the index and populate it right away. buildIndex walks every start
# directory returned by _getStartDirectories, so this cell may run for a long time.
ind = Index()
ind.buildIndex()

In [None]:
def tf_idf(self, termList, df, docCount=None):
    """Compute this document's tf-idf score for a query and store it in ``self.score``.

    For every query term the number of occurrences in ``self.textList`` is
    multiplied by ``log10((N + 1) / (df + 1))``, where ``N`` is the number
    of indexed documents and ``df`` the number of documents containing the
    term. The score is the sum over all query terms.

    Args:
        termList: The pre-processed query terms.
        df: Mapping of query term to the number of documents containing it.
            Every key must also occur in ``termList``.
        docCount: Total number of indexed documents. When omitted, the
            number of entries in the class-level ``Index.docHashmap`` is used.
    """
    if docCount is None:
        # A freshly created Index() would report fileCount == 0, so the
        # size of the shared document table is used instead.
        docCount = len(Index.docHashmap)

    # Term frequencies of the query terms in this document; the dict doubles
    # as a constant-time membership test.
    tfDict = dict.fromkeys(termList, 0)
    for term in self.textList:
        if term in tfDict:
            tfDict[term] += 1

    for term, documentFrequency in df.items():
        idf = math.log10((docCount + 1) / (documentFrequency + 1))
        tfDict[term] *= idf

    self.score = sum(tfDict.values())

# Attach the function above to Document so it is callable as a method.
Document.tf_idf = tf_idf

In [None]:
# Example query. Despite its name, resultSet is the list of document URLs
# returned by retrieve; the URLs are printed one per line.
resultSet = ind.retrieve("Information Retrieval")
for elem in resultSet:
    print(elem)