In [1]:
import os
import sys

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir
# from whoosh.lang.porter import stem
import snowballstemmer

import pandas as pd

Die Stichwortsuche des Sphinx-Frameworks soll als Baseline für die getesteten Sprachmodelle dienen. Sie ist jedoch nicht leicht portierbar, da sie in JavaScript verfasst ist und auf Dependencies wie jQuery aufbaut. Weitaus einfacher ist die Implementierung einer ähnlichen Search-Engine mit **whoosh**, die aus vier Elementen besteht:

- Stemmer (Reduktion der Wörter in Dokumenten und Anfragen auf Wortstamm)
- Herausfiltern von Stoppwörtern aus Suchanfragen
- Suche nach Stichwörtern in Dokumenten
- Ranking von Dokumenten nach Relevanz nach erfolgreichem Auffinden von Stichwörtern

Für den Stemmer wird **snowball** genutzt (wie auch in der Sphinx Search Engine)

Überprüfen, ob Stemming auch auf deutschen Wörtern funktioniert

In [2]:
# Build the German snowball stemmer once; `stemmer` is reused by later cells.
stemmer = snowballstemmer.stemmer('german')

# Smoke test: stem a short German phrase token by token.
sample_tokens = "Viele Schritte".split()
print(stemmer.stemWords(sample_tokens))

['Viel', 'Schritt']


Diese Liste an deutschen Stopp-Wörtern ist der Doku selbst entnommen.

In [3]:
# German stopword list, all entries lowercase. Non-ASCII letters are written
# as \uXXXX escapes (e.g. "da\u00df", "f\u00fcr"), which Python decodes to
# the actual characters. Passed to prepare_string() for filtering.
stopwords = ["aber","alle","allem","allen","aller","alles","als","also","am","an","ander","andere","anderem","anderen","anderer","anderes","anderm","andern","anderr","anders","auch","auf","aus","bei","bin","bis","bist","da","damit","dann","das","dasselbe","dazu","da\u00df","dein","deine","deinem","deinen","deiner","deines","dem","demselben","den","denn","denselben","der","derer","derselbe","derselben","des","desselben","dessen","dich","die","dies","diese","dieselbe","dieselben","diesem","diesen","dieser","dieses","dir","doch","dort","du","durch","ein","eine","einem","einen","einer","eines","einig","einige","einigem","einigen","einiger","einiges","einmal","er","es","etwas","euch","euer","eure","eurem","euren","eurer","eures","f\u00fcr","gegen","gewesen","hab","habe","haben","hat","hatte","hatten","hier","hin","hinter","ich","ihm","ihn","ihnen","ihr","ihre","ihrem","ihren","ihrer","ihres","im","in","indem","ins","ist","jede","jedem","jeden","jeder","jedes","jene","jenem","jenen","jener","jenes","jetzt","kann","kein","keine","keinem","keinen","keiner","keines","k\u00f6nnen","k\u00f6nnte","machen","man","manche","manchem","manchen","mancher","manches","mein","meine","meinem","meinen","meiner","meines","mich","mir","mit","muss","musste","nach","nicht","nichts","noch","nun","nur","ob","oder","ohne","sehr","sein","seine","seinem","seinen","seiner","seines","selbst","sich","sie","sind","so","solche","solchem","solchen","solcher","solches","soll","sollte","sondern","sonst","um","und","uns","unse","unsem","unsen","unser","unses","unter","viel","vom","von","vor","war","waren","warst","was","weg","weil","weiter","welche","welchem","welchen","welcher","welches","wenn","werde","werden","wie","wieder","will","wir","wird","wirst","wo","wollen","wollte","w\u00e4hrend","w\u00fcrde","w\u00fcrden","zu","zum","zur","zwar","zwischen","\u00fcber"];

In [4]:
# Sanity check: number of entries in the stopword list.
len(stopwords)

231

Anfragen sind *case sensitive*, also muss alles *to lower* konvertiert werden.

In [5]:
# The lowercase form is an entry of the stopword list.
"was" in stopwords

True

In [6]:
# The capitalized form is not found: list membership compares strings
# exactly, so tokens must be lowercased before the stopword lookup.
"Was" in stopwords

False

In [7]:
def prepare_string(string, stopwords, word_stemmer=None):
    """Normalize a text for keyword search: lowercase, drop stopwords, stem.

    Tokens are lowercased and checked against the stopword list *before*
    stemming. Stemming first would compare word stems with a list of
    unstemmed stopwords, so an inflected stopword slips through whenever its
    stem is not itself listed. Lowercasing first also means the stemmer only
    ever sees lowercase input.

    Args:
        string: Raw query or document text. Anything that is not a ``str``
            (e.g. ``None`` or a pandas NaN from an empty cell) yields ``""``.
        stopwords: Iterable of stopwords; compared case-insensitively.
        word_stemmer: Optional object exposing ``stemWords(list) -> list``.
            Defaults to the notebook-level ``stemmer``.

    Returns:
        The remaining stems joined by single spaces (``""`` if nothing is
        left).
    """
    # Missing bodies cannot be tokenized; treat them as empty text.
    if not isinstance(string, str):
        return ''

    active_stemmer = stemmer if word_stemmer is None else word_stemmer

    # A set gives O(1) membership tests instead of scanning the list per token.
    stopword_set = {word.lower() for word in stopwords}

    # Only question marks are stripped; other punctuation stays on the token.
    tokens = [token.lower() for token in string.replace('?', '').split()]

    # Filter on the surface form, then stem what is left.
    content_tokens = [token for token in tokens if token not in stopword_set]

    return ' '.join(active_stemmer.stemWords(content_tokens))

Basis für die Suche ist das Bauen eines Index aus allen Dokumenten.

In [27]:
def createSearchableData(documents, indexdir_path, ending='.html'):
    '''
    Build the whoosh search index from a DataFrame of parsed documents.

    Schema definition: title (stored), path (as ID, stored), content
    (indexed but not stored), textdata (stored). Both ``content`` and
    ``textdata`` receive the body after ``prepare_string`` preprocessing
    with the notebook-level ``stopwords`` list.

    Args:
        documents: DataFrame whose rows provide the columns ``filename``,
            ``title`` and ``body``.
        indexdir_path: Directory that is created if missing.
            NOTE(review): the index itself is still written to the literal
            directory "indexdir", because ``query()`` opens that same
            literal. Switching to ``indexdir_path`` must be done in both
            places together.
        ending: Currently unused; kept for interface compatibility.
    '''

    # create Schema for indexing documents
    schema = Schema(
        title=TEXT(stored=True),
        path=ID(stored=True),
        content=TEXT,
        textdata=TEXT(stored=True)
    )

    # Directory the index is actually written to (see NOTE in the docstring).
    index_location = "indexdir"

    # Create both directories, including missing parents. Previously only
    # indexdir_path was created (non-recursively), so create_in() depended on
    # "indexdir" already existing from an earlier session.
    for directory in (indexdir_path, index_location):
        os.makedirs(directory, exist_ok=True)

    ix = create_in(index_location, schema)

    # As a context manager the writer commits on success and cancels on an
    # exception, so a failing row does not leave the index locked.
    with ix.writer() as writer:
        for _, row in documents.iterrows():
            text = prepare_string(row['body'], stopwords)
            writer.add_document(
                title=row['title'],
                path=row['filename'],
                content=text,
                textdata=text,
            )

### Erstellen des Suchindex aus Dokumenten

In [12]:
# Load the parsed documentation pages as a DataFrame.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the Kedro Jupyter session — confirm before running elsewhere.
documents = catalog.load("ecu_test_doku_parsed")

2022-05-10 11:35:43,392 - kedro.io.data_catalog - INFO - Loading data from `ecu_test_doku_parsed` (CSVDataSet)...


In [13]:
# Preview one row to check the available columns (filename, title, body, ...).
documents.head(1)

Unnamed: 0,filename,title,sub_topics,body,links,imgs
0,Bedienung_EasyInsert.html,EasyInsert,['Tastenkombinationen\uf0c1'],EasyInsert — ECU-TEST 2022.2 Anwenderhandbuch ...,"['../index.html', '../Getting_Started/GettingS...","['../_static/logo.png', '../_images/EasyInsert..."


In [28]:
# Build the search index from all loaded documents.
# NOTE(review): createSearchableData passes the literal "indexdir" to
# create_in(), so the index files end up there rather than under this path.
indexdir = "../data/03_primary/keyword_search/"
createSearchableData(documents, indexdir)

### Testen der Suchfunktion und Ausgabe gefundener Resultate in DataFrame

In [23]:
def query(query_str, topN_results=5, indexdir_path="indexdir"):
    """Run a keyword search against the whoosh index and return the top hits.

    The query is preprocessed with ``prepare_string`` (same normalization as
    the indexed documents) and parsed against the ``content`` field. Hits are
    ranked with whoosh's ``scoring.Frequency`` weighting.

    Args:
        query_str: Natural-language query text.
        topN_results: Maximum number of hits to return; expected to be a
            positive integer, it is passed to whoosh as the search limit.
        indexdir_path: Directory of the whoosh index. Defaults to "indexdir",
            the location previously hard-coded here.

    Returns:
        DataFrame with the columns ``Score``, ``Title`` and ``Path``, one row
        per returned hit. With no hits the frame is empty but still carries
        these columns.
    """
    result_columns = ['Score', 'Title', 'Path']

    query_str = prepare_string(query_str, stopwords)
    print(f'the query is: "{query_str}"')

    ix = open_dir(indexdir_path)
    with ix.searcher(weighting=scoring.Frequency) as searcher:
        parsed_query = QueryParser("content", ix.schema).parse(query_str)

        # Request exactly as many scored hits as will be returned. With the
        # default limit only 10 hits are scored while len() reports all
        # matches, so indexing past the tenth hit raised IndexError.
        hits = searcher.search(parsed_query, limit=topN_results)

        total_hits = len(hits)
        if total_hits > 0:
            print(f'\nsearch yielded {total_hits} results in total')
        else:
            print('search yielded no hits')

        # Read the stored fields while the searcher is still open.
        result_rows = [[hit.score, hit['title'], hit['path']] for hit in hits]

    # convert results to dataframe
    return pd.DataFrame(result_rows, columns=result_columns)

### Testquery 1

In [29]:
# "Was", "ist" and "ein" are stopwords, so only "Traceschritt" is searched.
query("Was ist ein Traceschritt?")

the query is: "traceschritt"

search yielded 34 results in total


Unnamed: 0,Score,Title,Path
0,25.0,Generische Signale,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Gene...
1,21.0,Traceschritte verwenden,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Trac...
2,15.0,Signalverarbeitung mit Traceschritten,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Sign...
3,14.0,Analysen mit Triggern und Berechnungen,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Bere...
4,12.0,Assertion,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Asse...


### Testquery 2

In [33]:
# Multi-keyword query: "Wie", "man" and "eine" are stopwords and are dropped.
query("Wie erstellt man eine Traceschritt Analyse?")

the query is: "erstellt traceschritt analys"

search yielded 15 results in total


Unnamed: 0,Score,Title,Path
0,31.0,Generische Signale,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Gene...
1,22.0,Analysen mit Triggern und Berechnungen,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Bere...
2,18.0,Konzepte und Vorgehen,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Konz...
3,13.0,Diagrammerstellung (Plot),TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Diag...
4,12.0,Analyseablaufsteuerung,TRACE-CHECK_Handbuch_Traceanalyse-Entwurf_Abla...


### Testquery 3

In [31]:
# "Wie", "wird" and "ein" are stopwords; "Testfall" and "erstellt" remain.
query("Wie wird ein Testfall erstellt?")

the query is: "testfall erstellt"

search yielded 57 results in total


Unnamed: 0,Score,Title,Path
0,32.0,Testausführung,Testausfuehrung_Testausfuehrung.html
1,21.0,Erster Start,Getting_Started_GettingStarted.html
2,15.0,Schnittstelle zur Traceanalyse,Testausfuehrung_Analyse-Jobs.html
3,14.0,Projekt-Editor,Hauptprogramm_Editor_Projekteditor.html
4,13.0,Wie werden Tests erstellt?,Einfuehrung_Testen.html
