# Analysis 2: Key Word in Context (KWIC) search

## 0. Imports and data upload

In [14]:
import pandas as pd

In [15]:
import re

In [16]:
from pathlib import Path

In [17]:
conllfiles = Path(r"../data/csv")

In [18]:
corpus_annotations = {}
for file in conllfiles.iterdir():
    if file.suffix == '.csv':
        #path = os.path.join(conllfiles, filename)  
        data = pd.read_csv(file) 
        corpus_annotations[file.name] = data

In [19]:
corpus_metadata = pd.read_csv(Path('../data/metadata/QUADRIGA_FS-Text-01_Data01_Corpus-Table.csv'), sep=';')

In [20]:
corpus_metadata = corpus_metadata.set_index('DC.identifier')

## 1. KWIC-search

In [27]:
class ContextViewer:
    
    def __init__(self, corpus_annotated, corpus_metadata):
        self.prepare_index_dataframe_for_search(corpus_annotated, corpus_metadata)
    
    def prepare_index_dataframe_for_search(self, corpus_annotated, corpus_metadata):
        for filename, annotated_text in corpus_annotated.items():
            txtname = filename.replace('.csv', '')
            if txtname in corpus_metadata.index:
                year, month, day = self.get_date_fname(txtname, corpus_metadata)
                annotated_text['month'] = month
                annotated_text['filename'] = filename
        self.full_df = pd.concat(corpus_annotated.values())
        self.full_df = self.full_df.reset_index()
        print(f'Searching in a corpus of {self.full_df.shape[0]} word occurences')
        
    def get_date_fname(self, txtname, corpus_metadata):  
        date = corpus_metadata.loc[txtname, 'DC.date']
        date = str(date)
        year = date[:4]
        month = date[:7]
        day = date
        return year, month, day 
    
    def get_context_words(self, n_words):
        search_lemma = input('Insert a word to search: ')
        if len(search_lemma) == 0:
            search_lemma = 'Grippe'
        indices = self.full_df.query(f'Lemma=="{search_lemma}"').index
        #print(indices)
        left_contexts = []
        this_words = []
        right_contexts = []
        months = []
        for indice in indices:
            left = self.full_df.iloc[indice-10:indice-1, ]["Token"]
            leftс = left[~left.str.contains('\n')]
            right = self.full_df.iloc[indice+1:indice+10, ]["Token"]
            rightс = right[~right.str.contains('\n')]
            left_contexts.append(' '.join(leftс))
            right_contexts.append(' '.join(rightс))
            this_words.append(self.full_df.iloc[indice, ]["Token"])
            months.append(self.full_df.iloc[indice, ]["month"])
        newdf = pd.DataFrame()
        newdf['left_context'] = left_contexts
        newdf['word'] = this_words
        newdf['right_context'] = right_contexts
        newdf['month'] = months
        return newdf
        
    ## currently unused functionality:
    def get_context_sents(self, n_sentences):
        search_lemma = input('Insert a word to search: ')
        if len(search_lemma) == 0:
            search_lemma = 'Grippe'
        indices = self.full_df.query(f'Lemma=="{search_lemma}"').index
        #print(indices)
        left_contexts = []
        this_sentences = []
        right_contexts = []
        months = []
        for indice in indices:
            #print(indice)
            current_filename = self.full_df.iloc[indice, ]["filename"]
            current_sentence_id = self.full_df.iloc[indice, ]["Sentence_idx"]
            left_context = self.get_sents(direction=-1, 
                                              current_filename=current_filename, 
                                              current_sentence_id=current_sentence_id, 
                                              n_sentences=n_sentences) 
            left_contexts.append(left_context)
            right_context = self.get_sents(direction=1, 
                                               current_filename=current_filename, 
                                               current_sentence_id=current_sentence_id, 
                                               n_sentences=n_sentences) 
            right_contexts.append(right_context)
            this_sentence = self.get_sents(direction=0, 
                                               current_filename=current_filename,
                                               current_sentence_id=current_sentence_id,
                                               n_sentences=1)
            this_sentences.append(this_sentence)
            #this_words.append(self.full_df.iloc[indice, ]["Token"])
            months.append(self.full_df.iloc[indice, ]["month"])
        newdf = pd.DataFrame()
        newdf['left_sentences'] = left_contexts
        newdf['this_sentence'] = this_sentences
        newdf['right_sentences'] = right_contexts
        newdf['month'] = months
        return newdf #.sort_values(by='month')
    
    def get_sents(self, direction, current_filename, current_sentence_id, n_sentences):
        sentences = []
        for n in range(1,n_sentences+1):
            sentence_id = current_sentence_id + (n * direction)
            this_sentence = self.create_sentence(current_filename, sentence_id)
            sentences.append(this_sentence)
        #print(' '.join(sentences))
        return ' '.join(sentences)
    
    def create_sentence(self, current_filename, sentence_id):
        words = self.full_df.query(f'filename=="{current_filename}" and Sentence_idx=={sentence_id}')['Token']
        sentence = ' '.join(words)
        #print(sentence)
        return sentence
        

In [24]:
kwic = ContextViewer(corpus_annotations, corpus_metadata)

Searching in a corpus of 33165791 word occurences


In [25]:
kwic.get_context_words(n_words=5)

Insert a word to search: Grippe


Unnamed: 0,left_context,word,right_context,month
0,. 3. verschied in einem Feld- 8 lazarett infolge,Grippe,"im 32 , Lebensjahre unser Wi Prokurfft Sert",1918-10
1,Rachricht von dem am 19. Oltober d. I. infolge,Grippe,"erfolgten Hin- scheidens unseres Prokuristen ,...",1918-10
2,"Elfe , geb. Freudenthal . Infolge Erkrankung «",Grippe,"mit Lungenentzündung Jam nat kurzem , aber sch...",1918-10
3,Kräftsensteges entichlief heute nacht 3 Uhr an...,Grippe,"meine unvergeß » Tiche , herzensgute Geywieger...",1918-10
4,"nicht sterbenz es ist kein Muß , daß man",Grippe,"oder Typhus stirbt , unsere Heilkunde ist noch...",1919-09
...,...,...,...,...
374,einem Feldlazarett | im Felde infolge Anste > ung,Grippe,in „ treuen Pfichtersülung fürs | Vaterland der,1918-09
375,ver fiel der polizeilichen Auflösung Zur „ B...,Grippe,hat der Gemeinderat der Stadt Beru alle öffent...,1918-07
376,"chte Über änftige Erfolge , die mit einem Heil...",Grippe,"gemacht worden find , müssen mit Bore fit ausg...",1918-10
377,letzten Tegen lessen Übrigens die Anna ) els ob,Grippe,ihren Höhepunkt über- schritten dat . 2 ;,1918-10
