# Analysis



Now we'll take the CONLL data and do some analysis. Like 
* plot the frequencies of some lemma
* look at some lemmas contexts
* analyze some lemmas collocations


## 0. Imports and data upload

In [1]:
import pandas as pd

In [2]:
import re

In [3]:
import os

In [4]:
conllfiles = (r"../data/conll")

In [5]:
corpus_annotations = {}
for filename in os.listdir(conllfiles):
    if '.conll' in filename:
        path = os.path.join(conllfiles, filename)  
        data = pd.read_csv(path) 
        corpus_annotations[filename] = data

## 1. Search lemma and plot frequency

In [6]:
class WordSearchEngine:
    
    def __init__(self, corpus_annotated):
        self.prepare_index_dataframe_for_search(corpus_annotated)
    
    def prepare_index_dataframe_for_search(self, corpus_annotated):
        for filename, annotated_text in corpus_annotated.items():
            year, month, day = self.get_date_fname(filename)
            annotated_text['month'] = month
        self.full_df = pd.concat(corpus_annotated.values())
        print(f'Searching in a corpus of {self.full_df.shape[0]} word occurences')
        
    def get_date_fname(self, filename):  ## REDO WITH METADATA
        date_pattern = re.search(r'(19\d\d)(\d\d)(\d\d)-0-0-0-0', filename)
        year = date_pattern.group(1)
        month = f'{year}-{date_pattern.group(2)}'
        day =  f'{month}-{date_pattern.group(3)}'
        return year, month, day    
        
    def search_and_plot(self):
        search_term = input('Insert a word to search: ')
        if len(search_term) == 0:
            search_term = 'Grippe'
        result = self.full_df.query(f'Lemma=="{search_term}"')
        result.groupby('month').count().Lemma.plot(title=f'frequency of {search_term}');
        

In [7]:
engine = WordSearchEngine(corpus_annotations)

Searching in a corpus of 3008370 word occurences


In [9]:
engine.search_and_plot()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

## 2. Exploring the contexts

Let us look at the contexts in which the words appear


### 2.1 KWIC

In [10]:
class ContextViewer:
    
    def __init__(self, corpus_annotated):
        self.full_df = self.prepare_index_dataframe_for_search(corpus_annotated)
        self.full_df = self.full_df.reset_index()
        #print(type(self.full_df))
        
    def prepare_index_dataframe_for_search(self, corpus_annotated):
        for filename, annotated_text in corpus_annotated.items():
            year, month, day = self.get_date_fname(filename)
            annotated_text['month'] = month
        return pd.concat(corpus_annotated.values())
        #print(f'Searching in a corpus of {self.full_df.shape[0]} word occurences')
        
    def get_date_fname(self, filename): ## REDO WITH METADATA
        date_pattern = re.search(r'(19\d\d)(\d\d)(\d\d)-0-0-0-0', filename)
        year = date_pattern.group(1)
        month = f'{year}-{date_pattern.group(2)}'
        day =  f'{month}-{date_pattern.group(3)}'
        return year, month, day    
        
    def get_context(self):
        search_lemma = input('Insert a word to search: ')
        if len(search_lemma) == 0:
            search_lemma = 'Grippe'
        indices = self.full_df.query(f'Lemma=="{search_lemma}"').index
        #print(indices)
        left_contexts = []
        this_words = []
        right_contexts = []
        months = []
        for indice in indices:
            left = self.full_df.iloc[indice-10:indice-1, ]["Token"]
            leftс = left[~left.str.contains('\n')]
            right = self.full_df.iloc[indice+1:indice+10, ]["Token"]
            rightс = right[~right.str.contains('\n')]
            left_contexts.append(' '.join(leftс))
            right_contexts.append(' '.join(rightс))
            this_words.append(self.full_df.iloc[indice, ]["Token"])
            months.append(self.full_df.iloc[indice, ]["month"])
        newdf = pd.DataFrame()
        newdf['left_context'] = left_contexts
        newdf['word'] = this_words
        newdf['right_context'] = right_contexts
        newdf['month'] = months
        return newdf #.sort_values(by='month')

In [11]:
kwic = ContextViewer(corpus_annotations)

In [12]:
kwic.get_context()

Unnamed: 0,left_context,word,right_context,month
0,", 15. Oktober 1918. En LI IR",Grippe,wütel weiter Zunahme der ſcheren Fälle in Berlin,1918-10
1,": Poſen , 14. Oltober . desfälle",Grippe,find in der vergan- „ auf 58 geſtiegen,1918-10
2,einzelne alten werden können . t ſich,Grippe,aus . te fi die Grippewelle N,1918-10
3,"Meldung , Marſchall Io ffr e ſchwer krank",Grippe,darniederliegt . Sein Beſuch in England wurde,1918-10
4,"wurde abgeſagt . Es beſtätigt fie , daß",Grippe,", und gefährliche Epidemien im franzöſiſchen S...",1918-10
5,|| - Zahn- Praxis Klömpen Beize | a,Grippe,DIRE Tr beite wi ; ichel im Gegſcht,1918-12
6,Feldlazarett | im Felde infolge Anſte > ung,Grippe,in „ treuen Pfichterſülung fürs | Vaterland der,1918-09
7,3. verſchied in einem Feld- 8 lazarett infolge,Grippe,"im 32 , Lebensjahre unſer Wi Prokurfft Sert",1918-10
8,von dem am 19. Oltober d. I. infolge,Grippe,"erfolgten Hin- ſcheidens unſeres Prokuriſten ,...",1918-10
9,"Elfe , geb. Freudenthal . Infolge Erkrankung «",Grippe,"mit Lungenentzündung Jam nat kurzem , aber ſch...",1918-10


### 2.2 Collocations