In [54]:
import pandas as pd
import numpy as np
import json
import spacy
from spacy.lang.de import German
from spacy.lang.de.stop_words import STOP_WORDS
import string
import re
import collections

In [34]:
#Load Data from JSON into Pandas Dataframe
filename = 'scraped_data/all_reports_all_years_all_pages.json'
df = pd.read_json('scraped_data/all_reports_all_years_all_pages.json', orient='columns', encoding='utf-8')

#Delete all Rows where there is empty articles (String/NaN))
print("How Many Empty Articles are in the Dataframe? {}".format(len(df["article"][(df['article'].values == '')])))
df['article'].replace('', np.nan, inplace=True)
df.dropna(subset=['article'], inplace=True)
print("After Cleaning the Articles, how many empty are there? {}".format(len(df["article"][(df['article'].values == '')])))

#Drop all unneeded columns that are irrelevant for the Model
columns_to_remove = ['response', 'published', 'bezirk', 'subheads', 'url', 'items']
df_clean = df.drop(columns_to_remove, 1)
df_clean.head(5)
pd.set_option('display.max_colwidth', -1)



How Many Empty Articles are in the Dataframe? 6
After Cleaning the Articles, how many empty are there? 0


In [55]:
nlp = spacy.load('de_core_news_sm')
punctuations = string.punctuation
punctuations += "–"
punctuations += "€"
punctuations += '„'
punctuations += '“'

def f(x):
    processed = nlp(x)    
    all_processed_descrip = list()
    for sent in processed.sents:
        processed_description = list()
        #Split Doc into Sentences into Words
        words = [token.text for token in sent]
        #Lemmatize Words
        lemmatized_words = [token.lemma_ for token in sent]
        #Remove punctuation and numbers
        remove_punctuation = [token for token in lemmatized_words if (token not in punctuations and not re.match('\d+', token))]
        #Normalize Data
        normalized = [token.lower() for token in remove_punctuation]
        #Remove Stopwords
        removed_stopwords = [token for token in normalized if token not in STOP_WORDS]
        #append final analyzed sentence to preprocessed:
        processed_description.append(removed_stopwords)
        #collect all processed descriptions to one list:
        all_processed_descrip.append(processed_description)
   
    return all_processed_descrip
    

# preprocessed articles and headlines
processed_articles = df_clean["article"].apply(lambda x: f(x))
processed_headlines = df_clean["headline"].apply(lambda x: f(x))

In [129]:
#further cleaning:
# done: numbers erased (including time, things like 45-jähriger)
# done: string.punctuation added with Gedankenstrich, €
# done: add Gänsefüsschen (id 4)

# TO DO (alternatives)?:
# Stoppwörter durch 'unknown' ersetzen?
# Uhrzeiten ersetzen?
# berliner orte erkennen? 'berliner strasse' als eins lesen?
# Hundert Euro in 100 € umwandeln dann entfernen? (id 16)

In [130]:
#add new columns to matrix:
df_clean["processed_articles"] = processed_articles
df_clean["processed_headlines"] = processed_headlines

In [131]:
#remove unnecessary braces etc.:
df_clean["processed_articles"] = df_clean["processed_articles"].apply(np.ravel)
df_clean["processed_headlines"] = df_clean["processed_headlines"].apply(np.ravel)

#df_clean

In [132]:
#save results to file:
destination_path = 'processed_data/preprocessed_data.json'
df_clean.to_json(destination_path)

# Read in the analyzed data from json file 
(skips the above part of pipeline analysis which has been saved to a file)


In [3]:
#load data new from file to work with it from here:
data = pd.read_json('processed_data/preprocessed_data.json', orient='columns', encoding='utf-8')
#data.sort_index()

In [4]:
#combine headlines (times 2) with text so we have one column to be analyzed:
def combineTextToBeAnalyzed(processed_headline, processed_article):
    emptyList = []
    emptyList.append(processed_headline)
    emptyList.append(processed_headline)
    return emptyList + processed_article

#create new column:
newCol = data.apply(lambda x: combineTextToBeAnalyzed(x['processed_headlines'], x['processed_articles']), axis=1)

data['combinedText'] = newCol
#data.sort_index()

## Shuffle data

In [6]:
#shuffle data:
shuffled_data = data.sample(frac=1)
shuffled_data

Unnamed: 0,article,headline,processed_articles,processed_headlines,combinedText
374,Ein Mann und zwei Kinder wurden bei einem Unfa...,Drei Verletzte bei Unfall,"[[mann, kind, unfall, gestern, abend, buch, ve...","[verletzt, unfall]","[[verletzt, unfall], [verletzt, unfall], [mann..."
9377,In der vergangenen Nacht nahmen Polizeibeamte ...,Betrunkener mutmaßlicher Autodieb festgenommen,"[[vergangen, nacht, nehmen, polizeibeamte, aut...","[betrunkener, mutmaßlich, autodieb, festnehmen]","[[betrunkener, mutmaßlich, autodieb, festnehme..."
8062,Gestern Nachmittag ereignete sich in Moabit ei...,Zusammenstoß zwischen Polizeiwagen und Motorrad,"[[gestern, nachmittag, ereignen, moabit, verke...","[zusammenstoß, polizeiwagen, motorrad]","[[zusammenstoß, polizeiwagen, motorrad], [zusa..."
2842,Bei Verkehrsunfällen schwer verletzt: Zwei Fuß...,Bei Verkehrsunfällen schwer verletzt,"[[verkehrsunfällen, schwer, verletzen], [fußgä...","[verkehrsunfällen, schwer, verletzen]","[[verkehrsunfällen, schwer, verletzen], [verke..."
6394,Ein Passant alarmierte heute früh Polizei und ...,Auto brannte,"[[passant, alarmieren, früh, polizei, feuerweh...","[auto, brennen]","[[auto, brennen], [auto, brennen], [passant, a..."
9346,Ein Passant bemerkte gestern Abend eine verlet...,Seniorin beraubt,"[[passant, bemerken, gestern, abend, verletzen...","[seniorin, berauben]","[[seniorin, berauben], [seniorin, berauben], [..."
2530,Ein 49-jähriger Mann alarmierte heute Mittag d...,Fußgänger und Radfahrer geraten in Streit,"[[mann, alarmieren, mittag, polizei, imbiss, a...","[fußgänger, radfahrer, geraten, streit]","[[fußgänger, radfahrer, geraten, streit], [fuß..."
7918,Der Polizeiliche Staatsschutz des Landeskrimin...,Sachbeschädigungen mit politischem Inhalt,"[[polizeiliche, staatsschutz, landeskriminalam...","[sachbeschädigung, politisch, inhalt]","[[sachbeschädigung, politisch, inhalt], [sachb..."
4264,Auch an den Weihnachtsfeiertagen sind Trickbet...,Trickbetrüger „arbeiten“ auch Weihnachten,"[[weihnachtsfeiertagen, trickbetrüger, aktiv],...","[trickbetrüger, arbeiten, weihnachten]","[[trickbetrüger, arbeiten, weihnachten], [tric..."
2965,Heute Vormittag wurde eine Fußgängerin in Gato...,Fußgängerin angefahren und schwer verletzt,"[[vormittag, fußgängerin, gatow, verkehrsunfal...","[fußgängerin, anfahren, schwer, verletzen]","[[fußgängerin, anfahren, schwer, verletzen], [..."


## Indexierung der Wörter; Erstellen von Lookup-Tables

Nachbau der Ursprungssätze mit neuen Wort-Indizes; Gesamtanzahl der einzigartigen Wörter nötig


In [55]:
column_vals = data['combinedText'].values
#column_vals

In [83]:
#remove duplicates from a list of words to get a unique word corpus:
def remove_duplicates(li):
    set_of_words = []
    for word in li:
       if word not in set_of_words:
          set_of_words.append(word)
    return set_of_words

In [91]:
def count_words(words):
    dict = {}
    for word in words:
        if word not in dict:
            dict[word] = 1
        else:
            dict[word] += 1
    return dict

In [92]:
def get_descending_list_of_most_common_words(words):
    unique_words = count_words(words)
    return {k: v for k, v in sorted(unique_words.items(), key=lambda item: item[1], reverse=True)}    

In [93]:
#example:
woerter = ['hallo', 'hallo', 'tada', 'bald', 'tada', 'hallo', 'hallo']
res = get_descending_list_of_most_common_words(woerter)
res

{'hallo': 4, 'tada': 2, 'bald': 1}

In [98]:
sentences = [sent for pd_list in column_vals for sent in pd_list]
words = [word for sent in sentences for word in sent]


#vocabulary_size=None
if not vocabulary_size:
    unique_words = remove_duplicates(words)
    vocabulary_size = len(unique_words)


words_len = len(words)
print(words_len)
print(vocabulary_size)

#words

605936
605303


In [89]:
def build_lookup_tables(docs, vocabulary_size=None):
    '''
    :param docs: Spalte eines pandas-DF: data['preprocessed'].values
    :return sentences: Alle Sätze aller Dokumente in einer Liste
    :return words: Alle Wörter aller Dokumente in einer Liste
    :return word_count: Häufigkeiten der jeweiligen Wörter in allen Dokumenten
    :return word_2_index_dict: Lookup-Table
    :return index_2_word_dict: Lookup-Table
    :return sentences_as_index: Alle Sätze aller Dokumente mit Wortindex, anstatt des Wortes
    :return sentences_as_index_flattened: wie sentences_as_index, aber ohne subarrays (flattened)
    :return vocabulary_size: Anzahl der einzigartigen Wörter
    '''
    sentences = [sent for pd_list in docs for sent in pd_list]
    words = [word for sent in sentences for word in sent]
  
    if not vocabulary_size:
        # Anzahl der einzigartigen Wörter
        unique_words = remove_duplicates(words)
        vocabulary_size = len(unique_words)
 
    # Anzahl der Worthäufigkeiten
    #word_count = collections.Counter(words).most_common(vocabulary_size) #vonbasti
    #basti hatte word_count vom typ list, wir haben ein dict gerade(!)
    word_count = get_descending_list_of_most_common_words(words)
    #brauchen wir die folgende zeile überhaupt?
    #word_count.append(['UNK', -1]) # Flag für Wörter, die nicht oft vorkommen (nur wichtig bei begrenzter Vokabulargröße)
  
    # Lookup-Tables
    word_2_index_dict = {}
    for index, word in enumerate(word_count):
        word_2_index_dict[word[0]] = index
  
    index_2_word_dict = dict(zip(word_2_index_dict.values(), word_2_index_dict.keys()))
  
    # Wörter der Anfragetexte durch Indizes austauschen:
    sentences_as_index = []
    unknown_word_count = 0
    for sent in sentences:
        sent_index = []
        for word in sent:
            if word in word_2_index_dict:
                sent_index.append(word_2_index_dict[word])
            else:
                unknown_word_count += 1
        if sent_index:
            sentences_as_index.append(sent_index)
    word_count[-1][1] = unknown_word_count

    sentences_as_index_flattened = [word for sent in sentences_as_index for word in sent]
  
    return sentences, words, word_count, word_2_index_dict, index_2_word_dict, sentences_as_ind


In [99]:
 #sentences, words, word_count, word_2_index_dict, index_2_word_dict, sentences_as_index, sentences_as_index_flattened, vocabulary_size = build_lookup_tables(data['combinedText'].values)