In [28]:
import pandas as pd


class IswPreprocessor:
    def __init__(self, filename):
        print(' ------ Preprocssing ISW German corpus ------')
        self.row_isw_data = self.load_isw_tsv_file(filename)
        self.cleaned_isw_data = self.clean_isw_data()

    def load_isw_tsv_file(self, filename='data/test-full-isw-release.tsv'):
        isw_data = pd.read_csv(filename, quotechar='"',
                               delimiter="\t", skiprows=None)
        return isw_data

    def clean_isw_data(self, selected_cols=[]):
        """
        :return: clean isw_data
        """
        # Keep only selected cols
        selected_cols = ['fileid', 'token', 'lemma', 'ontoNer']
        isw_set = self.row_isw_data[selected_cols]

        # Clean up incorrect rows  e.g. fileid -> total 82 of it
        isw_set = isw_set[isw_set.fileid != "fileid"]

        # Drop empty token
        isw_drop_non = isw_set[isw_set.lemma != "NONE"]
        isw_drop_non.reset_index(drop=True, inplace=True)

        # Replace NONE tag with "O"
        isw_drop_non['ontoNer'].replace(
            to_replace='NONE', value='O', inplace=True)

        print("Total number of sentences", len(isw_drop_non.fileid.unique()))
        print("Total number of ner tags in isw", len(list(set(isw_drop_non["ontoNer"].values))))
        return isw_drop_non

    def get_list_of_sentences_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        data = self.cleaned_isw_data
        # Group the sentence with its fileid
        agg_func = lambda s: [(token, lem, ner) for token, lem, ner in zip(s["token"].values.tolist(),
                                                    s["lemma"].values.tolist(),
                                                    s["ontoNer"].values.tolist())]
        grouped = data.groupby("fileid").apply(agg_func)
        grouped_all = [s for s in grouped]

        sentences = [" ".join([s[0] for s in sent]) for sent in grouped_all]
        labels = [[s[2] for s in label] for label in grouped_all]
        return sentences, labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of ner label with idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        """
        data = self.cleaned_isw_data
        # ners_vals : list of ner labels
        ners_vals = list(set(data["ontoNer"].values))
        # Set as dict {key:idx}
        tag2idx = {t: i for i, t in enumerate(sorted(ners_vals))}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag


class TweetPreprocessor:
    def __init__(self, filename='data/merged_headlines_annos.compact.tsv'):
        print(' ------ Preprocssing Tweets corpus ------')
        self.file = open(filename, encoding='utf-8')
        self.ners_vals=[]

    def get_list_of_sentences_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        labels, label, sentences, sentence, flat_labels = [], [], [], [], []
        for line in self.file:
            if line.startswith("#"):
                continue
            line = line.strip()
            splits = line.split("\t")
            if line.startswith("NONE"):
                if len(label)>0 and len(sentence)>0:
                    sentences.append(" ".join(sentence))
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            sentence.append(splits[1])
            label.append(splits[3])
            flat_labels.append(splits[3])
        
        if len(label)>0 and len(sentence)>0:
            sentences.append(" ".join(sentence))
            labels.append(label)
            
        labels = [list(map(lambda x: x if x != 'NONE' else 'O', i)) for i in labels]
        self.ners_vals = list(map(lambda x: x if x != 'NONE' else 'O', set(flat_labels)))
        print("Total number of tweets", len(sentences))
        print("Total number of ner tags in tweets", len(self.ners_vals))

        return sentences, labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of tag2idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        return : dict of idx2tag : inverted
        """
        tag2idx = {t: i for i, t in enumerate(sorted(self.ners_vals))}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag




# filename = 'data/test-full-isw-release.tsv'
# pre = IswPreprocessor(filename)

# sentences = pre.get_list_of_sentences()
# labels = pre.get_list_of_nerlabels()

# print(labels[0])


## For Tweet data set

In [29]:
filename='data/merged_headlines_annos.compact.tsv'

tweet_pre = TweetPreprocessor(filename)
sentences, labels = tweet_pre.get_list_of_sentences_labels()
tag2idx, idx2tag = tweet_pre.get_tag2idx_idx2tag()

print(len(labels))
print(len(sentences))
print(sentences[-1])
print(labels[-1])
print(tag2idx)
print(idx2tag)

 ------ Preprocssing Tweets corpus ------
Total number of tweets 8957
Total number of ner tags in tweets 63
8957
8957
Moin , Axel !
['O', 'O', 'B-PER', 'O']
{'B-AGE': 0, 'B-ART': 1, 'B-CARDINAL': 2, 'B-CREAT': 3, 'B-DATE': 4, 'B-DUR': 5, 'B-EVT': 6, 'B-FAC': 7, 'B-FRAC': 8, 'B-FREQ': 9, 'B-GPE': 10, 'B-LAN': 11, 'B-LAW': 12, 'B-LOC': 13, 'B-MED': 14, 'B-MISC': 15, 'B-MON': 16, 'B-NRP': 17, 'B-ORDINAL': 18, 'B-ORG': 19, 'B-PER': 20, 'B-PERC': 21, 'B-PRODUCT': 22, 'B-PROJ': 23, 'B-QUANT': 24, 'B-RATE': 25, 'B-SCORE': 26, 'B-SORD': 27, 'B-TIME': 28, 'B-TITLE': 29, 'B-URL': 30, 'I-AGE': 31, 'I-ART': 32, 'I-CARDINAL': 33, 'I-CREAT': 34, 'I-DATE': 35, 'I-DUR': 36, 'I-EVT': 37, 'I-FAC': 38, 'I-FRAC': 39, 'I-FREQ': 40, 'I-GPE': 41, 'I-LAN': 42, 'I-LAW': 43, 'I-LOC': 44, 'I-MED': 45, 'I-MISC': 46, 'I-MON': 47, 'I-NRP': 48, 'I-ORDINAL': 49, 'I-ORG': 50, 'I-PER': 51, 'I-PERC': 52, 'I-PRODUCT': 53, 'I-PROJ': 54, 'I-QUANT': 55, 'I-RATE': 56, 'I-SCORE': 57, 'I-SORD': 58, 'I-TIME': 59, 'I-TITLE': 60,

## For ISW data set

In [30]:
FILE_NAME = "data/test-full-isw-release.tsv"
# Load preprocessed sentences, labels and tag2idx
pre = IswPreprocessor(filename=FILE_NAME)

sentences, labels = pre.get_list_of_sentences_labels()
# Create dicts for mapping from labels to IDs and back
tag2idx, idx2tag = pre.get_tag2idx_idx2tag()

print(len(labels))
print(len(sentences))
print(sentences[-1])
print(labels[-1])
print(tag2idx)
print(idx2tag)

 ------ Preprocssing ISW German corpus ------
Total number of sentences 83
Total number of ner tags in isw 60
83
83
Der wirtschaftliche war es wohl auch nicht dass man Neid empfunden hat Das war auch etwas ganz Modernes Die Juden in Europa bis zum 18 Jahrhundert waren in Ghettos überall auch in Österreich Ich glaube von Joseph und unter Maria Theresia waren auch Vertreibungen dieser oder jener Gesellschaftlich haben die Juden eigentlich erst Stellung bekommen am Ende des 19 und Anfang des 20 Jahrhunderts vor dem hat der ökonomische Faktor ich glaube überhaupt keine Rolle gespielt Na ja sie sie waren z T eben äh äh Finanziers für für Herrscher und Das waren sehr wenige Das waren wenige ja Aber aber wenn die ihre Schulden dann doch einmal nicht bezahlen konnten dann hat man das gerne Ja aber das waren wenige Nicht jeder Herrscher hat einen Juden gehabt Viele haben ihn anscheinend gehabt aber nicht alle Aber ich mein in größeren Massen kann man sagen dass der ökonomische Faktor erst im 20

In [48]:
filename='data/NER-de-train.padded_reannotated.csv'
pad_data = pd.read_csv(filename, quotechar='"', delimiter="\t", engine='python', encoding='utf-8', error_bad_lines=False)

Skipping line 251608: unexpected end of data


In [49]:
pad_data

Unnamed: 0,runningid,tokid,token,inner,outer,oNer1,ontoNer,COMMENT
0,1,#,n-tv.de vom 26.02.2005,[2005-02-26],,[2005-02-26],XXX,
1,2,1,Schartau,B-PER,O,B-PER,B-PER,
2,3,2,sagte,O,O,O,NONE,
3,4,3,dem,O,O,O,NONE,
4,5,4,\tO\tO\tO\tNONE\t\n6\t5\tTagesspiegel\tB-ORG\t...,O,O,O,NONE,
...,...,...,...,...,...,...,...,...
251601,500470,36,no,O,O,O,NONE,
251602,500471,37,Email,O,O,O,NONE,
251603,500472,38,has,O,O,O,NONE,
251604,500473,39,gone,O,O,O,NONE,
