## For Tweet data set

In [1]:
class TweetPreprocessor:
    def __init__(self, filename='data/merged_headlines_annos.compact.tsv'):
        print(' ------ Preprocssing Tweets corpus ------')
        self.file = open(filename, encoding='utf-8')

    def get_list_of_sentences_labels(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        labels, label, sentences, sentence, flat_labels = [], [], [], [], []
        for line in self.file:
            if line.startswith("#"):
                continue
            line = line.strip()
            splits = line.split("\t")
            if line.startswith("NONE"):
                if len(label)>0 and len(sentence)>0:
                    sentences.append(" ".join(sentence))
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            sentence.append(splits[1])
            label.append(splits[3])
            flat_labels.append(splits[3])
        labels = [list(map(lambda x: x if x != 'NONE' else 'O', i)) for i in labels]
        ners_vals = list(map(lambda x: x if x != 'NONE' else 'O', set(flat_labels)))
        print("Total number of tweets", len(sentences))
        print("Total number of ner tags in tweets", len(ners_vals))
        return sentences, labels, ners_vals

    def get_tag2idx_idx2tag(self, ners_vals):
        """
        return : dict of tag2idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        return : dict of idx2tag : inverted
        """
        tag2idx = {t: i for i, t in enumerate(sorted(ners_vals))}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag

In [2]:
filename='data/merged_headlines_annos.compact.tsv'

tweet_pre = TweetPreprocessor(filename)
sentences, labels, ners_vals = tweet_pre.get_list_of_sentences_labels()
tag2idx, idx2tag = tweet_pre.get_tag2idx_idx2tag(ners_vals)

 ------ Preprocssing Tweets corpus ------
Total number of tweets 8956
Total number of ner tags in tweets 63


In [3]:
print(len(labels))
print(len(sentences))
print(sentences[1])
print(labels[1])
print(tag2idx)
print(idx2tag)

8956
8956
https://t.co/ilxFAEWkoK
['B-URL']
{'B-AGE': 0, 'B-ART': 1, 'B-CARDINAL': 2, 'B-CREAT': 3, 'B-DATE': 4, 'B-DUR': 5, 'B-EVT': 6, 'B-FAC': 7, 'B-FRAC': 8, 'B-FREQ': 9, 'B-GPE': 10, 'B-LAN': 11, 'B-LAW': 12, 'B-LOC': 13, 'B-MED': 14, 'B-MISC': 15, 'B-MON': 16, 'B-NRP': 17, 'B-ORDINAL': 18, 'B-ORG': 19, 'B-PER': 20, 'B-PERC': 21, 'B-PRODUCT': 22, 'B-PROJ': 23, 'B-QUANT': 24, 'B-RATE': 25, 'B-SCORE': 26, 'B-SORD': 27, 'B-TIME': 28, 'B-TITLE': 29, 'B-URL': 30, 'I-AGE': 31, 'I-ART': 32, 'I-CARDINAL': 33, 'I-CREAT': 34, 'I-DATE': 35, 'I-DUR': 36, 'I-EVT': 37, 'I-FAC': 38, 'I-FRAC': 39, 'I-FREQ': 40, 'I-GPE': 41, 'I-LAN': 42, 'I-LAW': 43, 'I-LOC': 44, 'I-MED': 45, 'I-MISC': 46, 'I-MON': 47, 'I-NRP': 48, 'I-ORDINAL': 49, 'I-ORG': 50, 'I-PER': 51, 'I-PERC': 52, 'I-PRODUCT': 53, 'I-PROJ': 54, 'I-QUANT': 55, 'I-RATE': 56, 'I-SCORE': 57, 'I-SORD': 58, 'I-TIME': 59, 'I-TITLE': 60, 'I-URL': 61, 'O': 62}
{0: 'B-AGE', 1: 'B-ART', 2: 'B-CARDINAL', 3: 'B-CREAT', 4: 'B-DATE', 5: 'B-DUR', 6: 'B-EVT

## For ISW data set

In [4]:
import pandas as pd


class IswPreprocessor:
    def __init__(self, filename):
        print(' ------ Preprocssing ISW German corpus ------')
        self.row_isw_data = self.load_isw_tsv_file(filename)
        self.cleaned_isw_data = self.clean_isw_data()

    def load_isw_tsv_file(self, filename='data/test-full-isw-release.tsv'):
        isw_data = pd.read_csv(filename, quotechar='"',
                               delimiter="\t", skiprows=None)
        print("Total number of rows", len(isw_data))
        print("Total number of sentences", len(isw_data.fileid.unique()))
        return isw_data

    def clean_isw_data(self, selected_cols=[]):
        """
        :return: clean isw_data
        """
        # Keep only selected cols
        selected_cols = ['fileid', 'token', 'lemma', 'ontoNer']
        isw_set = self.row_isw_data[selected_cols]

        # Clean up incorrect rows  e.g. fileid -> total 82 of it
        isw_set = isw_set[isw_set.fileid != "fileid"]

        # Drop empty token
        isw_drop_non = isw_set[isw_set.lemma != "NONE"]
        isw_drop_non.reset_index(drop=True, inplace=True)

        # Replace NONE tag with "O"
        isw_drop_non['ontoNer'].replace(
            to_replace='NONE', value='O', inplace=True)
        return isw_drop_non

    def get_list_of_sentences(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        """
        data = self.cleaned_isw_data
        # Group the sentence with its fileid
        agg_func = lambda s: [(token, lem, ner) for token, lem, ner in zip(s["token"].values.tolist(),
                                                    s["lemma"].values.tolist(),
                                                    s["ontoNer"].values.tolist())]
        grouped = data.groupby("fileid").apply(agg_func)
        grouped_all = [s for s in grouped]

        sentences = [" ".join([s[0] for s in sent]) for sent in grouped_all]
        return sentences

    def get_list_of_nerlabels(self):
        """
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        data = self.cleaned_isw_data
        # Group the sentence with its fileid
        agg_func = lambda s: [(token, lem, ner) for token, lem, ner in zip(s["token"].values.tolist(),
                                                    s["lemma"].values.tolist(),
                                                    s["ontoNer"].values.tolist())]
        grouped = data.groupby("fileid").apply(agg_func)
        grouped_all = [s for s in grouped]

        labels = [[s[2] for s in label] for label in grouped_all]

        return labels

    def get_tag2idx_idx2tag(self):
        """
        return : dict of ner label with idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        """
        data = self.cleaned_isw_data
        # ners_vals : list of ner labels
        ners_vals = list(set(data["ontoNer"].values))
        # Set as dict {key:idx}
        tag2idx = {t: i for i, t in enumerate(sorted(ners_vals))}
        idx2tag = {i: t for t, i in tag2idx.items()}
        return tag2idx, idx2tag

In [5]:
FILE_NAME = "data/test-full-isw-release.tsv"
# Load preprocessed sentences, labels and tag2idx
pre = IswPreprocessor(filename=FILE_NAME)

sentences = pre.get_list_of_sentences()
labels = pre.get_list_of_nerlabels()
# Create dicts for mapping from labels to IDs and back
tag2idx, idx2tag = pre.get_tag2idx_idx2tag()

 ------ Preprocssing ISW German corpus ------
Total number of rows 300684
Total number of sentences 84


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [6]:
print(len(labels))
print(len(sentences))
print(sentences[1])
print(labels[1])
print(tag2idx)
print(idx2tag)

83
83
Aber Sie haben ja einen sehr schönen Beruf erlernt Sie haben doch reingeschrieben das war eigentlich Ihr Berufswunsch Säuglingsschwester Die Säuglings auf jeden Fall Und da hab ich auch Also ich hab viel zu wenig damit gemacht Das geb ich zu Aber das war eben weil wir nach Ecuador gegangen sind neunzehnfuffzig Ich war in der Hadassa im Spital in der Säuglings äh abteilung nicht dass ich überhaupt keine Ahnung hab Sicher hab ich nur ich hab keine Ahnung wer Ihnen auch erzählen warum denn das ist auch ein Beruf der sich ununterbrochen verändert So wie die Medizin als solche Es Ich hab nicht gewusst dass dieses Jahr also ob es jetzt achtundzeunzig ist das zu Ende geht oder neunundneunzig ist ein Jahr wo die Mütter lernen müssen Babys zu stillen Bis über ein Jahr oder wie haben sie neulich im im Fernsehen gesagt Mindestens ein Jahr Und dazu kann ich Ihnen sagen Also ich hab ein Enkelkind das ist jetzt zehn elf Monate und die stillt den wirklich Aber nicht wegen dem LACHEND Jahr sonde