<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#POS-tagging" data-toc-modified-id="POS-tagging-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>POS tagging</a></span></li><li><span><a href="#Chunking" data-toc-modified-id="Chunking-0.2"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Chunking</a></span></li><li><span><a href="#NER-tagging" data-toc-modified-id="NER-tagging-0.3"><span class="toc-item-num">0.3&nbsp;&nbsp;</span>NER tagging</a></span></li><li><span><a href="#Supersense-tagging" data-toc-modified-id="Supersense-tagging-0.4"><span class="toc-item-num">0.4&nbsp;&nbsp;</span>Supersense tagging</a></span></li></ul></li></ul></div>

In [1]:
from glob import glob
from collections import Counter
import pandas as pd

from IPython.display import display
import re
import os

In [2]:
! rm -rf ../data/processed/NER
! rm -rf ../data/processed/POS
! rm -rf ../data/processed/CHUNKING
! rm -rf ../data/processed/SUPERSENSE

In [3]:
DATA_DRIVE_PATH = "G:/AzureBackup/"
OUTPUT_DIRECTORY = "../data/processed/"
SEQ_SPLITTER = re.compile(r'\n\s*\n', flags=re.M)

In [4]:
def read_conll_data(filename, ncols=2):
    with open(filename, encoding='utf-8') as fp:
        for seq in SEQ_SPLITTER.split(fp.read()):
            seq_ = []
            for line in seq.splitlines():
                line = line.rstrip()
                if not line:
                    continue
                values = line.split("\t")
                if len(values) < ncols:
                    # Skip invalid lines
                    continue
                seq_.append(values)
            if not seq_:
                seq_ = []
                continue
            yield seq_
        
def sequence2str(seq):
    return "\n".join([
        "\t".join(line)
        for line in seq
    ])
        
def write_conll_data(sequences, filename):
    with open(filename, "w+", encoding='utf-8') as fp:
        for seq in sequences:
            seq_str = sequence2str(seq)
            print(seq_str, end="\n\n", file=fp)
    
def parse_label(label):
    return label.upper()

def clean_label(label):
    if not label:
        label = "O" 
    return parse_label(label)

DIMSUM_PREFIX_MAP = {"N": "NOUN", "V": "VERB"}
def clean_dimsum_label(label):
    if label:
        prefix, label = label.upper().split(".")
        prefix = DIMSUM_PREFIX_MAP.get(prefix, prefix)
        label = "{}.{}".format(prefix, label)
    return clean_label(label)

POS_MAP = {"PRT": "PART", ".": "PUNCT", "CONJ": "CCONJ", "VPP": "VBP"} # CONJ in Foster and DimSum should be CCONJ
def clean_pos_label(label):
    if label:
        label = POS_MAP.get(label, label)
    return clean_label(label)

def clean_ner_label(label):
    label = label.split(".")
    if len(label) == 2:
        label = "{}-{}".format(label[0].split("-")[0], '.'.join(label[1:]))
    else:
        label = ".".join(label)
    return clean_label(label)
    
def extract_token_labels(seq, token_idx=0, label_idx=1, parse_label=parse_label):
    return [(line[token_idx], parse_label(line[label_idx])) for line in seq]
    
def get_stats(sequences, token_idx=0, label_idx=1):
    stats = {
        "sequences": len(sequences),
        "total_tokens": 0
    }
    token_vocab = Counter()
    label_vocab = Counter()
    for seq in sequences:
        stats["total_tokens"] += len(seq)
        for row in seq:
            token_vocab[row[token_idx].upper()] += 1
            label_vocab[row[label_idx].upper()] += 1
    for key, vocab in [
        ("tokens", token_vocab),
        ("labels", label_vocab)
    ]:
        stats["{}_vocab".format(key)] = vocab
        stats["{}_unique".format(key)] = len(vocab.keys())
    return stats 
    
def process_file(input_files, output_file, token_idx=0, label_idx=1, parse_label=parse_label):
    if isinstance(input_files, str):
        input_files = [input_files]
    sequences = []
    for input_file in input_files:
        # replace datadrive path with current data drive
        input_file = input_file.replace("/datadrive/", "")
        input_file = os.path.join(DATA_DRIVE_PATH, input_file)
        for seq in read_conll_data(input_file):
            seq = extract_token_labels(seq, token_idx, label_idx, parse_label)
            sequences.append(seq)
    # get stats
    stats = get_stats(sequences)
    write_conll_data(sequences, output_file)
    return stats
    
def split_label(label):
    if label == "O":
        boundary = label
        label = None
    else:
        boundary, label = label.split("-", 1)
    return boundary, label
    
def gather_data(files, task_name, split_boundary=False, token_idx=0, label_idx=1, parse_label=parse_label, stats_data=None):
    if stats_data is None:
        stats_data = []
    for key, data_dict in files.items():
        for split_prefix, input_files in data_dict.items():
            #input_file = "/datadrive/Datasets/lowlands-data/ACL2014/crowdsourced_POS/data/foster-twitter.test"
            output_dir = os.path.join(OUTPUT_DIRECTORY, task_name, key)
            os.makedirs(output_dir, exist_ok=True)
            output_file = os.path.join(output_dir, "{}.conll".format(split_prefix))
            stats = process_file(input_files, output_file, token_idx, label_idx, parse_label)
            stats["labels"] = list(sorted(stats["labels_vocab"].keys()))
            if split_boundary:
                boundaries, labels = zip(*[split_label(label) for label in stats["labels_vocab"].keys()])
                stats["boundaries"] = list(set(boundaries))
                stats["labels"] = list(set([l for l in labels if l]))
                stats["labels_unique"] = len(stats["labels"])
            stats["data_key"] = key
            stats["split_prefix"] = split_prefix
            stats_data.append(stats)
            print("{:15s}\t{:5s}\t{}".format(key, split_prefix, output_file))
    return stats_data

def show_stats(stats_data, data_order=None):
    df = pd.DataFrame(stats_data).drop(["tokens_vocab", "labels_vocab"], 1).set_index(["data_key", "split_prefix"])
    with pd.option_context("display.max_colwidth", -1):
        display(df)
        print(df.to_latex())

## POS tagging

In [5]:
POS_FILES={
    "Owoputi": {
        "train": [
            "/datadrive/Datasets/Twitter/TweeboParser/ark-tweet-nlp-0.3.2/data/twpos-data-v0.3/oct27.splits/oct27.train",
            "/datadrive/Datasets/Twitter/TweeboParser/ark-tweet-nlp-0.3.2/data/twpos-data-v0.3/daily547.conll"
        ],
        "dev": "/datadrive/Datasets/Twitter/TweeboParser/ark-tweet-nlp-0.3.2/data/twpos-data-v0.3/oct27.splits/oct27.dev",
        "test": "/datadrive/Datasets/Twitter/TweeboParser/ark-tweet-nlp-0.3.2/data/twpos-data-v0.3/oct27.splits/oct27.test",
    },
    "Foster": {
        "test": "/datadrive/Datasets/lowlands-data/ACL2014/crowdsourced_POS/data/foster-twitter.test",
    },
    "TwitIE": {
        "dev": "/datadrive/Datasets/Twitter/twitter-pos-bootstrap/data/foster_dev.conll",
        "test": "/datadrive/Datasets/Twitter/twitter-pos-bootstrap/data/foster_eval.conll"
    },
    "Ritter": {
        "train": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/pos.cleaned.train.txt",
        "dev": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/pos.cleaned.dev.txt",
        "test": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/pos.cleaned.test.txt",
    },
    "lowlands": {
        "test": [
            "/datadrive/Datasets/lowlands-data/ACL2014/crowdsourced_POS/data/lowlands.test",
            "/datadrive/Datasets/lowlands-data/ACL2014/crowdsourced_POS/data/ritter.test",
            "/datadrive/Datasets/lowlands-data/ACL2014/crowdsourced_POS/data/gimpel.GOLD"
        ]
    },
    "Tweetbankv2": {
        "dev": "/datadrive/Datasets/Twitter/Tweebank/pos/en-ud-tweet-dev.txt",
        "train": "/datadrive/Datasets/Twitter/Tweebank/pos/en-ud-tweet-train.txt",
        "test": "/datadrive/Datasets/Twitter/Tweebank/pos/en-ud-tweet-test.txt",
    }
}

DIMSUM_FILES = {
    "DiMSUM2016": {
        "train": "/datadrive/Datasets/Twitter/dimsum-data/conll/dimsum16.train",
        "test": "/datadrive/Datasets/Twitter/dimsum-data/conll/dimsum16.test"
    }
}

In [6]:
stats_data = gather_data(POS_FILES, "POS", split_boundary=False, parse_label=clean_pos_label);
stats_data = gather_data(DIMSUM_FILES, "POS", split_boundary=False, token_idx=1, label_idx=3, parse_label=clean_pos_label, stats_data=stats_data);
show_stats(stats_data)

Owoputi        	train	../data/processed/POS\Owoputi\train.conll
Owoputi        	dev  	../data/processed/POS\Owoputi\dev.conll
Owoputi        	test 	../data/processed/POS\Owoputi\test.conll
Foster         	test 	../data/processed/POS\Foster\test.conll
TwitIE         	dev  	../data/processed/POS\TwitIE\dev.conll
TwitIE         	test 	../data/processed/POS\TwitIE\test.conll
Ritter         	train	../data/processed/POS\Ritter\train.conll
Ritter         	dev  	../data/processed/POS\Ritter\dev.conll
Ritter         	test 	../data/processed/POS\Ritter\test.conll
lowlands       	test 	../data/processed/POS\lowlands\test.conll
Tweetbankv2    	dev  	../data/processed/POS\Tweetbankv2\dev.conll
Tweetbankv2    	train	../data/processed/POS\Tweetbankv2\train.conll
Tweetbankv2    	test 	../data/processed/POS\Tweetbankv2\test.conll
DiMSUM2016     	train	../data/processed/POS\DiMSUM2016\train.conll
DiMSUM2016     	test 	../data/processed/POS\DiMSUM2016\test.conll


Unnamed: 0_level_0,Unnamed: 1_level_0,labels,labels_unique,sequences,tokens_unique,total_tokens
data_key,split_prefix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Owoputi,train,"[!, #, $, &, ,, @, A, D, E, G, L, M, N, O, P, R, S, T, U, V, X, Y, Z, ^, ~]",25,1547,6572,22326
Owoputi,dev,"[!, #, $, &, ,, @, A, D, E, G, L, N, O, P, R, S, T, U, V, X, Z, ^, ~]",23,327,2036,4823
Owoputi,test,"[!, #, $, &, ,, @, A, D, E, G, L, N, O, P, R, S, T, U, V, X, Z, ^, ~]",23,500,2754,7152
Foster,test,"[ADJ, ADP, ADV, CCONJ, DET, NOUN, NUM, PART, PRON, PUNCT, VERB, X]",12,250,1068,2841
TwitIE,dev,"['', (, ), ,, :, CC, CD, DT, FW, HT, IN, JJ, JJR, JJS, MD, NN, NNP, NNPS, NNS, PDT, POS, PRP, PRP$, PUNCT, RB, RBR, RBS, RP, RT, SYM, TO, UH, URL, USR, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WRB]",43,269,1229,2998
TwitIE,test,"['', (, ), ,, :, CC, CD, DT, EX, FW, HT, IN, JJ, JJR, JJS, MD, NN, NNP, NNPS, NNS, PDT, POS, PRP, PRP$, PUNCT, RB, RBR, RBS, RP, RT, SYM, TO, UH, URL, USR, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WRB]",44,250,1182,2841
Ritter,train,"['', (, ), ,, :, CC, CD, DT, EX, FW, HT, IN, JJ, JJR, JJS, LS, MD, NN, NNP, NNPS, NNS, O, POS, PRP, PRP$, PUNCT, RB, RBR, RBS, RP, RT, SYM, TO, UH, URL, USR, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WRB]",45,632,3539,12196
Ritter,dev,"['', (, ), ,, :, CC, CD, DT, HT, IN, JJ, JJR, JJS, MD, NN, NNP, NNS, POS, PRP, PRP$, PUNCT, RB, RBR, RP, RT, TO, UH, URL, USR, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WRB]",38,71,695,1362
Ritter,test,"['', (, ), ,, :, CC, CD, DT, EX, HT, IN, JJ, JJR, JJS, MD, NN, NNP, NNPS, NNS, PDT, POS, PRP, PRP$, PUNCT, RB, RBR, RP, RT, SYM, TO, UH, URL, USR, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WRB]",41,84,735,1627
lowlands,test,"[ADJ, ADP, ADV, CCONJ, DET, NOUN, NUM, PART, PRON, PUNCT, VERB, X]",12,1318,4805,19794


\begin{tabular}{lllrrrr}
\toprule
           &      &                                                                                                                                                                                                    labels &  labels\_unique &  sequences &  tokens\_unique &  total\_tokens \\
data\_key & split\_prefix &                                                                                                                                                                                                           &                &            &                &               \\
\midrule
Owoputi & train &  [!, \#, \$, \&, ,, @, A, D, E, G, L, M, N, O, P, R, S, T, U, V, X, Y, Z, \textasciicircum , \textasciitilde ] &  25 &  1547 &  6572 &  22326 \\
           & dev &  [!, \#, \$, \&, ,, @, A, D, E, G, L, N, O, P, R, S, T, U, V, X, Z, \textasciicircum , \textasciitilde ] &  23 &  327 &  2036 &  4823 \\
           & test &  [!, \#, \$, \&, ,, @, A, D, E,

## Chunking

In [7]:
CHUNKING_FILES = {
    "Ritter": {
        "train": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/chunk.train.conll",
        "dev": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/chunk.dev.conll",
        "test": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/chunk.test.conll",
    }
}

In [8]:
stats_data = gather_data(CHUNKING_FILES, "CHUNKING", split_boundary=True);
show_stats(stats_data)

Ritter         	train	../data/processed/CHUNKING\Ritter\train.conll
Ritter         	dev  	../data/processed/CHUNKING\Ritter\dev.conll
Ritter         	test 	../data/processed/CHUNKING\Ritter\test.conll


Unnamed: 0_level_0,Unnamed: 1_level_0,boundaries,labels,labels_unique,sequences,tokens_unique,total_tokens
data_key,split_prefix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ritter,train,"[I, B, O]","[ADJP, PP, INTJ, ADVP, PRT, NP, SBAR, VP, CONJP]",9,551,3158,10584
Ritter,dev,"[I, B, O]","[ADJP, PP, INTJ, ADVP, PRT, NP, SBAR, VP]",8,118,994,2317
Ritter,test,"[I, B, O]","[ADJP, PP, INTJ, ADVP, PRT, NP, SBAR, VP]",8,119,988,2310


\begin{tabular}{llllrrrr}
\toprule
       &      & boundaries &                                            labels &  labels\_unique &  sequences &  tokens\_unique &  total\_tokens \\
data\_key & split\_prefix &            &                                                   &                &            &                &               \\
\midrule
Ritter & train &  [I, B, O] &  [ADJP, PP, INTJ, ADVP, PRT, NP, SBAR, VP, CONJP] &  9 &  551 &  3158 &  10584 \\
       & dev &  [I, B, O] &  [ADJP, PP, INTJ, ADVP, PRT, NP, SBAR, VP] &  8 &  118 &  994 &  2317 \\
       & test &  [I, B, O] &  [ADJP, PP, INTJ, ADVP, PRT, NP, SBAR, VP] &  8 &  119 &  988 &  2310 \\
\bottomrule
\end{tabular}



## NER tagging

In [9]:
NER_FILES={
    "Finin": {
        "train": "/datadrive/Datasets/lowlands-data/LREC2014/twitter_ner/data/finin.train.tsv",
        "test": [
            "/datadrive/Datasets/lowlands-data/LREC2014/twitter_ner/data/finin.test.tsv.utf8",
            "/datadrive/Datasets/lowlands-data/LREC2014/twitter_ner/data/ritter.test.tsv"
        ],
    },
    "Hege": {
        "test": "/datadrive/Datasets/lowlands-data/LREC2014/twitter_ner/data/hege.test.tsv",
    },
    "Ritter": {
        "train": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/ner.train.txt",
        "dev": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/ner.dev.txt",
        "test": "/datadrive/Datasets/Twitter/RitterNER/twitter_processed/ner.test.txt",
    },
    "YODIE": {
        "train": "/datadrive/Datasets/Twitter/YODIE/data/training.conll",
        "test": "/datadrive/Datasets/Twitter/YODIE/data/testing.conll"
    },
    "WNUT2016": {
        "train": "/datadrive/Codes/multi-task-nlp-keras/data/WNUT_NER/train.tsv",
        "test": "/datadrive/Codes/multi-task-nlp-keras/data/WNUT_NER/test.tsv",
        "dev": "/datadrive/Codes/multi-task-nlp-keras/data/WNUT_NER/dev.tsv",
    },
    "WNUT2017": {
        "train": "/datadrive/Codes/multi-task-nlp-keras/data/WNUT_2017/wnut17train.conll",
        "dev": "/datadrive/Codes/multi-task-nlp-keras/data/WNUT_2017/emerging.dev.conll",
        "test": "/datadrive/Codes/multi-task-nlp-keras/data/WNUT_2017/emerging.test.annotated",
    },
    "MSM2013": {
        "train": "/datadrive/Datasets/Twitter/MSM2013/data/msm2013-ce_challenge_gs/TweetsTrainingSetCH.tsv.conll",
        "test": "/datadrive/Datasets/Twitter/MSM2013/data/msm2013-ce_challenge_gs/goldStandard.tsv.conll",
    },
    "NEEL2016": {
        "train": "/datadrive/Datasets/Twitter/microposts-NEEL/processed/2016/microposts2016-neel-training_neel.gs.conll",
        "dev": "/datadrive/Datasets/Twitter/microposts-NEEL/processed/2016/microposts2016-neel-dev_neel.gs.conll",
        "test": "/datadrive/Datasets/Twitter/microposts-NEEL/processed/2016/microposts2016-neel-test_neel.gs.conll",
    },
    "BROAD": {
        "train": "/datadrive/Datasets/Twitter/broad_twitter_corpus/data_splits/train.conll",
        "dev": "/datadrive/Datasets/Twitter/broad_twitter_corpus/data_splits/dev.conll",
        "test": "/datadrive/Datasets/Twitter/broad_twitter_corpus/data_splits/test.conll",
        
    },
    "MultiModal": {
        "train": "/datadrive/Datasets/Twitter/NERmultimodal/data/train.conll",
        "dev": "/datadrive/Datasets/Twitter/NERmultimodal/data/dev.conll",
        "test": "/datadrive/Datasets/Twitter/NERmultimodal/data/test.conll",
    }
}

In [10]:
stats_data = gather_data(NER_FILES, "NER", split_boundary=True, parse_label=clean_ner_label);
show_stats(stats_data)

Finin          	train	../data/processed/NER\Finin\train.conll
Finin          	test 	../data/processed/NER\Finin\test.conll
Hege           	test 	../data/processed/NER\Hege\test.conll
Ritter         	train	../data/processed/NER\Ritter\train.conll
Ritter         	dev  	../data/processed/NER\Ritter\dev.conll
Ritter         	test 	../data/processed/NER\Ritter\test.conll
YODIE          	train	../data/processed/NER\YODIE\train.conll
YODIE          	test 	../data/processed/NER\YODIE\test.conll
WNUT2016       	train	../data/processed/NER\WNUT2016\train.conll
WNUT2016       	test 	../data/processed/NER\WNUT2016\test.conll
WNUT2016       	dev  	../data/processed/NER\WNUT2016\dev.conll
WNUT2017       	train	../data/processed/NER\WNUT2017\train.conll
WNUT2017       	dev  	../data/processed/NER\WNUT2017\dev.conll
WNUT2017       	test 	../data/processed/NER\WNUT2017\test.conll
MSM2013        	train	../data/processed/NER\MSM2013\train.conll
MSM2013        	test 	../data/processed/NER\MSM2013\test.con

Unnamed: 0_level_0,Unnamed: 1_level_0,boundaries,labels,labels_unique,sequences,tokens_unique,total_tokens
data_key,split_prefix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Finin,train,"[I, B, O]","[LOC, PER, ORG]",3,10000,19663,172188
Finin,test,"[I, B, O]","[LOC, PER, ORG]",3,5369,13027,97525
Hege,test,"[I, B, O]","[LOC, PER, ORG]",3,1545,4552,20664
Ritter,train,"[I, B, O]","[COMPANY, OTHER, FACILITY, PERSON, MOVIE, MUSICARTIST, GEO-LOC, TVSHOW, PRODUCT, SPORTSTEAM]",10,1900,7695,36936
Ritter,dev,"[I, B, O]","[COMPANY, OTHER, PERSON, FACILITY, MOVIE, MUSICARTIST, GEO-LOC, TVSHOW, PRODUCT, SPORTSTEAM]",10,240,1731,4612
Ritter,test,"[I, B, O]","[COMPANY, OTHER, PERSON, FACILITY, MOVIE, MUSICARTIST, GEO-LOC, TVSHOW, PRODUCT, SPORTSTEAM]",10,254,1776,4921
YODIE,train,"[I, B, O]","[COMPANY, OTHER, PERSON, LOCATION, FACILITY, MOVIE, MUSICARTIST, GEO-LOC, UNK, TVSHOW, PRODUCT, SPORTSTEAM, ORGANIZATION]",13,396,2554,7905
YODIE,test,"[I, B, O]","[COMPANY, OTHER, FACILITY, LOCATION, PERSON, MOVIE, MUSICARTIST, GEO-LOC, UNK, TVSHOW, PRODUCT, SPORTSTEAM, ORGANIZATION]",13,397,2578,8032
WNUT2016,train,"[I, B, O]","[COMPANY, OTHER, FACILITY, PERSON, MOVIE, MUSICARTIST, GEO-LOC, TVSHOW, PRODUCT, SPORTSTEAM]",10,2394,9068,46469
WNUT2016,test,"[I, B, O]","[COMPANY, OTHER, PERSON, FACILITY, MOVIE, MUSICARTIST, GEO-LOC, TVSHOW, PRODUCT, SPORTSTEAM]",10,3850,16012,61908


\begin{tabular}{llllrrrr}
\toprule
           &      & boundaries &                                                                                                                     labels &  labels\_unique &  sequences &  tokens\_unique &  total\_tokens \\
data\_key & split\_prefix &            &                                                                                                                            &                &            &                &               \\
\midrule
Finin & train &  [I, B, O] &  [LOC, PER, ORG] &  3 &  10000 &  19663 &  172188 \\
           & test &  [I, B, O] &  [LOC, PER, ORG] &  3 &  5369 &  13027 &  97525 \\
Hege & test &  [I, B, O] &  [LOC, PER, ORG] &  3 &  1545 &  4552 &  20664 \\
Ritter & train &  [I, B, O] &  [COMPANY, OTHER, FACILITY, PERSON, MOVIE, MUSICARTIST, GEO-LOC, TVSHOW, PRODUCT, SPORTSTEAM] &  10 &  1900 &  7695 &  36936 \\
           & dev &  [I, B, O] &  [COMPANY, OTHER, PERSON, FACILITY, MOVIE, MUSICARTIST, GEO-LOC, TVS

## Supersense tagging

In [11]:
SUPERSENSE_TAGGING_FILES={
    "Ritter": {
        "train": "/datadrive/Datasets/Twitter/supersense-data-twitter/ritter-train.tsv",
        "dev": "/datadrive/Datasets/Twitter/supersense-data-twitter/ritter-dev.tsv",
        "test": "/datadrive/Datasets/Twitter/supersense-data-twitter/ritter-eval.tsv"
    },
    "Johannsen2014": {
        "test": "/datadrive/Datasets/Twitter/supersense-data-twitter/in-house-eval.tsv"
    }
}

# This dataset has some tagging issues as the super sense tags are assigned to multi-word units which are not contigous
DIMSUM_FILES = {
    "DiMSUM2016": {
        "train": "/datadrive/Datasets/Twitter/dimsum-data/conll/dimsum16.train",
        "test": "/datadrive/Datasets/Twitter/dimsum-data/conll/dimsum16.test"
    }
}

In [12]:
stats_data = gather_data(SUPERSENSE_TAGGING_FILES, "SUPERSENSE", split_boundary=True, label_idx=2);
show_stats(stats_data)

Ritter         	train	../data/processed/SUPERSENSE\Ritter\train.conll
Ritter         	dev  	../data/processed/SUPERSENSE\Ritter\dev.conll
Ritter         	test 	../data/processed/SUPERSENSE\Ritter\test.conll
Johannsen2014  	test 	../data/processed/SUPERSENSE\Johannsen2014\test.conll


Unnamed: 0_level_0,Unnamed: 1_level_0,boundaries,labels,labels_unique,sequences,tokens_unique,total_tokens
data_key,split_prefix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ritter,train,"[I, B, O]","[NOUN.BODY, NOUN.STATE, NOUN.ARTIFACT, NOUN.ATTRIBUTE, NOUN.FOOD, NOUN.TOPS, NOUN.COGNITION, NOUN.EVENT, NOUN.OBJECT, NOUN.MOTIVE, NOUN.GROUP, VERB.COMMUNICATION, NOUN.PHENOMENON, VERB.POSSESSION, VERB.COMPETITION, NOUN.POSSESSION, NOUN.FEELING, VERB.SOCIAL, NOUN.ANIMAL, VERB.CREATION, VERB.CONSUMPTION, VERB.PERCEPTION, VERB.CONTACT, VERB.WEATHER, VERB.BODY, NOUN.LOCATION, NOUN.QUANTITY, NOUN.SUBSTANCE, NOUN.RELATION, NOUN.TIME, NOUN.PERSON, VERB.COGNITION, VERB.EMOTION, NOUN.PLANT, VERB.STATIVE, VERB.MOTION, NOUN.COMMUNICATION, NOUN.PROCESS, NOUN.ACT, VERB.CHANGE]",40,551,3174,10652
Ritter,dev,"[I, B, O]","[NOUN.BODY, NOUN.STATE, NOUN.ARTIFACT, NOUN.ATTRIBUTE, NOUN.FOOD, NOUN.COGNITION, NOUN.EVENT, NOUN.OBJECT, NOUN.MOTIVE, NOUN.GROUP, VERB.COMMUNICATION, NOUN.PHENOMENON, VERB.COMPETITION, VERB.POSSESSION, NOUN.POSSESSION, NOUN.FEELING, VERB.SOCIAL, NOUN.ANIMAL, VERB.CREATION, VERB.CONSUMPTION, VERB.PERCEPTION, VERB.CONTACT, VERB.BODY, NOUN.LOCATION, NOUN.QUANTITY, NOUN.SUBSTANCE, NOUN.RELATION, NOUN.TIME, VERB.COGNITION, NOUN.PERSON, VERB.EMOTION, NOUN.PLANT, VERB.STATIVE, VERB.MOTION, NOUN.COMMUNICATION, NOUN.ACT, VERB.CHANGE]",37,118,1014,2242
Ritter,test,"[I, B, O]","[NOUN.BODY, NOUN.STATE, NOUN.ARTIFACT, NOUN.ATTRIBUTE, NOUN.FOOD, NOUN.TOPS, NOUN.COGNITION, NOUN.EVENT, NOUN.OBJECT, NOUN.MOTIVE, NOUN.SHAPE, NOUN.GROUP, VERB.COMMUNICATION, NOUN.PHENOMENON, VERB.POSSESSION, NOUN.FEELING, NOUN.POSSESSION, VERB.COMPETITION, VERB.SOCIAL, NOUN.ANIMAL, VERB.CREATION, VERB.CONSUMPTION, VERB.PERCEPTION, VERB.CONTACT, VERB.WEATHER, VERB.BODY, NOUN.LOCATION, NOUN.QUANTITY, NOUN.SUBSTANCE, NOUN.RELATION, NOUN.TIME, NOUN.PERSON, VERB.COGNITION, VERB.EMOTION, VERB.STATIVE, VERB.MOTION, NOUN.COMMUNICATION, NOUN.PROCESS, NOUN.ACT, VERB.CHANGE]",40,118,1011,2291
Johannsen2014,test,"[I, B, O]","[NOUN.BODY, NOUN.STATE, NOUN.ARTIFACT, NOUN.ATTRIBUTE, NOUN.FOOD, NOUN.COGNITION, NOUN.EVENT, NOUN.OBJECT, NOUN.SHAPE, NOUN.GROUP, VERB.COMMUNICATION, NOUN.PHENOMENON, VERB.COMPETITION, VERB.POSSESSION, NOUN.FEELING, NOUN.POSSESSION, VERB.SOCIAL, NOUN.ANIMAL, VERB.CREATION, VERB.CONSUMPTION, VERB.PERCEPTION, VERB.CONTACT, VERB.BODY, NOUN.LOCATION, NOUN.QUANTITY, NOUN.SUBSTANCE, NOUN.RELATION, NOUN.TIME, NOUN.PERSON, VERB.COGNITION, VERB.EMOTION, VERB.STATIVE, VERB.MOTION, NOUN.COMMUNICATION, NOUN.PROCESS, NOUN.ACT, VERB.CHANGE]",37,200,1249,3064


\begin{tabular}{llllrrrr}
\toprule
              &      & boundaries &                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       labels &  labels\_unique &  sequences &  tokens\_unique &  total\_tokens \\
data\_key & split\_prefix &            &                                                                                                                                                                                                                                                       