In [1]:
from glob import glob
from collections import Counter
import pandas as pd

from IPython.display import display
import re
import os

from itertools import chain

from SocialMediaIE.data.tokenization import tokenize

In [2]:
# Task identifiers. get_files() interpolates each key verbatim into the path
# ../data/processed/<task_key>/, so the spelling here (including "UNCERTAINITY")
# has to match the directory names on disk.
TASK_KEYS=["SENTIMENT", "ABUSIVE", "UNCERTAINITY"]
def get_files(task_key):
    """Return the processed JSON files of one task in a stable, sorted order.

    Files are looked up as ``../data/processed/<task_key>/<dir>/<name>.json``,
    relative to the current working directory.

    Args:
        task_key: Name of the task directory, e.g. ``"SENTIMENT"``.

    Returns:
        Sorted list of OS-normalized path strings; empty when nothing matches.
    """
    # glob() is called without recursive=True, so "**" behaves exactly like "*"
    # and matches a single directory level below the task directory.
    pattern = f"../data/processed/{task_key}/**/*.json"
    # glob() yields matches in filesystem-dependent order; sorting makes
    # positional access (e.g. files[0]) and table row order reproducible.
    return sorted(os.path.normpath(path) for path in glob(pattern))

In [3]:
# Files of the task at index 1 of TASK_KEYS ("ABUSIVE"); used below to try
# get_file_stats() on a single file before running it over every task.
task_files = get_files(TASK_KEYS[1])

In [4]:
def get_file_stats(file):
    """Compute label and token statistics for one JSON-lines data file.

    Args:
        file: Path to a JSON-lines file whose records provide a ``text`` and a
            ``label`` field.

    Returns:
        dict with the keys
        ``"labels"`` (distinct labels, sorted and joined with ``", "``),
        ``"vocab"`` (number of distinct lower-cased tokens),
        ``"tokens"`` (total number of tokens) and
        ``"tweets"`` (number of records in the file).
    """
    df = pd.read_json(file, lines=True)
    # Lower-case before tokenizing so the vocabulary count is case-insensitive.
    tokenized = df["text"].str.lower().apply(tokenize)
    # from_iterable consumes the rows lazily instead of unpacking every row
    # into one huge positional-argument list as chain(*rows) would.
    token_counts = Counter(chain.from_iterable(tokenized))
    # Labels may be parsed as non-strings (e.g. integers); str() keeps the
    # join below from raising TypeError while leaving string labels untouched.
    labels = sorted(str(label) for label in df["label"].unique())
    stats = {
        "labels": ", ".join(labels),
        "vocab": len(token_counts),
        "tokens": sum(token_counts.values()),
        "tweets": df.shape[0]
    }
    return stats

In [5]:
# Sanity check: compute the statistics for the first file in task_files and
# show them as the cell output (a bare last expression is displayed).
stats = get_file_stats(task_files[0])
stats

{'labels': 'abusive, hateful, normal, spam',
 'vocab': 22529,
 'tokens': 102534,
 'tweets': 4663}

In [6]:
def get_task_stats(task_key):
    """Collect, display and return per-file statistics for every file of a task.

    For each file returned by ``get_files(task_key)`` the statistics from
    ``get_file_stats`` are extended with the task key, the dataset name (parent
    directory) and the split name (file name up to the first ``"."``).

    Side effects: prints every processed path, prints the number of splits per
    (labels, data) group, displays the statistics table indexed by
    (data, split) and prints that table as LaTeX.

    Args:
        task_key: Task directory name, e.g. one of ``TASK_KEYS``.

    Returns:
        pandas.DataFrame with one row per file and the columns ``labels``,
        ``vocab``, ``tokens``, ``tweets``, ``task``, ``split`` and ``data``.
        The frame is empty (same columns) when no file is found.
    """
    records = []
    for task_file in get_files(task_key):
        print(task_file)
        file_stats = get_file_stats(task_file)
        # Paths are OS-normalized by get_files(), so splitting on os.sep
        # yields [..., <data>, <split>.json].
        *_, data, split_file = task_file.split(os.sep)
        file_stats["task"] = task_key
        file_stats["split"] = split_file.split(".")[0]
        file_stats["data"] = data
        records.append(file_stats)
    if not records:
        # Without this guard the groupby below fails with an unhelpful
        # KeyError because an empty frame has no "labels"/"data" columns.
        print(f"No files found for task {task_key!r}")
        return pd.DataFrame(
            columns=["labels", "vocab", "tokens", "tweets", "task", "split", "data"]
        )
    df = pd.DataFrame.from_records(records)
    # Number of split files per (label set, dataset) combination.
    print(df.groupby(["labels", "data"])["split"].count())
    df_t = df.drop(columns=["task", "labels"]).set_index(["data", "split"])
    display(df_t)
    print(df_t.to_latex(bold_rows=True, multirow=True))
    return df

In [7]:
%%time
# Run get_task_stats() for every task. The %%time cell magic reports the wall
# time of the whole cell; the inner %time reports each task separately.
for task_key in TASK_KEYS:
    print(task_key)
    # NOTE: df is reassigned on every iteration, so after the loop it only
    # holds the statistics of the last task in TASK_KEYS.
    %time df = get_task_stats(task_key)

SENTIMENT
..\data\processed\SENTIMENT\Airline\dev.json
..\data\processed\SENTIMENT\Airline\test.json
..\data\processed\SENTIMENT\Airline\train.json
..\data\processed\SENTIMENT\Clarin\dev.json
..\data\processed\SENTIMENT\Clarin\test.json
..\data\processed\SENTIMENT\Clarin\train.json
..\data\processed\SENTIMENT\GOP\dev.json
..\data\processed\SENTIMENT\GOP\test.json
..\data\processed\SENTIMENT\GOP\train.json
..\data\processed\SENTIMENT\Healthcare\dev.json
..\data\processed\SENTIMENT\Healthcare\test.json
..\data\processed\SENTIMENT\Healthcare\train.json
..\data\processed\SENTIMENT\Obama\dev.json
..\data\processed\SENTIMENT\Obama\test.json
..\data\processed\SENTIMENT\Obama\train.json
..\data\processed\SENTIMENT\SemEval\dev.json
..\data\processed\SENTIMENT\SemEval\test.json
..\data\processed\SENTIMENT\SemEval\train.json
labels                       data      
negative, neutral, positive  Airline       3
                             Clarin        3
                             GOP           3

Unnamed: 0_level_0,Unnamed: 1_level_0,tokens,tweets,vocab
data,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Airline,dev,20079,981,3273
Airline,test,50777,2452,5630
Airline,train,182040,8825,11697
Clarin,dev,80672,4934,15387
Clarin,test,205126,12334,31373
Clarin,train,732743,44399,84279
GOP,dev,16339,803,3610
GOP,test,41226,2006,6541
GOP,train,148358,7221,14342
Healthcare,dev,15797,724,3304


\begin{tabular}{llrrr}
\toprule
        &       &  tokens &  tweets &  vocab \\
\textbf{data} & \textbf{split} &         &         &        \\
\midrule
\multirow{3}{*}{\textbf{Airline}} & \textbf{dev} &   20079 &     981 &   3273 \\
        & \textbf{test} &   50777 &    2452 &   5630 \\
        & \textbf{train} &  182040 &    8825 &  11697 \\
\cline{1-5}
\multirow{3}{*}{\textbf{Clarin}} & \textbf{dev} &   80672 &    4934 &  15387 \\
        & \textbf{test} &  205126 &   12334 &  31373 \\
        & \textbf{train} &  732743 &   44399 &  84279 \\
\cline{1-5}
\multirow{3}{*}{\textbf{GOP}} & \textbf{dev} &   16339 &     803 &   3610 \\
        & \textbf{test} &   41226 &    2006 &   6541 \\
        & \textbf{train} &  148358 &    7221 &  14342 \\
\cline{1-5}
\multirow{3}{*}{\textbf{Healthcare}} & \textbf{dev} &   15797 &     724 &   3304 \\
        & \textbf{test} &   16022 &     717 &   3471 \\
        & \textbf{train} &   14923 &     690 &   3511 \\
\cline{1-5}
\multirow{3}{*}{\textbf{Ob

Unnamed: 0_level_0,Unnamed: 1_level_0,tokens,tweets,vocab
data,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Founta,dev,102534,4663,22529
Founta,test,256569,11657,44540
Founta,train,922028,41961,118349
WaseemSRW,dev,25588,1464,5907
WaseemSRW,test,64893,3659,10646
WaseemSRW,train,234550,13172,23042


\begin{tabular}{llrrr}
\toprule
          &       &  tokens &  tweets &   vocab \\
\textbf{data} & \textbf{split} &         &         &         \\
\midrule
\multirow{3}{*}{\textbf{Founta}} & \textbf{dev} &  102534 &    4663 &   22529 \\
          & \textbf{test} &  256569 &   11657 &   44540 \\
          & \textbf{train} &  922028 &   41961 &  118349 \\
\cline{1-5}
\multirow{3}{*}{\textbf{WaseemSRW}} & \textbf{dev} &   25588 &    1464 &    5907 \\
          & \textbf{test} &   64893 &    3659 &   10646 \\
          & \textbf{train} &  234550 &   13172 &   23042 \\
\bottomrule
\end{tabular}

Wall time: 10.4 s
UNCERTAINITY
..\data\processed\UNCERTAINITY\Riloff\dev.json
..\data\processed\UNCERTAINITY\Riloff\test.json
..\data\processed\UNCERTAINITY\Riloff\train.json
..\data\processed\UNCERTAINITY\Swamy\dev.json
..\data\processed\UNCERTAINITY\Swamy\test.json
..\data\processed\UNCERTAINITY\Swamy\train.json
labels                                                               data  
definitely

Unnamed: 0_level_0,Unnamed: 1_level_0,tokens,tweets,vocab
data,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Riloff,dev,2126,145,1002
Riloff,test,5576,362,1986
Riloff,train,19652,1301,5090
Swamy,dev,1597,73,738
Swamy,test,3909,183,1259
Swamy,train,14026,655,2921


\begin{tabular}{llrrr}
\toprule
      &       &  tokens &  tweets &  vocab \\
\textbf{data} & \textbf{split} &         &         &        \\
\midrule
\multirow{3}{*}{\textbf{Riloff}} & \textbf{dev} &    2126 &     145 &   1002 \\
      & \textbf{test} &    5576 &     362 &   1986 \\
      & \textbf{train} &   19652 &    1301 &   5090 \\
\cline{1-5}
\multirow{3}{*}{\textbf{Swamy}} & \textbf{dev} &    1597 &      73 &    738 \\
      & \textbf{test} &    3909 &     183 &   1259 \\
      & \textbf{train} &   14026 &     655 &   2921 \\
\bottomrule
\end{tabular}

Wall time: 373 ms
Wall time: 26.9 s
