In [1]:
import gzip
import json
import os
import re
import shutil
from collections import Counter
from glob import glob

import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split

from SocialMediaIE.data.tokenization import tokenize

In [2]:
# Output directories handed to write_data(); the paths are relative, so they
# resolve against the notebook's working directory.
# NOTE: the "UNCERTAINITY" spelling is part of the on-disk path and of the
# constant name used by later cells, so it is kept as is.
SENTIMENT_WRITE_DIR = "../data/processed/SENTIMENT"
ABUSIVE_WRITE_DIR = "../data/processed/ABUSIVE"
UNCERTAINITY_WRITE_DIR = "../data/processed/UNCERTAINITY"

In [3]:
# Remove the output of earlier runs so stale files cannot linger.
# shutil.rmtree replaces the `! rm -rf` shell calls: it works on every
# platform and reuses the *_WRITE_DIR constants instead of repeating the paths.
# ignore_errors=True mirrors `rm -f`: a missing directory is not an error.
for write_dir in (SENTIMENT_WRITE_DIR, ABUSIVE_WRITE_DIR, UNCERTAINITY_WRITE_DIR):
    shutil.rmtree(write_dir, ignore_errors=True)

In [4]:
def read_tweet_json_file(path):
    """Yield one parsed JSON object per non-blank line of a JSON-lines file.

    Args:
        path: Location of the file, as a string or ``os.PathLike``. A name
            ending in ``.gz`` is opened with ``gzip``; anything else is
            opened as a plain file. Both are read as UTF-8 text.

    Yields:
        The value decoded by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # os.fspath lets callers pass pathlib.Path objects as well as strings.
    path = os.fspath(path)
    open_fn = gzip.open if path.endswith(".gz") else open
    with open_fn(path, "rt", encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not JSON documents; skip them instead of crashing.
            if not line:
                continue
            yield json.loads(line)
            
def tweet_json_to_df(json_file_path):
    """Build a ``(tweet_id, text)`` frame from a tweet JSON-lines file.

    Each tweet yielded by ``read_tweet_json_file`` contributes its ``id`` and
    ``full_text`` fields. When a tweet id occurs more than once, only the
    first occurrence is kept.
    """
    records = []
    for tweet in read_tweet_json_file(json_file_path):
        records.append((tweet["id"], tweet["full_text"]))
    tweets = pd.DataFrame(records, columns=["tweet_id", "text"])
    return tweets.drop_duplicates(subset="tweet_id")
            
def write_data(df, base_dir):
    """Write every (dataset, datasplit) group of ``df`` as a JSON-lines file.

    Each group is saved to ``<base_dir>/<dataset>/<datasplit>.json`` and keeps
    only the ``tweet_id``, ``text`` and ``label`` columns. Missing directories
    are created, and one progress line is printed per file written.
    """
    columns_to_keep = ["tweet_id", "text", "label"]
    for group_key, df_group in df.groupby(["dataset", "datasplit"]):
        dataset, datasplit = group_key
        output_dir = os.path.join(base_dir, dataset)
        output_file = os.path.join(output_dir, f"{datasplit}.json")
        os.makedirs(output_dir, exist_ok=True)
        print(f"Data: {dataset}, split: {datasplit}, shape: {df_group.shape}, written to {output_file}")
        records = df_group[columns_to_keep]
        records.to_json(output_file, orient="records", lines=True)
        
        
def split_data(df, test_size=0.2, dev_size=0.1, random_state=1337):
    """Assign a stratified train/dev/test split to every row of ``df``.

    The rows are first divided into train+dev and test, then the dev part is
    carved out of train+dev. Both steps are stratified on the ``label``
    column and use the same ``random_state``.

    The split is tracked by row position rather than by index label, so the
    assignment stays correct even when ``df`` has a non-unique index (for
    example after ``pd.concat`` without ``ignore_index``).

    Args:
        df: Frame with a ``label`` column. Its ``datasplit`` column is
            created or overwritten in place.
        test_size: Share of all rows assigned to ``"test"``.
        dev_size: Share of the remaining (non-test) rows assigned to ``"dev"``.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        The same ``df`` object, with ``datasplit`` set to ``"train"``,
        ``"dev"`` or ``"test"`` on every row.
    """
    # A fresh 0..n-1 index gives every row a unique handle to split on.
    positional = df.reset_index(drop=True)
    df_train, df_test = train_test_split(
        positional, test_size=test_size, random_state=random_state,
        stratify=positional["label"])
    df_train, df_dev = train_test_split(
        df_train, test_size=dev_size, random_state=random_state,
        stratify=df_train["label"])

    # Everything starts as train; the dev and test rows are then relabelled.
    datasplit = pd.Series("train", index=positional.index)
    datasplit.loc[df_dev.index] = "dev"
    datasplit.loc[df_test.index] = "test"

    # Assign by position so the original index of ``df`` plays no role.
    df["datasplit"] = datasplit.to_numpy()
    return df

## Parse Sentiment Benchmark Datasets

In [5]:
def load_sentiment_data(base_dir):
    """Load the English sentiment annotations joined with their tweet text.

    Reads the tab-separated ``data_with_train_dev_test_split.txt`` file,
    keeps the rows whose ``language`` is ``"english"``, and inner-joins them
    on ``tid`` with ``joined_data_all.text.json`` (split-oriented JSON,
    de-duplicated by ``tid``). The ``tid`` column is returned as ``tweet_id``.
    """
    annotations_path = os.path.join(base_dir, "data_with_train_dev_test_split.txt")
    tweets_path = os.path.join(base_dir, "joined_data_all.text.json")

    annotations = pd.read_csv(annotations_path, sep="\t")
    english_only = annotations[annotations["language"] == "english"]

    tweets = pd.read_json(tweets_path, orient="split")
    tweets = tweets.drop_duplicates(subset="tid")

    joined = english_only.merge(tweets, on="tid", how="inner")
    return joined.rename(columns={"tid": "tweet_id"})

# Directory passed to load_sentiment_data(). This is a machine-specific
# absolute Windows path; change it to wherever the raw files live locally.
SENTIMENT_BASE_DIR = "G:\\AzureBackup\\Datasets\\Twitter\\TwitterSentimentBenchmarks"

In [6]:
# Load the sentiment data; `df` is reused by the write_data() cell that follows.
df = load_sentiment_data(SENTIMENT_BASE_DIR)
print(df.columns)
# Row counts (aggfunc=len) per dataset, broken down by (datasplit, label);
# margins=True adds the "All" totals.
df.pivot_table(index="dataset", columns=["datasplit", "label"], values="tweet_id", aggfunc=len, margins=True)

Index(['tweet_id', 'favorites', 'is_quote', 'is_reply', 'retweets', 'u_id',
       'u_created_at', 'u_followers', 'u_friends', 'u_lists', 'u_statuses',
       'u_verified', 'u_location', 'u_has_url', 'n_hashtags', 'n_symbols',
       'n_urls', 'n_mentions', 'created_at', 'dataset', 'datasplit',
       'language', 'label', 'text'],
      dtype='object')


datasplit,dev,dev,dev,test,test,test,train,train,train,All
label,negative,neutral,positive,negative,neutral,positive,negative,neutral,positive,Unnamed: 10_level_1
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Airline,613,205,163,1532,512,408,5515,1843,1467,12258
Clarin,1276,2158,1500,3191,5394,3749,11485,19418,13496,61667
GOP,471,202,130,1175,505,326,4230,1818,1173,10030
Healthcare,405,178,141,428,156,133,326,192,172,2131
Obama,80,79,50,199,197,126,715,707,455,2608
SemEval,836,1779,1968,3418,11097,8588,1736,5223,5286,39931
All,3681,4601,3952,9943,17861,13330,24007,29201,22049,128625


In [7]:
# Write one JSON-lines file per (dataset, datasplit) group under SENTIMENT_WRITE_DIR.
write_data(df, SENTIMENT_WRITE_DIR)

Data: Airline, split: dev, shape: (981, 24), written to ../data/processed/SENTIMENT\Airline\dev.json
Data: Airline, split: test, shape: (2452, 24), written to ../data/processed/SENTIMENT\Airline\test.json
Data: Airline, split: train, shape: (8825, 24), written to ../data/processed/SENTIMENT\Airline\train.json
Data: Clarin, split: dev, shape: (4934, 24), written to ../data/processed/SENTIMENT\Clarin\dev.json
Data: Clarin, split: test, shape: (12334, 24), written to ../data/processed/SENTIMENT\Clarin\test.json
Data: Clarin, split: train, shape: (44399, 24), written to ../data/processed/SENTIMENT\Clarin\train.json
Data: GOP, split: dev, shape: (803, 24), written to ../data/processed/SENTIMENT\GOP\dev.json
Data: GOP, split: test, shape: (2006, 24), written to ../data/processed/SENTIMENT\GOP\test.json
Data: GOP, split: train, shape: (7221, 24), written to ../data/processed/SENTIMENT\GOP\train.json
Data: Healthcare, split: dev, shape: (724, 24), written to ../data/processed/SENTIMENT\Healthc

## Parse Abusive Benchmark Datasets

In [8]:
def load_abusive_data(base_dir):
    """Load the abusive-tweet labels, attach tweet text and assign splits.

    Labels come from the tab-separated ``hatespeechtwitter.tab`` file (its
    ``maj_label`` column becomes ``label``) and are inner-joined on
    ``tweet_id`` with the text read from ``tweets.jsonl``, so labelled tweets
    without text are dropped. The rows are tagged as dataset ``"Founta"`` and
    passed through ``split_data``. The shapes of the label frame, the text
    frame and the merged frame are printed along the way.

    NOTE(review): only the text frame is de-duplicated by ``tweet_id``; if
    the label file lists a tweet more than once, the merge keeps every such
    row. Confirm this is intended.
    """
    labels_path = os.path.join(base_dir, "hatespeechtwitter.tab")
    tweets_path = os.path.join(base_dir, "tweets.jsonl")

    labels = pd.read_csv(labels_path, sep="\t").rename(columns={"maj_label": "label"})
    print(labels.shape)

    tweets = tweet_json_to_df(tweets_path)
    print(tweets.shape)

    labelled_tweets = labels.merge(tweets, on="tweet_id", how="inner")
    print(labelled_tweets.shape)

    labelled_tweets["dataset"] = "Founta"
    # Initial value only; split_data() sets the final value on every row.
    labelled_tweets["datasplit"] = "train"
    return split_data(labelled_tweets)

# Directory passed to load_abusive_data(). This is a machine-specific
# absolute Windows path; change it to wherever the raw files live locally.
ABUSIVE_BASE_DIR = "G:\\AzureBackup\\Datasets\\Twitter\\AbusiveTweets"

In [9]:
# Load the abusive-tweet data; `df` is reused by the write_data() cell that follows.
df = load_abusive_data(ABUSIVE_BASE_DIR)
print(df.columns)
# Row counts (aggfunc=len) per dataset, broken down by (datasplit, label);
# margins=True adds the "All" totals.
df.pivot_table(index="dataset", columns=["datasplit", "label"], values="tweet_id", aggfunc=len, margins=True)

(80000, 2)
(58222, 2)
(58281, 3)
Index(['tweet_id', 'label', 'text', 'dataset', 'datasplit'], dtype='object')


datasplit,dev,dev,dev,dev,test,test,test,test,train,train,train,train,All
label,abusive,hateful,normal,spam,abusive,hateful,normal,spam,abusive,hateful,normal,spam,Unnamed: 13_level_1
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Founta,470,186,3274,733,1175,466,8185,1831,4230,1677,29464,6590,58281
All,470,186,3274,733,1175,466,8185,1831,4230,1677,29464,6590,58281


In [10]:
# Write one JSON-lines file per (dataset, datasplit) group under ABUSIVE_WRITE_DIR.
write_data(df, ABUSIVE_WRITE_DIR)

Data: Founta, split: dev, shape: (4663, 5), written to ../data/processed/ABUSIVE\Founta\dev.json
Data: Founta, split: test, shape: (11657, 5), written to ../data/processed/ABUSIVE\Founta\test.json
Data: Founta, split: train, shape: (41961, 5), written to ../data/processed/ABUSIVE\Founta\train.json


## Parse Abusive HateSpeech data

In [11]:
def load_hatespeech_srw_data(base_dir):
    """Load and combine the two hate-speech label files, then attach tweet text.

    Labels are read from ``NAACL_SRW_2016.csv`` and ``NLP+CSS_2016.csv``,
    concatenated, and inner-joined on ``tweet_id`` with the tweet text from
    ``tweet_ids.json``. The rows are tagged as dataset ``"WaseemSRW"`` and
    passed through ``split_data``. The frame shape is printed after each step.
    """
    srw_split_file_path = os.path.join(base_dir, "NAACL_SRW_2016.csv")
    css_split_file_path = os.path.join(base_dir, "NLP+CSS_2016.csv")
    json_file_path = os.path.join(base_dir, "tweet_ids.json")
    # The SRW file is read without a header row: column 0 is the tweet id, column 1 the label.
    df = pd.read_csv(srw_split_file_path, sep=",", header=None, names=["tweet_id", "label"])
    print(df.shape)
    
    # Load the CSS data.
    # NOTE(review): the column read as "TweetID" is used as the label and the
    # reset index as the tweet id -- presumably the file's header is shifted
    # relative to its data columns; confirm against the raw file.
    df_css = pd.read_csv(css_split_file_path, sep="\t", usecols=["TweetID"]).reset_index()
    df_css = df_css.rename(columns={
        "index": "tweet_id",
        "TweetID": "label"
    })
    print(df_css.shape)
    # Rename the CSS label "neither" to "none".
    df_css.loc[df_css.label == "neither", "label"] = "none"
    # Drop rows labelled "both" because they are too ambiguous.
    df_css = df_css.drop(df_css[df_css.label == "both"].index, axis=0)
    print(df_css.shape)
    
    # Concatenate the SRW and CSS labels.
    df = pd.concat([df, df_css], axis=0)
    print(df.shape)
    # Remove repeated (tweet_id, label) pairs; a tweet that carries two
    # different labels keeps both rows.
    df = df.drop_duplicates(subset=["tweet_id", "label"])
    print(df.shape)
    
    # Join with the tweet text (inner join: labels without text are dropped).
    df_json = tweet_json_to_df(json_file_path)
    print(df_json.shape)
    df = df.merge(df_json, on="tweet_id", how="inner")
    print(df.shape)
    df["dataset"] = "WaseemSRW"
    # Initial value only; split_data() sets the final value on every row.
    df["datasplit"] = "train"
    df = split_data(df)
    return df

# Directory passed to load_hatespeech_srw_data(). This is a machine-specific
# absolute Windows path; change it to wherever the raw files live locally.
HATESPEECH_SRW_BASE_DIR = "G:\\AzureBackup\\Datasets\\Twitter\\hatespeech"

In [12]:
# Load the hate-speech data; `df` is reused by the write_data() cell that follows.
df = load_hatespeech_srw_data(HATESPEECH_SRW_BASE_DIR)
print(df.columns)
# Row counts (aggfunc=len) per dataset, broken down by (datasplit, label);
# margins=True adds the "All" totals.
df.pivot_table(index="dataset", columns=["datasplit", "label"], values="tweet_id", aggfunc=len, margins=True)

(16907, 2)
(6909, 2)
(6859, 2)
(23766, 2)
(19752, 2)
(18172, 2)
(18295, 3)
Index(['tweet_id', 'label', 'text', 'dataset', 'datasplit'], dtype='object')


datasplit,dev,dev,dev,test,test,test,train,train,train,All
label,none,racism,sexism,none,racism,sexism,none,racism,sexism,Unnamed: 10_level_1
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
WaseemSRW,1007,159,298,2516,399,744,9057,1437,2678,18295
All,1007,159,298,2516,399,744,9057,1437,2678,18295


In [13]:
# Write the hate-speech splits under ABUSIVE_WRITE_DIR, in a sub-directory
# named after the frame's `dataset` value.
write_data(df, ABUSIVE_WRITE_DIR)

Data: WaseemSRW, split: dev, shape: (1464, 5), written to ../data/processed/ABUSIVE\WaseemSRW\dev.json
Data: WaseemSRW, split: test, shape: (3659, 5), written to ../data/processed/ABUSIVE\WaseemSRW\test.json
Data: WaseemSRW, split: train, shape: (13172, 5), written to ../data/processed/ABUSIVE\WaseemSRW\train.json


## Parse Veridicality data

In [14]:
# NOTE: this dataset is too small -- do not use it.

# Maps the two-letter annotation codes to readable label names.
VERIDICALITY_LABEL_MAP = dict(
    DY="definitely_yes",
    DN="definitely_no",
    PY="probably_yes",
    PN="probably_no",
    UC="uncertain",
)

def load_veridicality_data(base_dir):
    """Load the veridicality annotations, attach tweet text and assign splits.

    Annotations are read from ``veridicality_data.csv``; its ``TweetID`` and
    ``" Annotation"`` columns (the latter with a leading space) become
    ``tweet_id`` and ``label``. The label codes are translated through
    ``VERIDICALITY_LABEL_MAP`` and the rows are inner-joined on ``tweet_id``
    with the text from ``tweet_ids.txt.json.gz``. The result is tagged as
    dataset ``"Swamy"`` and passed through ``split_data``. The raw label
    counts and the frame shapes are printed along the way.
    """
    annotations_path = os.path.join(base_dir, "veridicality_data.csv")
    tweets_path = os.path.join(base_dir, "tweet_ids.txt.json.gz")

    column_names = {"TweetID": "tweet_id", " Annotation": "label"}
    annotations = pd.read_csv(annotations_path, sep=",").rename(columns=column_names)
    print(annotations["label"].value_counts())
    # Codes missing from the map are turned into None by dict.get.
    annotations["label"] = annotations["label"].map(VERIDICALITY_LABEL_MAP.get)
    print(annotations.shape)

    tweets = tweet_json_to_df(tweets_path)
    print(tweets.shape)

    labelled_tweets = annotations.merge(tweets, on="tweet_id", how="inner")
    print(labelled_tweets.shape)

    labelled_tweets["dataset"] = "Swamy"
    # Initial value only; split_data() sets the final value on every row.
    labelled_tweets["datasplit"] = "train"
    return split_data(labelled_tweets)

# Directory passed to load_veridicality_data(). This is a machine-specific
# absolute Windows path; change it to wherever the raw files live locally.
VERIDICALITY_BASE_DIR = "G:\\AzureBackup\\Datasets\\Twitter\\Twitter-Veridicality\\data"

In [15]:
# Load the veridicality data; `df` is reused by the write_data() cell that follows.
df = load_veridicality_data(VERIDICALITY_BASE_DIR)
print(df.columns)
# Row counts (aggfunc=len) per dataset, broken down by (datasplit, label);
# margins=True adds the "All" totals.
df.pivot_table(index="dataset", columns=["datasplit", "label"], values="tweet_id", aggfunc=len, margins=True)

UC    390
DY    284
PY    224
PN     65
DN     34
Name: label, dtype: int64
(997, 2)
(237442, 2)
(911, 3)
Index(['tweet_id', 'label', 'text', 'dataset', 'datasplit'], dtype='object')


datasplit,dev,dev,dev,dev,dev,test,test,test,test,test,train,train,train,train,train,All
label,definitely_no,definitely_yes,probably_no,probably_yes,uncertain,definitely_no,definitely_yes,probably_no,probably_yes,uncertain,definitely_no,definitely_yes,probably_no,probably_yes,uncertain,Unnamed: 16_level_1
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Swamy,2,20,5,17,29,6,51,12,42,72,23,182,45,148,257,911
All,2,20,5,17,29,6,51,12,42,72,23,182,45,148,257,911


In [16]:
# Write one JSON-lines file per (dataset, datasplit) group under UNCERTAINITY_WRITE_DIR.
# NOTE(review): the loader cell marks this dataset as too small to use, yet it
# is still written here -- confirm whether this cell should run.
write_data(df, UNCERTAINITY_WRITE_DIR)

Data: Swamy, split: dev, shape: (73, 5), written to ../data/processed/UNCERTAINITY\Swamy\dev.json
Data: Swamy, split: test, shape: (183, 5), written to ../data/processed/UNCERTAINITY\Swamy\test.json
Data: Swamy, split: train, shape: (655, 5), written to ../data/processed/UNCERTAINITY\Swamy\train.json


## Parse Sarcasm data

In [17]:
def load_sarcasm_data(base_dir):
    """Load the sarcasm annotations, attach tweet text and assign splits.

    Annotations are read from the header-less, tab-separated
    ``sarcasm-data/sarcasm-annos-emnlp13.tsv`` file as ``tweet_id`` and
    ``label`` (lower-cased), then inner-joined on ``tweet_id`` with the text
    from ``tweet_ids.txt.json.gz``. The result is tagged as dataset
    ``"Riloff"`` and passed through ``split_data``. The frame shapes are
    printed along the way.
    """
    annotations_path = os.path.join(base_dir, "sarcasm-data", "sarcasm-annos-emnlp13.tsv")
    tweets_path = os.path.join(base_dir, "tweet_ids.txt.json.gz")

    annotations = pd.read_csv(
        annotations_path, sep="\t", header=None, names=["tweet_id", "label"]
    )
    print(annotations.shape)
    annotations["label"] = annotations["label"].str.lower()

    tweets = tweet_json_to_df(tweets_path)
    print(tweets.shape)

    labelled_tweets = annotations.merge(tweets, on="tweet_id", how="inner")
    print(labelled_tweets.shape)

    labelled_tweets["dataset"] = "Riloff"
    # Initial value only; split_data() sets the final value on every row.
    labelled_tweets["datasplit"] = "train"
    return split_data(labelled_tweets)

# Directory passed to load_sarcasm_data(). This is a machine-specific
# absolute Windows path; change it to wherever the raw files live locally.
# The separator before "SarcasmRiloff2013" used to be written as "\\\S",
# which contains the invalid escape sequence "\S" and put a doubled
# backslash in the path.
SARCASM_BASE_DIR = "G:\\AzureBackup\\Datasets\\Twitter\\SarcasmRiloff2013"

In [18]:
# Load the sarcasm data; `df` is reused by the write_data() cell that follows.
df = load_sarcasm_data(SARCASM_BASE_DIR)
print(df.columns)
# Row counts (aggfunc=len) per dataset, broken down by (datasplit, label);
# margins=True adds the "All" totals.
df.pivot_table(index="dataset", columns=["datasplit", "label"], values="tweet_id", aggfunc=len, margins=True)

(3000, 2)
(1808, 2)
(1808, 3)
Index(['tweet_id', 'label', 'text', 'dataset', 'datasplit'], dtype='object')


datasplit,dev,dev,test,test,train,train,All
label,not_sarcasm,sarcasm,not_sarcasm,sarcasm,not_sarcasm,sarcasm,Unnamed: 7_level_1
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Riloff,115,30,288,74,1033,268,1808
All,115,30,288,74,1033,268,1808


In [19]:
# Write the sarcasm splits under UNCERTAINITY_WRITE_DIR, in a sub-directory
# named after the frame's `dataset` value.
write_data(df, UNCERTAINITY_WRITE_DIR)

Data: Riloff, split: dev, shape: (145, 5), written to ../data/processed/UNCERTAINITY\Riloff\dev.json
Data: Riloff, split: test, shape: (362, 5), written to ../data/processed/UNCERTAINITY\Riloff\test.json
Data: Riloff, split: train, shape: (1301, 5), written to ../data/processed/UNCERTAINITY\Riloff\train.json
