In [1]:
import pandas as pd
import os
import numpy as np
import plotly.express as px

In [2]:
# Prepare target variable
def prepare_target(classval, classtype):
    """Serialise the entities of one role into a tagged target string.

    Args:
        classval: Sequence of entity strings for a single role.
        classtype: Tag suffix placed after ``B``/``E`` in the opening and
            closing markers (e.g. ``"H"`` yields ``[BH] ... [EH]``).

    Returns:
        ``"[B<classtype>] e1 [SEP] e2 ... [E<classtype>]"``, or the empty
        string when ``classval`` has no elements.
    """
    # No entities for this role: contribute nothing to the target.
    if len(classval) == 0:
        return ""

    joined_entities = " [SEP] ".join(classval)
    return f"[B{classtype}] {joined_entities} [E{classtype}]"
    
def get_dataset(filename, data_root="../Dataset"):
    """Load one split of the covid and politics annotations and build targets.

    Reads ``<data_root>/covid/annotations/<filename>.jsonl`` and
    ``<data_root>/politics/annotations/<filename>.jsonl`` (JSON lines),
    stacks them (covid rows first) and adds the serialised target columns.

    Args:
        filename: Split name without extension, e.g. ``"train"`` or ``"val"``.
        data_root: Directory containing the ``covid`` and ``politics``
            folders. Defaults to the location this notebook has always used.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing the annotation
        columns plus ``Hero``, ``Villain``, ``Victim``, ``Other`` (one tagged
        string per role, built with ``prepare_target``) and ``Target`` (the
        four role strings joined by single spaces, then stripped at both
        ends). Roles without entities are empty strings, so ``Target`` can
        contain runs of consecutive spaces between the remaining blocks.
    """
    # Order matters: covid rows precede politics rows in the combined frame.
    domains = ("covid", "politics")
    frames = [
        pd.read_json(f"{data_root}/{domain}/annotations/{filename}.jsonl", lines=True)
        for domain in domains
    ]
    final_ds = pd.concat(frames, axis=0, ignore_index=True)

    # (annotation column, output column, tag suffix) for each role.
    role_specs = (
        ("hero", "Hero", "H"),
        ("villain", "Villain", "VIL"),
        ("victim", "Victim", "VIC"),
        ("other", "Other", "OTH"),
    )
    for source_col, target_col, tag in role_specs:
        final_ds[target_col] = [
            prepare_target(entities, tag) for entities in final_ds[source_col]
        ]

    combined = (
        final_ds["Hero"] + " " + final_ds["Villain"] + " "
        + final_ds["Victim"] + " " + final_ds["Other"]
    )
    # Only leading/trailing whitespace is removed; inner spacing is untouched.
    final_ds["Target"] = combined.map(str.strip)
    return final_ds

In [5]:
# Load the entities table and plot the distribution of its column labelled "1".
# NOTE(review): what column "1" holds is not visible in this notebook —
# confirm against Entities.csv before interpreting the histogram.
_entities = pd.read_csv("../Dataset/Entities.csv")
px.histogram(_entities["1"])

In [4]:
# Build the combined (covid + politics) frames for the "train" and "val"
# annotation files; both get the tagged target columns from get_dataset.
train = get_dataset("train")
valid = get_dataset("val")

#### Preprocess text

In [5]:
# Show the first five OCR strings as an array; the "\n" characters embedded in
# them are what the cleaning step below replaces with spaces.
train.OCR.values[:5]

array(['Bernie or Elizabeth?\nBe informed.Compare them on the issues that matter.\nIssue: Who makes the dankest memes?\n',
       'Extending the\nBrexit deadline until\nOctober 31st in\norder to ensure a deal\neveryone can agree with.\nUsing the extension\nto elect a new prime\nminister and then\ntake a recess until one\nmonth before the deadline\nimgflip.com\n',
       'kwai\ngkwa 0964\n#nnevvy\napplause to Thais from Hong Kong\nWHY THAIS DID NOT GET HURT?\nCULTURAL DIFFERENCE\nJUST STAY IN THE WALL\n',
       'So, I order this\nfoce mask to\nprotect ogainst\nfhe Corond virus\nHooold up, it says\nsomething here.\n"Made in China"\n',
       'best candidate for\nJA\n2020\njoe biden\nKamala harris\nBernie sanders\nTikTok\n@eliguthrie\n'],
      dtype=object)

In [6]:
def preprocess_text(text):
    """Flatten OCR text to one line and mask user handles and links.

    Newlines become single spaces, the text is split on single spaces, and
    each token is rewritten as follows:

    * a token starting with ``@`` that is longer than one character
      becomes ``@user``;
    * otherwise, a token starting with ``http`` becomes ``http``;
    * any other token (including empty ones produced by consecutive
      spaces) is kept unchanged.

    Args:
        text: Raw OCR string.

    Returns:
        The rewritten tokens joined back together with single spaces.
    """
    def _mask(token):
        # A lone "@" is not treated as a handle.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    single_line = text.replace("\n", " ")
    return " ".join(_mask(token) for token in single_line.split(" "))

# Clean the OCR column of both splits in place with preprocess_text
# (newlines -> spaces, @handles -> "@user", links -> "http").
train.OCR = train.OCR.apply(lambda x: preprocess_text(x))
valid.OCR = valid.OCR.apply(lambda x: preprocess_text(x))

In [8]:
# Preview the first rows to check the cleaned OCR text next to the generated
# Hero / Villain / Victim / Other / Target columns.
train.head()

Unnamed: 0,OCR,image,hero,villain,victim,other,Hero,Villain,Victim,Other,Target
0,Bernie or Elizabeth? Be informed.Compare them ...,covid_memes_18.png,[],[],[],"[bernie sanders, elizabeth warren]",,,,[BOTH] bernie sanders [SEP] elizabeth warren [...,[BOTH] bernie sanders [SEP] elizabeth warre...
1,Extending the Brexit deadline until October 31...,covid_memes_19.png,[],[uk government],[],[],,[BVIL] uk government [EVIL],,,[BVIL] uk government [EVIL]
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,covid_memes_252.png,[thais],[],[],[hong kong],[BH] thais [EH],,,[BOTH] hong kong [EOTH],[BH] thais [EH] [BOTH] hong kong [EOTH]
3,"So, I order this foce mask to protect ogainst ...",covid_memes_255.png,[],[china],[],"[face mask, made in china, coronavirus]",,[BVIL] china [EVIL],,[BOTH] face mask [SEP] made in china [SEP] cor...,[BVIL] china [EVIL] [BOTH] face mask [SEP] m...
4,best candidate for JA 2020 joe biden Kamala ha...,covid_memes_20.png,[joe biden],[],[],"[bernie sanders, kamala harris, tiktok]",[BH] joe biden [EH],,,[BOTH] bernie sanders [SEP] kamala harris [SEP...,[BH] joe biden [EH] [BOTH] bernie sanders [S...


In [9]:
def resample_data(data, min_count=5, n_copies=20, random_state=None):
    """Oversample rows that mention rarely occurring entities, then shuffle.

    Entity frequencies are counted case-insensitively over the ``hero``,
    ``villain``, ``victim`` and ``other`` list columns. Every row that
    mentions at least one entity seen fewer than ``min_count`` times is
    duplicated ``n_copies`` additional times. The duplicates and the original
    rows are concatenated and shuffled.

    Args:
        data: DataFrame with list-valued ``hero``, ``villain``, ``victim``
            and ``other`` columns. It is not modified.
        min_count: Entities with a total count strictly below this value are
            treated as rare. Defaults to 5.
        n_copies: Number of extra copies added for each row containing a
            rare entity. Defaults to 20.
        random_state: Seed forwarded to ``DataFrame.sample`` for the final
            shuffle; ``None`` (default) gives a different order on each call.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index. It has all columns of
        ``data`` plus an ``entities`` column holding the concatenated role
        lists of each row.
    """
    from collections import Counter

    ds = data.copy().reset_index(drop=True)

    # All entities of a row, regardless of role.
    ds["entities"] = pd.Series(
        [
            hero + villain + victim + other
            for hero, villain, victim, other in zip(
                ds["hero"], ds["villain"], ds["victim"], ds["other"]
            )
        ],
        index=ds.index,
        dtype=object,
    )

    # Count each entity once per mention, ignoring case.
    entity_counts = Counter(
        entity.lower() for entities in ds["entities"] for entity in entities
    )
    rare_entities = {
        entity for entity, count in entity_counts.items() if count < min_count
    }

    # Compare lower-cased names so the lookup agrees with the counting above.
    is_rare_row = np.array(
        [
            any(entity.lower() in rare_entities for entity in entities)
            for entities in ds["entities"]
        ],
        dtype=bool,
    )

    # Positional repeat keeps the copies of each row adjacent and preserves
    # the column dtypes (no round trip through an object ndarray).
    repeated_positions = np.repeat(np.flatnonzero(is_rare_row), n_copies)
    oversampled = ds.iloc[repeated_positions]

    ds = pd.concat([oversampled, ds], axis=0, ignore_index=True)

    # Shuffle so the duplicated rows are not clustered at the top.
    return ds.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [10]:
# Oversample rare-entity rows in the training split only; validation is left
# untouched. NOTE: `train` is rebound here, so re-running this cell resamples
# the already-resampled frame — re-run the loading cells first.
train = resample_data(train)
print(train.shape, valid.shape)

(66572, 12) (650, 11)


In [12]:
# Save datasets: write the resampled training frame and the validation frame
# as Parquet files under ../Dataset.
train.to_parquet("../Dataset/train.parquet")
valid.to_parquet("../Dataset/valid.parquet")