## data preprocessing and post classification analysis

In [23]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter as C
import re
from PIL import Image

In [11]:
all_data = pd.read_csv("../data/data/all_data.csv")

In [12]:
## all files in models ie logs
#[os.path.join(top, file) for top, dirs, files in os.walk("./hate-speech-detection/models") for file in files]

In [13]:
## all images
#os.listdir("../data/data/img")

In [14]:
## displaying all rows of df
#pd.set_option('display.max_rows', None)

In [15]:
# getting test data
#test_data = pd.read_json("../data/data/test.jsonl", lines=True)
#test = test_data.sort_values(by=["img"]).reset_index(drop=True)

In [16]:
def open_accuracys(path):
    # opening .accuracys files and getting the file names of the images with the lowest and highest squared error
    with open(path) as f:
        file = f.read()
        file = file.split("\n")
    file = [x.split("\t")[0].split("/")[5].split(".")[0] for x in file if x.startswith("..")]
    return file[:20], file[20:] # lowest squared error, highest squared error

In [17]:
top100_0_11, bott100_0_11 = open_accuracys("./hate-speech-detection/results/models/clf100.0.11.pt.accuracys")

In [18]:
top100_0_b11, bott100_0_b11 = open_accuracys("./hate-speech-detection/results/models/clf100.0.b11.pt.accuracys")

In [19]:
top3b01, bott3b01 = open_accuracys("./hate-speech-detection/results/accuracys")

In [20]:
top_bott = [top100_0_11, bott100_0_11, top100_0_b11, bott100_0_b11, top3b01, bott3b01]

In [22]:
def open_img(lst):
    for x in lst:
        im = Image.open("../data/data/img/" + str(x) + ".png")
        im.show()
    pass

In [24]:
def flatten_list(lst):
    return [subsublist for sublist in lst for subsublist in sublist]

In [25]:
def get_text_counts(lst):
    # counting unique words and unique bigrams for all memes ie COUNT([{w1, w2}, {w2,  w3}, {w1, w3}])
    # outputs two dicts where either bigram or word is the key and count is the value. both dicts are sorted by value, descending
    x = all_data[all_data["id"].isin(lst)]
    x = x.text.to_list()
    
    # getting a list of bigrams for each sentence (including punctuation unfortunately) by splitting on everything that's not a word
    # maybe rm punctuation instead?
    bigrams = [{(w.lower(), re.split(r"(\W+)", sent)[i+1].lower()) for i, w in enumerate(sent.split(" ")) if i != (len(sent.split(" "))-1)} for sent in x ]
    bigrams = flatten_list(bigrams)
    
    words = [{w.lower() for w in set(re.split(r"(\W+)", sent))} for sent in x]
    words = flatten_list(words)
    
    count_words = C(words)
    count_bigrams = C(bigrams)
    
    return dict(sorted(count_words.items(), key=lambda item: item[1], reverse=True)), dict(sorted(count_bigrams.items(), key=lambda item: item[1], reverse=True))

In [26]:
top100_0_11c, bott100_0_11c, top100_0_b11c, bott100_0_b11c, top3b01c, bott3b01c = [get_text_counts(l) for l in top_bott]

In [31]:
# word counts for the 20 memes with the lowest squared error
top100_0_11c[0]

{' ': 20,
 "'": 9,
 'the': 9,
 ', ': 8,
 'and': 7,
 'in': 7,
 'of': 7,
 '': 6,
 'to': 6,
 'you': 6,
 'we': 6,
 'a': 6,
 'islam': 5,
 'muslims': 5,
 's': 5,
 'muslim': 5,
 'they': 4,
 '. ': 4,
 'is': 4,
 'religion': 4,
 'i': 4,
 'all': 3,
 'people': 3,
 '? ': 3,
 'are': 3,
 'no': 3,
 'it': 3,
 'll': 3,
 'but': 3,
 'if': 3,
 'kill': 3,
 'peace': 3,
 'were': 3,
 'women': 2,
 'when': 2,
 'killing': 2,
 'liberals': 2,
 'christians': 2,
 'think': 2,
 'shit': 2,
 'race': 2,
 'do': 2,
 'hate': 2,
 'their': 2,
 'too': 2,
 'child': 2,
 'love': 2,
 'only': 2,
 ', "': 2,
 ':': 2,
 'our': 2,
 'dog': 2,
 'can': 2,
 'believe': 2,
 'them': 2,
 '" ': 2,
 'what': 2,
 ' "': 2,
 'have': 2,
 'going': 2,
 'm': 2,
 'hell': 2,
 'let': 2,
 'with': 2,
 'christian': 2,
 '!': 1,
 'beating': 1,
 'defend': 1,
 'babies': 1,
 'pedophilia': 1,
 'murdering': 1,
 'beheading': 1,
 'rape': 1,
 ',': 1,
 'homosexuals': 1,
 'rapist': 1,
 'molesters': 1,
 ' , ': 1,
 'oh': 1,
 'good': 1,
 'so': 1,
 'huh': 1,
 'superior': 1,
 '

In [32]:
def find_words(w_list, img_list):
    # goes through list of memes and check whether they contain words from a certain list
    # outputs a dict with word as key and a set of file names as value
    
    x = all_data[all_data["id"].isin(img_list)].reset_index(drop=True)
    w_id = {}
    for row in x.iterrows():
        split = re.split(r"(\W+)", row[1].text)
        img = row[1].id
        for w in split:
            if w in w_list:
                if w not in w_id.keys():
                    w_id[w] = set()
                    w_id[w].add(img)
                else:
                    w_id[w].add(img)
    return w_id
    

In [33]:
# which memes from the 20 with the lowest squared errors contain the words "muslim", "muslims", "islam", "religion"
find_words(["muslim", "muslims", "islam", "religion"], top100_0_11)

{'islam': {31409, 35602, 38461, 48091, 61378},
 'muslims': {12867, 15690, 17306, 43519, 98125},
 'religion': {18640, 38461, 48091, 91486},
 'muslim': {17306, 19875, 31409, 43519, 91486}}

In [34]:
# moving test images to separate folder
#origin = "../data/data/"
#destination = "../data/data/test/"
#os.mkdir("../data/data/test/")
#for img in test_data.img.tolist():
#    os.rename((origin+img), (destination+img.lstrip("img/")))
#    

In [42]:
## creating new data split
# reading only dev and train as test doesn't have labels
dev_data = pd.read_json("../data/data/dev.jsonl", lines=True)
train_data = pd.read_json("../data/data/train.jsonl", lines=True)
all_data = pd.concat([dev_data, train_data], ignore_index=True)

all_data['id'] = list(map(lambda x: str(x).zfill(5), all_data['id'])) # making sure all img id's have their full file name even if it starts w 0
all_data['set'] = all_data.shape[0] * ['train'] # creating new column, setting all to train to start out
testsize = int(((15/100))*all_data.shape[0]) # setting test and validation size
valsize = testsize
test = testsize * ['test']
val = valsize * ['val']
all_data = all_data.sample(frac=1).reset_index(drop=True) # shuffle the dataframe 
all_data.loc[:(testsize-1), 'set'] = test # setting the top k (= testsize) items in the train/test/val column to test
all_data.loc[testsize:(testsize+valsize-1), 'set'] = val # same as above but with val



In [51]:
i_want_to_over_write_this_file = False

In [68]:
# write to file if you really want to but preferably not
if "all_data.csv" not in os.listdir("../data/data/"):
    if i_want_to_over_write_this_file:
        all_data.to_csv("../data/data/all_data.csv", index=False)

In [69]:
# better to read the existing file 
df = pd.read_csv("../data/data/all_data.csv")

In [70]:
# getting the rows for validation set 
not_val = df[(df['set'] == 'val') & (df['label'] == 0)].reset_index(drop=True) # not offensive
off_val = df[(df['set'] == 'val') & (df['label'] == 1)].reset_index(drop=True) # offensive
# for train
not_train = df[(df['set'] == 'train') & (df['label'] == 0)].reset_index(drop=True)
off_train = df[(df['set'] == 'train') & (df['label'] == 1)].reset_index(drop=True)
# for test
not_test = df[(df['set'] == 'test') & (df['label'] == 0)].reset_index(drop=True)
off_test = df[(df['set'] == 'test') & (df['label'] == 1)].reset_index(drop=True)

In [72]:
def write_txt(df, filename):
    file = "../data/data/" + filename
    with open(file, 'w') as f:
        for img in df.img.to_list():
            f.write(img+"\n")
    print("Done!")
        

In [73]:
are_you_sure = False

In [74]:
if "goodMemesList.txt.test" not in os.listdir("../data/data/"):
    if are_you_sure:
        write_txt(not_test, "goodMemesList.txt.test")
        write_txt(off_test, "hateMemesList.txt.test")
        write_txt(not_train, "goodMemesList.txt.train")
        write_txt(off_train, "hateMemesList.txt.train")
        write_txt(not_val, "goodMemesList.txt.val")
        write_txt(off_val, "hateMemesList.txt.val")

In [75]:
im_sure = False

In [78]:
if "balanced_data.csv" not in os.listdir("../data/data/"):
    if im_sure:
        label_1 = df.copy()[df["label"] == 1]
        label_0 = df.copy()[df["label"] == 0].sample(n=3300)
        balanced = pd.concat([label_1, label_0], ignore_index=True).to_csv("../data/data/balanced_data.csv", index=False)

In [80]:
# better to read the file instead
balanced_df = pd.read_csv("../data/data/balanced_data.csv")

In [83]:
# getting the rows for balanced validation set 
bal_not_val = balanced_df[(balanced_df['set'] == 'val') & (balanced_df['label'] == 0)].reset_index(drop=True)
bal_off_val = balanced_df[(balanced_df['set'] == 'val') & (balanced_df['label'] == 1)].reset_index(drop=True)
# and for train
bal_not_train = balanced_df[(balanced_df['set'] == 'train') & (balanced_df['label'] == 0)].reset_index(drop=True)
bal_off_train = balanced_df[(balanced_df['set'] == 'train') & (balanced_df['label'] == 1)].reset_index(drop=True)
# and for test
bal_not_test = balanced_df[(balanced_df['set'] == 'test') & (balanced_df['label'] == 0)].reset_index(drop=True)
bal_off_test = balanced_df[(balanced_df['set'] == 'test') & (balanced_df['label'] == 1)].reset_index(drop=True)

In [81]:
positive = False

In [82]:
if "bal.goodMemesList.txt.test" not in os.listdir("../data/data/"):
    if positive:
        write_txt(bal_not_test, "bal.goodMemesList.txt.test")
        write_txt(bal_off_test, "bal.hateMemesList.txt.test")
        write_txt(bal_not_train, "bal.goodMemesList.txt.train")
        write_txt(bal_off_train, "bal.hateMemesList.txt.train")
        write_txt(bal_not_val, "bal.goodMemesList.txt.val")
        write_txt(bal_off_val, "bal.hateMemesList.txt.val")

In [86]:
def write_caption_to_file(df, basedir):
    for img, text in zip(df.loc[:, "img"].to_list(), df.loc[:, "text"].to_list()):
        filename = basedir + img + ".ocr"
        with open(filename, "w") as f:
            f.write(text)
    print("Done!")
    

In [88]:
write_new_captions = False

In [89]:
if write_new_captions:    
    write_caption_to_file(off_test, "/home/gushansad@GU.GU.SE/lt2318-ai/aics-project/data/data/")
    write_caption_to_file(not_test, "/home/gushansad@GU.GU.SE/lt2318-ai/aics-project/data/data/")
    write_caption_to_file(not_train, "/home/gushansad@GU.GU.SE/lt2318-ai/aics-project/data/data/")
    write_caption_to_file(off_train, "/home/gushansad@GU.GU.SE/lt2318-ai/aics-project/data/data/")
    write_caption_to_file(not_val, "/home/gushansad@GU.GU.SE/lt2318-ai/aics-project/data/data/")
    write_caption_to_file(off_val, "/home/gushansad@GU.GU.SE/lt2318-ai/aics-project/data/data/")