In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from tqdm.auto import tqdm

data_dir = '/home/pratzohol/google-drive/work-stuff/harmful-meme-detection/datasets/fb-meme'
img_dir = os.path.join(data_dir, 'img')

In [11]:
splits = ['train', 'val', 'test']

df = []
for split in splits:
    if split == 'val':
        split = 'dev'

    file_path = os.path.join(data_dir, f'{split}.jsonl')
    split_df = pd.read_json(file_path, lines=True)
    split_df['split'] = split
    df.append(split_df)


In [12]:
df = pd.concat(df, axis=0, ignore_index=True)
df['id'] = df['img'].str.split('/').str[1].str.split('.').str[0]
df.index = df['id']
df.index.name = None
df.loc[df["split"] == 'dev', "split"] = "val"
print(df.shape)
df.head()

(10000, 5)


Unnamed: 0,id,img,label,text,split
42953,42953,img/42953.png,0.0,its their character not their color that matters,train
23058,23058,img/23058.png,0.0,don't be afraid to love again everyone is not ...,train
13894,13894,img/13894.png,0.0,putting bows on your pet,train
37408,37408,img/37408.png,0.0,i love everything and everybody! except for sq...,train
82403,82403,img/82403.png,0.0,"everybody loves chocolate chip cookies, even h...",train


In [13]:
df['split'].value_counts()

split
train    8500
test     1000
val       500
Name: count, dtype: int64

In [14]:
df['label'].value_counts(normalize=False)

label
0.0    5700
1.0    3300
Name: count, dtype: int64

In [15]:
# df["text"].to_csv("english.txt", index=False, header=False)

In [16]:
import cv2
import pytesseract

def preprocess_image(im):
    """Summary

    Args:
        im (np.array): Image in BGR format after using cv2.imread(<filePath>)

    Returns:
        np.array :
    """
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    im = cv2.bilateralFilter(im, 9, 55, 60)
    _, im = cv2.threshold(im, 235, 255, cv2.THRESH_BINARY_INV)
    return im


def extract_text_from_meme_eng(file_path):
    im = cv2.imread(file_path)
    im = preprocess_image(im)

    tess_config = r'-l eng --tessdata-dir /usr/share/tesseract-ocr/tessdata_best --oem 1 --psm 11'
    txt = pytesseract.image_to_string(im, config=tess_config)
    txt = txt.replace('\n', ' ').strip()

    return txt


def extract_text_from_meme_all(file_path):
    im = cv2.imread(file_path)
    im = preprocess_image(im)

    tess_config = r'-l eng+chi_sim+chi_tra+tam+msa --tessdata-dir /usr/share/tesseract-ocr/tessdata_best --oem 1 --psm 11'
    txt = pytesseract.image_to_string(im, config=tess_config)
    txt = txt.replace('\n', ' ').strip()

    return txt

In [5]:
file_paths = list(df["img"])
file_paths = [f"{data_dir}/{fp}" for fp in file_paths]

dir = "/home/pratzohol/google-drive/work-stuff/harmful-meme-detection/datasets/fb-meme/ocr/"
all_dir = dir + "all.txt"
eng_dir = dir + "eng.txt"


In [41]:
# from tqdm import tqdm

# with open(all_dir, "w") as f:
#     for fp in tqdm(file_paths):
#         txt_all = extract_text_from_meme_all(fp)
#         f.write(txt_all)
#         f.write("\n")

100%|██████████| 10000/10000 [3:48:59<00:00,  1.37s/it] 


In [42]:
# with open(eng_dir, "w") as f:
#     for fp in tqdm(file_paths):
#         txt_eng = extract_text_from_meme_eng(fp)
#         f.write(txt_eng)
#         f.write("\n")

100%|██████████| 10000/10000 [37:15<00:00,  4.47it/s]


In [6]:
eng_dir_easyocr = dir + "eng_easyocr.txt"

In [23]:
# from tqdm import tqdm
# import easyocr

# reader = easyocr.Reader(['en'])

# with open(eng_dir_easyocr, "w") as f:
#     for fp in tqdm(file_paths):
#         txt = reader.readtext(fp, detail=0)
#         txt = " ".join(txt)
#         f.write(txt)
        # f.write("\n")

100%|██████████| 10000/10000 [38:12<00:00,  4.36it/s] 


In [13]:
df = pd.read_csv(data_dir + "/fb_hateful_memes_info.csv")

with open(eng_dir_easyocr, "r") as f:
    texts = f.readlines()

In [18]:
df["text_easyocr"] = texts
df

Unnamed: 0,id,img,label,text,split,text_easyocr
0,42953,img/42953.png,0.0,its their character not their color that matters,train,its their character n0t their color that mnatt...
1,23058,img/23058.png,0.0,don't be afraid to love again everyone is not ...,train,dJomFt be afaid to love again eqeryone Is not ...
2,13894,img/13894.png,0.0,putting bows on your pet,train,putling bbows @m Vouc peu\n
3,37408,img/37408.png,0.0,i love everything and everybody! except for sq...,train,i love everything and everybodlyl except for s...
4,82403,img/82403.png,0.0,"everybody loves chocolate chip cookies, even h...",train,"everybody loves chocolate chip cookies, even h..."
...,...,...,...,...,...,...
9995,3869,img/03869.png,,a mother's love for the child is a divine thing,test,a mother 's Iove for the child is a divine thi...
9996,23817,img/23817.png,,sea monkeys,test,sea momkevs\n
9997,56280,img/56280.png,,little miss muffet sat on her tuffet,test,1018 SUMMIT rLEvOIX SOMMET 57 DE 2018 little m...
9998,29384,img/29384.png,,they're in a row,test,thev're in a COw 0118402885 01let0813 0lleI023...


In [19]:
# df.to_csv(data_dir + "/fb_hateful_memes_info.csv", index=False)