## TEXT
* remove rows with null values
* lowercase
* strip punctuation
* stem and lemmatize

## IMAGE
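* resize all images to 224x224 and save them in place (note: the code below does not currently convert to grayscale)
* augmentation: one of random rotation, Gaussian noise, or blur per image (GAN-based augmentation is not implemented in this notebook)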

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import os
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import tensorflow as tf
from PIL import Image, ImageFilter

[nltk_data] Downloading package wordnet to /home/sameep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2023-03-31 22:17:57.811510: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-31 22:17:58.128145: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-31 22:17:58.155328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No su

In [2]:
# Read the three dataset splits (JSON Lines: one record per line) in the
# order train, test, dev, then preview the test split.
train_df, test_df, dev_df = (
    pd.read_json(path, lines=True)
    for path in (
        '../data/facebook/train.jsonl',
        '../data/facebook/test.jsonl',
        '../data/facebook/dev.jsonl',
    )
)

test_df.head()

Unnamed: 0,id,img,text
0,16395,img/16395.png,handjobs sold seperately
1,37405,img/37405.png,introducing fidget spinner for women
2,94180,img/94180.png,happy pride month let's go beat up lesbians
3,54321,img/54321.png,laughs in [majority of u.s crime rate]
4,97015,img/97015.png,finds out those 72 virgins.. are goats


In [3]:
class Text_Preprocessor:
    """Step-by-step cleaner for the ``text`` column of a DataFrame.

    Every step rewrites ``self.df`` (the frame passed to the constructor);
    call :meth:`get_df` to obtain the result. The steps are independent, but
    ``remove_punctuations`` and ``stem_and_lemmatize`` expect every ``text``
    value to be a string, so run ``remove_null`` before them.
    """

    # Deletes every ASCII punctuation character; built once instead of per row.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, df):
        """Store ``df``; it must contain a ``text`` column."""
        self.df = df

    def get_df(self):
        """Return the DataFrame in its current (possibly cleaned) state."""
        return self.df

    def lower_case(self):
        """Lower-case every string in ``text``.

        Non-string values (e.g. NaN/None for missing text) are passed through
        unchanged instead of raising ``TypeError``, so this step is safe to
        run before ``remove_null``.
        """
        self.df['text'] = self.df['text'].apply(
            lambda text: text.lower() if isinstance(text, str) else text
        )

    @classmethod
    def _strip_punctuation(cls, text):
        """Return ``text`` with ASCII punctuation removed.

        The text is split on whitespace, punctuation is deleted from each
        token, and tokens that end up empty (they were pure punctuation) are
        dropped so the result never contains doubled spaces.
        """
        stripped = (word.translate(cls._PUNCTUATION_TABLE) for word in text.split())
        return " ".join(word for word in stripped if word)

    def remove_punctuations(self):
        """Strip ASCII punctuation from every ``text`` value (shows a progress bar)."""
        cleaned_text = [self._strip_punctuation(text) for text in tqdm(self.df['text'])]
        self.df['text'] = cleaned_text

    def remove_null(self):
        """Drop, in place, every row that has a null in any column."""
        # A single dropna covers all columns; when nothing is null it is a no-op.
        self.df.dropna(inplace=True)

    def stem_and_lemmatize(self):
        """Lemmatize and then Porter-stem every whitespace-separated token of ``text``."""
        lemmatizer = WordNetLemmatizer()
        stemmer = PorterStemmer()

        cleaned_text = []
        for text in tqdm(self.df['text']):
            words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in text.split()]
            cleaned_text.append(" ".join(words))

        self.df['text'] = cleaned_text

In [4]:
def run_text_pipeline(df):
    """Run the full text-cleaning pipeline on ``df`` and return the preprocessor.

    Steps, in order: drop rows with nulls, lower-case, strip punctuation,
    lemmatize + stem. Nulls are removed first because the later steps call
    string methods on every ``text`` value and would fail on a missing one.
    """
    preprocessor = Text_Preprocessor(df)
    preprocessor.remove_null()
    preprocessor.lower_case()
    preprocessor.remove_punctuations()
    preprocessor.stem_and_lemmatize()
    return preprocessor


train_text_preprocessor = run_text_pipeline(train_df)
train_df = train_text_preprocessor.get_df()

test_text_preprocessor = run_text_pipeline(test_df)
test_df = test_text_preprocessor.get_df()

dev_text_preprocessor = run_text_pipeline(dev_df)
dev_df = dev_text_preprocessor.get_df()

# Write JSON Lines (one record per line) so the files stay readable by the
# `pd.read_json(..., lines=True)` loader used at the top of this notebook.
# NOTE(review): this overwrites the raw input files with the cleaned text.
train_df.to_json('../data/facebook/train.jsonl', orient='records', lines=True)
test_df.to_json('../data/facebook/test.jsonl', orient='records', lines=True)
dev_df.to_json('../data/facebook/dev.jsonl', orient='records', lines=True)

train_df.head()

100%|██████████| 8500/8500 [00:00<00:00, 40826.65it/s]
100%|██████████| 8500/8500 [00:03<00:00, 2233.64it/s]
100%|██████████| 1000/1000 [00:00<00:00, 53790.37it/s]
100%|██████████| 1000/1000 [00:00<00:00, 4718.27it/s]
100%|██████████| 500/500 [00:00<00:00, 68520.94it/s]
100%|██████████| 500/500 [00:00<00:00, 5432.70it/s]


Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,it their charact not their color that matter
1,23058,img/23058.png,0,dont be afraid to love again everyon is not li...
2,13894,img/13894.png,0,put bow on your pet
3,37408,img/37408.png,0,i love everyth and everybodi except for squirr...
4,82403,img/82403.png,0,everybodi love chocol chip cooki even hitler


In [5]:
class Image_Preprocessing:
    """Resize and augment the image files listed in a DataFrame's ``img`` column.

    ``img`` values are paths relative to ``data_root`` (e.g. ``img/16395.png``).
    Image files are modified/created on disk; ``augmentation`` additionally
    extends the DataFrame with one row per augmented image. Always fetch the
    result with :meth:`get_df` -- ``augmentation`` replaces ``self.df`` with a
    new frame rather than growing the original object.
    """

    # Rotation angles are drawn from [-MAX_ROTATION_DEGREES, +MAX_ROTATION_DEGREES].
    MAX_ROTATION_DEGREES = 60
    # Standard deviation of the additive noise, on the 0-255 pixel scale.
    NOISE_STDDEV = 50
    # Suffix (including extension) given to every augmented copy.
    AUGMENTED_SUFFIX = "_augmented.png"

    def __init__(self, df, dir_name, data_root='../data/facebook'):
        """
        Parameters
        ----------
        df : DataFrame with an ``img`` column of paths relative to ``data_root``.
        dir_name : label for the split (stored, not otherwise used by this class).
        data_root : directory the ``img`` paths are resolved against.
        """
        self.df = df
        self.dir_name = dir_name
        self.data_root = data_root

    def get_df(self):
        """Return the current DataFrame (including any augmented rows)."""
        return self.df

    def resize_and_grayscale(self, size=(224, 224), grayscale=False):
        """Resize every image listed in ``img`` to ``size`` and overwrite it on disk.

        NOTE(review): despite the method name, the original implementation
        never converted to grayscale. That behaviour is kept as the default;
        pass ``grayscale=True`` to also convert each image to mode ``L``.
        """
        for img in tqdm(self.df['img']):
            path = os.path.join(self.data_root, img)
            # resize() loads the pixels and returns an independent image, so
            # the file handle can be closed before the path is overwritten.
            with Image.open(path) as source:
                resized = source.resize(size)
            if grayscale:
                resized = resized.convert('L')
            resized.save(path)

    @classmethod
    def _augmented_name(cls, img):
        """Return the relative path for the augmented copy of ``img``.

        ``img/123.png`` -> ``img/123_augmented.png``. Uses ``splitext`` rather
        than ``str.strip('.pngim')``: ``strip`` removes a *set of characters*
        from both ends, which yields ``img//123...`` and mangles names that
        start or end with any of those letters (e.g. ``img/pig.png``).
        """
        root, _extension = os.path.splitext(img)
        return root + cls.AUGMENTED_SUFFIX

    def __load(self, img):
        """Open ``img`` and return an in-memory PIL image detached from the file.

        Modes other than L/RGB/RGBA (e.g. palette images) are converted to
        RGB so that filtering and per-pixel arithmetic are meaningful.
        """
        with Image.open(os.path.join(self.data_root, img)) as source:
            if source.mode in ('L', 'RGB', 'RGBA'):
                return source.copy()
            return source.convert('RGB')

    def __rotate(self, img):
        """Save a randomly rotated copy of ``img``; return its relative path."""
        new_file_name = self._augmented_name(img)

        pixels = np.array(self.__load(img))
        single_channel = pixels.ndim == 2
        if single_channel:
            # random_rotation needs an explicit channel axis.
            pixels = pixels[:, :, np.newaxis]
        # The second argument is the rotation *range* in degrees: the helper
        # draws the actual angle itself, so no pre-sampled angle is passed.
        rotated = tf.keras.preprocessing.image.random_rotation(
            pixels, self.MAX_ROTATION_DEGREES, row_axis=0, col_axis=1, channel_axis=2
        )
        rotated = np.asarray(rotated, dtype=np.uint8)
        if single_channel:
            rotated = rotated[:, :, 0]

        Image.fromarray(rotated).save(os.path.join(self.data_root, new_file_name))
        return new_file_name

    def __gaussian_noise(self, img):
        """Save a copy of ``img`` with additive Gaussian noise; return its relative path."""
        new_file_name = self._augmented_name(img)

        pixels = np.array(self.__load(img)).astype(np.float32)
        noise = np.random.normal(0.0, self.NOISE_STDDEV, size=pixels.shape)
        # Pixels are on the 0-255 scale, so clip there. Clipping to [0, 1]
        # and multiplying by 255 (as before) saturated almost every pixel.
        noisy = np.clip(pixels + noise, 0, 255).astype(np.uint8)

        Image.fromarray(noisy).save(os.path.join(self.data_root, new_file_name))
        return new_file_name

    def __blur(self, img):
        """Save a blurred copy of ``img``; return its relative path."""
        new_file_name = self._augmented_name(img)

        blurred = self.__load(img).filter(ImageFilter.BLUR)
        blurred.save(os.path.join(self.data_root, new_file_name))
        return new_file_name

    def augmentation(self, seed=None):
        """Create one randomly augmented copy of every image and add a row for it.

        For each row, one of rotate / Gaussian noise / blur is chosen at
        random, the augmented file is written next to the original, and a new
        row is appended that copies every column of the source row except
        ``img``, which points at the new file.

        Parameters
        ----------
        seed : optional int. When given, NumPy's global RNG is seeded first so
            the choice of augmentation and the noise are reproducible
            (the rotation helper is presumed to draw from the same RNG --
            TODO confirm for the installed Keras version).
        """
        if seed is not None:
            np.random.seed(seed)

        augmentations = [self.__rotate, self.__gaussian_noise, self.__blur]

        new_file_names = []
        for img in tqdm(self.df['img']):
            # Select and apply a random augmentation.
            selected_augmentation = augmentations[np.random.randint(len(augmentations))]
            new_file_names.append(selected_augmentation(img))

        # Build all new rows at once: copying the frame keeps every column
        # (and its dtype) regardless of column order, and a single concat
        # avoids both the quadratic row-by-row growth and the risk of
        # `df.loc[len(df)]` overwriting an existing label when the index has
        # gaps (e.g. after rows were dropped).
        augmented_rows = self.df.copy()
        augmented_rows['img'] = new_file_names
        self.df = pd.concat([self.df, augmented_rows], ignore_index=True)

In [6]:
%%time

def run_image_pipeline(df, split_name):
    """Augment and then resize the images of one split; return the preprocessor.

    Augmentation runs first, so the resize pass iterates the ``img`` column
    after the augmented rows have been added.
    """
    preprocessor = Image_Preprocessing(df, split_name)
    preprocessor.augmentation()
    preprocessor.resize_and_grayscale()
    return preprocessor


# NOTE(review): the test and dev splits are augmented as well as train --
# confirm that is intended for evaluation data.
train_img_preprocessor = run_image_pipeline(train_df, "train")
train_df = train_img_preprocessor.get_df()

test_img_preprocessor = run_image_pipeline(test_df, "test")
test_df = test_img_preprocessor.get_df()

dev_img_preprocessor = run_image_pipeline(dev_df, "dev")
dev_df = dev_img_preprocessor.get_df()

  0%|          | 0/8500 [00:00<?, ?it/s]2023-03-31 22:18:04.848702: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-31 22:18:04.848757: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-31 22:18:04.848817: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (sameep-Inspiron-5593): /proc/driver/nvidia/version does not exist
2023-03-31 22:18:04.850598: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
100

CPU times: user 59min 13s, sys: 41 s, total: 59min 54s
Wall time: 57min 23s





In [None]:
# Persist the augmented splits as JSON Lines (one record per line). Without
# `lines=True`, `orient='records'` writes a single JSON array, which does not
# match the `.jsonl` extension and cannot be read back by the
# `pd.read_json(..., lines=True)` calls at the top of this notebook.
# NOTE(review): these writes overwrite the input files, so re-running the
# notebook re-processes already augmented data.
train_df.to_json('../data/facebook/train.jsonl', orient = 'records', lines = True)
test_df.to_json('../data/facebook/test.jsonl', orient = 'records', lines = True)
dev_df.to_json('../data/facebook/dev.jsonl', orient = 'records', lines = True)