## TEXT
* remove_null (drop rows with missing values)
* lower case
* remove punctuation
* stem and lemmatize

## IMAGE
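* resize all to 224x224 and save (grayscale) — note: the code below only resizes; it does not convert to grayscale
* augmentation using GANs — note: the code below applies a random rotation, Gaussian noise, or blur; no GAN is used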

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import os
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import tensorflow as tf
from PIL import Image, ImageFilter

In [None]:
# Load the three dataset splits. lines=True makes pandas parse each file as
# JSON Lines (one JSON object per line).
train_df = pd.read_json('../data/facebook/train.jsonl', lines = True)

test_df = pd.read_json('../data/facebook/test.jsonl', lines = True)

dev_df = pd.read_json('../data/facebook/dev.jsonl', lines = True)

# Last expression of the cell: previews the first rows of the test split.
test_df.head()

In [None]:
class Text_Preprocessor:
    """Clean the ``text`` column of a DataFrame in place.

    The DataFrame passed to the constructor is not copied: every step
    mutates it directly and ``get_df`` returns that same object.

    ``remove_punctuations`` and ``stem_and_lemmatize`` call string methods on
    every ``text`` value, so run ``remove_null`` before them if the column may
    contain missing values.
    """

    def __init__(self, df):
        """Wrap ``df`` (a DataFrame with a ``text`` column) without copying it."""
        self.df = df

    def get_df(self):
        """Return the wrapped DataFrame."""
        return self.df

    def lower_case(self):
        """Lower-case every string in ``text``.

        Non-string values (e.g. missing values) are passed through unchanged
        instead of raising, so this step is safe to run before
        ``remove_null``.
        """
        self.df['text'] = self.df['text'].apply(
            lambda value: value.lower() if isinstance(value, str) else value
        )

    def remove_punctuations(self):
        """Delete every ``string.punctuation`` character from ``text``.

        Tokens that consisted only of punctuation disappear, and the
        remaining tokens are joined with single spaces.
        """
        # The table is loop-invariant, so build it once.
        table = str.maketrans('', '', string.punctuation)
        cleaned_text = [
            # Splitting after translating drops tokens that became empty,
            # which avoids leaving double spaces in the result.
            " ".join(text.translate(table).split())
            for text in tqdm(self.df['text'])
        ]
        # A list is assigned positionally, so a non-contiguous index is fine.
        self.df['text'] = cleaned_text

    def remove_null(self):
        """Drop, in place, every row with a missing value in any column.

        The index is not reset, so it may contain gaps afterwards.
        """
        self.df.dropna(inplace=True)

    def stem_and_lemmatize(self):
        """Lemmatize and then stem each whitespace-separated token of ``text``.

        Each token goes through ``WordNetLemmatizer.lemmatize`` followed by
        ``PorterStemmer.stem``; tokens are re-joined with single spaces.
        """
        lemmatizer = WordNetLemmatizer()
        stemmer = PorterStemmer()

        cleaned_text = [
            " ".join(
                stemmer.stem(lemmatizer.lemmatize(word)) for word in text.split()
            )
            for text in tqdm(self.df['text'])
        ]
        self.df['text'] = cleaned_text

In [None]:
# Clean the text column of every split. Rows with missing values are dropped
# first so the string-based steps that follow only ever receive strings.
train_text_preprocessor = Text_Preprocessor(train_df)
train_text_preprocessor.remove_null()
train_text_preprocessor.lower_case()
train_text_preprocessor.remove_punctuations()
train_text_preprocessor.stem_and_lemmatize()
train_df = train_text_preprocessor.get_df()

test_text_preprocessor = Text_Preprocessor(test_df)
test_text_preprocessor.remove_null()
test_text_preprocessor.lower_case()
test_text_preprocessor.remove_punctuations()
test_text_preprocessor.stem_and_lemmatize()
test_df = test_text_preprocessor.get_df()

dev_text_preprocessor = Text_Preprocessor(dev_df)
dev_text_preprocessor.remove_null()
dev_text_preprocessor.lower_case()
dev_text_preprocessor.remove_punctuations()
dev_text_preprocessor.stem_and_lemmatize()
dev_df = dev_text_preprocessor.get_df()

# NOTE: these writes overwrite the input files. orient='records' together
# with lines=True emits one JSON object per line, which is the layout that
# pd.read_json(..., lines=True) expects when the files are loaded again.
train_df.to_json('../data/facebook/train.jsonl', orient='records', lines=True)
test_df.to_json('../data/facebook/test.jsonl', orient='records', lines=True)
dev_df.to_json('../data/facebook/dev.jsonl', orient='records', lines=True)

# Last expression of the cell: previews the cleaned training split.
train_df.head()

In [None]:
class Image_Preprocessing:
    """Resize and augment the image files listed in a DataFrame's ``img`` column.

    Values in ``img`` are paths relative to ``DATA_ROOT``. The DataFrame is
    not copied: augmentation appends rows to it in place and ``get_df``
    returns that same object.
    """

    # Directory that the relative paths stored in ``img`` are joined onto.
    DATA_ROOT = '../data/facebook'
    # Rotations are drawn uniformly from [-MAX_ROTATION_DEGREES, +MAX_ROTATION_DEGREES].
    MAX_ROTATION_DEGREES = 60
    # Standard deviation of the additive Gaussian noise, on the 0-255 pixel scale.
    NOISE_STDDEV = 50.0

    def __init__(self, df, dir_name):
        """Wrap ``df`` without copying it.

        ``dir_name`` is stored on the instance but is not read by any method
        of this class.
        """
        self.df = df
        self.dir_name = dir_name

    def get_df(self):
        """Return the wrapped DataFrame, including rows added by augmentation."""
        return self.df

    def resize_and_grayscale(self, size=(224, 224), grayscale=False):
        """Resize every image listed in ``img`` and overwrite it on disk.

        Args:
            size: Target ``(width, height)`` passed to ``Image.resize``.
            grayscale: When true, also convert the image to single-channel
                mode ``'L'`` before saving. Defaults to ``False``, which
                keeps the existing resize-only behaviour.
        """
        for img in tqdm(self.df['img']):
            path = os.path.join(self.DATA_ROOT, img)
            # resize() returns an independent image, so the source handle can
            # be closed before the same path is overwritten.
            with Image.open(path) as source:
                resized = source.resize(size)
            if grayscale:
                resized = resized.convert('L')
            resized.save(path)

    @staticmethod
    def _augmented_name(img):
        """Return the relative path of the augmented copy of ``img``.

        ``str.strip('.pngim')`` removes a *set of characters* from both ends,
        not a prefix/suffix, so it left a stray leading slash and could eat
        letters of the file stem. The stem is taken with ``os.path`` instead.
        """
        stem = os.path.splitext(os.path.basename(img))[0]
        return "img/" + stem + "_augmented.png"

    def _load_rgb(self, img):
        """Open ``img`` under ``DATA_ROOT`` and return it as an RGB image.

        Converting to RGB gives every augmentation a 3-channel input:
        the rotation is called with ``channel_axis=2`` and PIL filters
        reject palette images.
        """
        with Image.open(os.path.join(self.DATA_ROOT, img)) as source:
            return source.convert('RGB')

    def _append_augmented_row(self, img, new_file_name):
        """Append a copy of the first row whose ``img`` equals ``img``.

        The copy is identical except that ``img`` is ``new_file_name``, so it
        works for any column order and with or without a ``label`` column.
        Raises ``IndexError`` if no row matches ``img``.
        """
        row = self.df.loc[self.df['img'] == img].iloc[0].copy()
        row['img'] = new_file_name

        index = self.df.index
        if pd.api.types.is_integer_dtype(index.dtype):
            # len(index) can collide with an existing label once rows have
            # been dropped; max + 1 is always unused.
            new_label = int(index.max()) + 1
        else:
            new_label = len(index)
        self.df.loc[new_label] = row.tolist()

    def __rotate(self, img):
        """Save a randomly rotated copy of ``img`` and register it in the frame."""
        new_file_name = self._augmented_name(img)
        pixels = np.array(self._load_rgb(img))
        # The second argument is the rotation *range* in degrees; the function
        # draws the actual angle itself, so a fixed bound is passed.
        rotated = tf.keras.preprocessing.image.random_rotation(
            pixels, self.MAX_ROTATION_DEGREES, row_axis=0, col_axis=1, channel_axis=2
        )
        rotated = np.asarray(rotated).astype(np.uint8)
        Image.fromarray(rotated).save(os.path.join(self.DATA_ROOT, new_file_name))
        self._append_augmented_row(img, new_file_name)

    def __gaussian_noise(self, img):
        """Save a copy of ``img`` with additive Gaussian noise and register it."""
        new_file_name = self._augmented_name(img)
        pixels = np.array(self._load_rgb(img), dtype=np.float32)
        noise = tf.random.normal(
            shape=pixels.shape, mean=0.0, stddev=self.NOISE_STDDEV, dtype=tf.float32
        ).numpy()
        # Pixels are on the 0-255 scale, so clip to that range; clipping to
        # [0, 1] and multiplying by 255 saturated almost every pixel.
        noisy = np.clip(pixels + noise, 0, 255).astype(np.uint8)
        Image.fromarray(noisy).save(os.path.join(self.DATA_ROOT, new_file_name))
        self._append_augmented_row(img, new_file_name)

    def __blur(self, img):
        """Save a blurred copy of ``img`` (``ImageFilter.BLUR``) and register it."""
        new_file_name = self._augmented_name(img)
        blurred = self._load_rgb(img).filter(ImageFilter.BLUR)
        blurred.save(os.path.join(self.DATA_ROOT, new_file_name))
        self._append_augmented_row(img, new_file_name)

    def augmentation(self):
        """Create one randomly chosen augmented copy of every listed image.

        For each image one of rotation, Gaussian noise or blur is picked at
        random; the new file is written next to the data and a matching row
        is appended to the DataFrame.
        """
        augmentations = [self.__rotate, self.__gaussian_noise, self.__blur]
        # Snapshot the paths first: rows are appended to the frame while looping.
        for img in tqdm(list(self.df['img'])):
            selected_augmentation = np.random.choice(augmentations)
            selected_augmentation(img)

In [None]:
%%time

# For each split: augmentation() writes one augmented copy per image and adds
# a row for it to the frame; resize_and_grayscale() then resizes every file
# listed in the frame, so it has to run after augmentation to cover the new
# copies as well.
train_img_preprocessor = Image_Preprocessing(train_df, "train")
train_img_preprocessor.augmentation()
train_img_preprocessor.resize_and_grayscale()
train_df = train_img_preprocessor.get_df()

# NOTE(review): the test and dev splits are augmented as well, which adds
# rows to the evaluation data — confirm this is intended.
test_img_preprocessor = Image_Preprocessing(test_df, "test")
test_img_preprocessor.augmentation()
test_img_preprocessor.resize_and_grayscale()
test_df = test_img_preprocessor.get_df()

dev_img_preprocessor = Image_Preprocessing(dev_df, "dev")
dev_img_preprocessor.augmentation()
dev_img_preprocessor.resize_and_grayscale()
dev_df = dev_img_preprocessor.get_df()

In [None]:
# Persist the final frames over the input files. lines=True (valid only with
# orient='records') writes one JSON object per line, so the files stay
# readable by pd.read_json(..., lines=True).
train_df.to_json('../data/facebook/train.jsonl', orient='records', lines=True)
test_df.to_json('../data/facebook/test.jsonl', orient='records', lines=True)
dev_df.to_json('../data/facebook/dev.jsonl', orient='records', lines=True)