## TEXT
* remove_null
* lower case
* stem and lemmatize

## IMAGE
* resize all and save (grayscale)
* augmentation using GANs

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import nltk
nltk.download('wordnet')
import cv2
from nltk.stem import WordNetLemmatizer, PorterStemmer
import tensorflow as tf
from PIL import Image

[nltk_data] Downloading package wordnet to /home/sameep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2023-03-16 15:50:22.752055: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 15:50:23.120580: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-16 15:50:23.128294: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No su

In [2]:
train_df = pd.read_csv('../data/facebook/train.csv')
train_df.drop('Unnamed: 0', axis = 1, inplace = True)

test_df = pd.read_csv('../data/facebook/test.csv')
test_df.drop('Unnamed: 0', axis = 1, inplace = True)

val_df = pd.read_csv('../data/facebook/val.csv')
val_df.drop('Unnamed: 0', axis = 1, inplace = True)

test_df.head()

Unnamed: 0,id,text
0,16395,handjob sold seper
1,37405,introduc fidget spinner for woman
2,94180,happi pride month let' go beat up lesbian
3,54321,laugh in [major of u. crime rate]
4,97015,find out those 72 virgins.. are goat


In [3]:
class Text_Preprocessor:
    def __init__(self, df):
        self.df = df

    def get_df(self):
        return self.df
    
    def lower_case(self):
        self.df['text'] = self.df['text'].apply(str.lower)

    def remove_null(self):
        for col in self.df.columns:
            if(self.df[col].isnull().sum() > 0):
                self.df.dropna(inplace = True)

    def stem_and_lemmatize(self):
        cleaned_text = []
        lemmatizer = WordNetLemmatizer()
        ps = PorterStemmer()

        for index in tqdm(range(self.df.shape[0])):
            text = self.df['text'].iloc[index]

            word_tokens = text.split()

            for count, word in enumerate(word_tokens):
                temp = lemmatizer.lemmatize(word)
                word_tokens[count] = ps.stem(temp)

            filtered_sentence = " ".join(word_tokens).strip()
            cleaned_text.append(filtered_sentence)
        
        self.df['text'] = np.array(cleaned_text)

In [4]:
train_text_preprocessor = Text_Preprocessor(train_df)
train_text_preprocessor.lower_case()
train_text_preprocessor.remove_null()
train_text_preprocessor.stem_and_lemmatize()
train_df = train_text_preprocessor.get_df()

test_text_preprocessor = Text_Preprocessor(test_df)
test_text_preprocessor.lower_case()
test_text_preprocessor.remove_null()
test_text_preprocessor.stem_and_lemmatize()
test_df = test_text_preprocessor.get_df()

val_text_preprocessor = Text_Preprocessor(val_df)
val_text_preprocessor.lower_case()
val_text_preprocessor.remove_null()
val_text_preprocessor.stem_and_lemmatize()
val_df = val_text_preprocessor.get_df()

train_df.to_csv('../data/facebook/train.csv')
test_df.to_csv('../data/facebook/test.csv')
val_df.to_csv('../data/facebook/val.csv')

train_df.head()

100%|██████████| 8500/8500 [00:08<00:00, 1056.69it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1836.59it/s]
100%|██████████| 500/500 [00:00<00:00, 1564.57it/s]


Unnamed: 0,id,text,label
0,42953,it their charact not their color that matter,0
1,23058,don't be afraid to love again everyon is not l...,0
2,13894,put bow on your pet,0
3,37408,i love everyth and everybody! except for squir...,0
4,82403,"everybodi love chocol chip cookies, even hitler",0


In [7]:
class Image_Preprocessing:
    def __init__(self, df, dir_name):
        self.df = df
        self.dir_name = dir_name
    
    def get_df(self):
        return self.df
    
    def resize_and_grayscale(self):
        for id in tqdm(self.df['id']):
            file_name = str(id) + '.png'
            _image = Image.open(os.path.join('../data/facebook/img', self.dir_name, file_name))
            _image = _image.convert('L')
            _image = _image.resize((224, 224))
            _image.save(os.path.join('../data/facebook/img', self.dir_name, file_name))
            

    def __rotate(self, id):
        file_name = str(id) + '.png'
        new_file_name = str(id) + "_augmented.png"
        path = os.path.join('../data/facebook/img', self.dir_name, file_name)
        _image = np.array(Image.open(path))
        angle = tf.random.uniform([], minval=-60, maxval=60)
        _image = tf.keras.preprocessing.image.random_rotation(_image, angle, row_axis=0, col_axis=1, channel_axis=2)
        _image = Image.fromarray(_image)
        _image.save(os.path.join('../data/facebook/img', self.dir_name, new_file_name))
        text = self.df[self.df['id'] == id]['text']
        new_id = new_file_name.strip('.png')
        if 'label' in self.df.columns:
            label = self.df[self.df['id'] == id]['label']
            self.df.loc[len(self.df.index)] = [new_id, text, label]
        else:
            self.df.loc[len(self.df.index)] = [new_id, text]

    def __gaussian_noise(self, id):
        file_name = str(id) + '.png'
        new_file_name = str(id) + "_augmented.png"
        path = os.path.join('../data/facebook/img', self.dir_name, file_name)
        _image = np.array(Image.open(path))
        noise = tf.random.normal(shape=tf.shape(_image), mean=0.0, stddev=1.0, dtype=tf.float32)
        _image = tf.clip_by_value(_image + noise, 0, 1)
        _image = np.array(_image)
        _image = Image.fromarray(_image)
        _image.save(os.path.join('../data/facebook/img/', self.dir_name, new_file_name))
        text = self.df[self.df['id'] == id]['text']
        new_id = new_file_name.strip('.png')
        if 'label' in self.df.columns:
            label = self.df[self.df['id'] == id]['label']
            self.df.loc[len(self.df.index)] = [new_id, text, label]
        else:
            self.df.loc[len(self.df.index)] = [new_id, text]

    def __blur(self, id):
        file_name = str(id) + '.png'
        new_file_name = str(id) + "_augmented.png"
        path = os.path.join('../data/facebook/img', self.dir_name, file_name)
        _image = np.array(Image.open(path))
        kernel_size = tf.random.uniform([], minval=3, maxval=7, dtype=tf.int32)
        _image = tf.expand_dims(cv2.GaussianBlur(_image.np(), (kernel_size, kernel_size), 0), axis=2)
        _image = Image.fromarray(_image)
        _image.save(os.path.join('../data/facebook/img/', self.dir_name, new_file_name))
        text = self.df[self.df['id'] == id]['text']
        new_id = new_file_name.strip('.png')
        if 'label' in self.df.columns:
            label = self.df[self.df['id'] == id]['label']
            self.df.loc[len(self.df.index)] = [new_id, text, label]
        else:
            self.df.loc[len(self.df.index)] = [new_id, text]

    def augmentation(self):
        for id in tqdm(self.df['id']):
            # # Select a random augmentation
            # augmentations = [self.__rotate, self.__gaussian_noise, self.__blur]
            # selected_augmentation = np.random.choice(augmentations)

            # # Apply the selected augmentation
            # selected_augmentation(id)

            self.__gaussian_noise(id)

In [8]:
%%time

train_img_preprocessor = Image_Preprocessing(val_df, "val")
train_img_preprocessor.augmentation()
train_img_preprocessor.resize_and_grayscale()

  0%|          | 0/531 [00:00<?, ?it/s]


TypeError: Cannot handle this data type: (1, 1, 3), <f4