## TEXT
* remove rows with null values (`remove_null`)
* convert text to lower case
* lemmatize, then stem, every word

## IMAGE
* resize all and save (grayscale)
* augmentation using GANs

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, PorterStemmer

[nltk_data] Downloading package wordnet to /home/sameep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def _load_split(csv_path):
    """Read one split from ``csv_path`` and return it without 'Unnamed: 0'.

    The 'Unnamed: 0' column is presumably a row index written out when the
    CSV was saved (TODO confirm); it is not used in this notebook.
    """
    return pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])


# Load the three dataset splits in the same order as before: train, test, val.
train_df = _load_split('../data/facebook/train.csv')
test_df = _load_split('../data/facebook/test.csv')
val_df = _load_split('../data/facebook/val.csv')

# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,id,text
0,16395,handjobs sold seperately
1,37405,introducing fidget spinner for women
2,94180,happy pride month let's go beat up lesbians
3,54321,laughs in [majority of u.s crime rate]
4,97015,finds out those 72 virgins.. are goats


In [3]:
class Text_Preprocessor:
    """In-place text cleaning helpers for a DataFrame that has a 'text' column.

    The frame handed to the constructor is stored by reference (no copy is
    made), and every method modifies that same object, so the caller's own
    variable observes the changes as well.
    """

    def __init__(self, df):
        """Keep a reference to ``df``, the DataFrame to be cleaned."""
        self.df = df

    def get_df(self):
        """Return the wrapped DataFrame (the same object given to ``__init__``)."""
        return self.df

    def lower_case(self):
        """Lower-case every string in the 'text' column.

        The ``.str`` accessor leaves missing values untouched instead of
        raising, so this method may safely be called before ``remove_null``.
        """
        self.df['text'] = self.df['text'].str.lower()

    def remove_null(self):
        """Drop, in place, every row that contains a null in any column.

        The index is not reset, so surviving rows keep their original labels.
        """
        # A single dropna() already inspects all columns; no per-column scan
        # is needed to decide whether to call it.
        self.df.dropna(inplace=True)

    def stem_and_lemmatize(self):
        """Lemmatize and then Porter-stem each token of the 'text' column.

        Tokens are produced by splitting on whitespace only, so punctuation
        stays attached to its word. The lemmatizer is called without a
        part-of-speech argument, i.e. with NLTK's default. Values that are
        not strings (e.g. missing values) are passed through unchanged.
        """
        lemmatizer = WordNetLemmatizer()
        stemmer = PorterStemmer()

        cleaned_text = []
        for text in tqdm(self.df['text'], total=len(self.df)):
            if not isinstance(text, str):
                # Nothing to tokenize; keep the cell as it is.
                cleaned_text.append(text)
                continue

            # Lemmatize first, then stem the lemma.
            tokens = [stemmer.stem(lemmatizer.lemmatize(word))
                      for word in text.split()]
            cleaned_text.append(" ".join(tokens))

        # Positional assignment: one cleaned string per row, in row order.
        self.df['text'] = cleaned_text

In [4]:
def _run_text_pipeline(df):
    """Run the full text-cleaning pipeline on ``df`` and return the preprocessor.

    Steps, in order: lower-case, drop rows with nulls, lemmatize + stem.
    The preprocessor works on ``df`` itself, so ``df`` is modified in place.
    """
    preprocessor = Text_Preprocessor(df)
    preprocessor.lower_case()
    preprocessor.remove_null()
    preprocessor.stem_and_lemmatize()
    return preprocessor


# Same pipeline for every split, processed in the order train, test, val.
# The *_preprocessor names are kept in case other cells refer to them.
train_preprocessor = _run_text_pipeline(train_df)
train_df = train_preprocessor.get_df()

test_preprocessor = _run_text_pipeline(test_df)
test_df = test_preprocessor.get_df()

val_preprocessor = _run_text_pipeline(val_df)
val_df = val_preprocessor.get_df()

# Preview the cleaned training split.
train_df.head()

100%|██████████| 8500/8500 [00:03<00:00, 2650.96it/s]
100%|██████████| 1000/1000 [00:00<00:00, 4780.51it/s]
100%|██████████| 500/500 [00:00<00:00, 5439.59it/s]


Unnamed: 0,id,text,label
0,42953,it their charact not their color that matter,0
1,23058,don't be afraid to love again everyon is not l...,0
2,13894,put bow on your pet,0
3,37408,i love everyth and everybody! except for squir...,0
4,82403,"everybodi love chocol chip cookies, even hitler",0
