# Preprocessing the Toxic comments
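The full pipeline preprocesses the data in five steps:
1. Lowercasing all text
2. Data cleaning: removing unnecessary characters such as the retweet symbol (RT), usernames, URLs, and punctuation
3. Normalization using the 'Alay' dictionary
4. Stemming using PySastrawi [2]
5. Stop-word removal using the list from [3]

Note: the files saved at the end of this notebook are produced with the shortened pipeline (steps 1-3 only, i.e. without stemming and stop-word removal).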

In [None]:
# Install the PySastrawi package; the preprocessing cell imports its stemmer from `Sastrawi`.
!pip install PySastrawi

In [None]:
import numpy as np
import pandas as pd

# List the contents of the input directory the CSV files are read from.
!ls '../input'

# Load data

In [None]:
# Training and test splits of the comment data.
data2 = pd.read_csv('../input/aldon-data-unprocessed/data_train.csv', encoding='latin-1')
data_test = pd.read_csv('../input/aldon-data-unprocessed/data_test.csv', encoding='latin-1')

# Headerless two-column lookup table: column 0 is the word as written,
# column 1 is the word it should be replaced with.
alay_dict = pd.read_csv(
    '../input/aldon-data-unprocessed/new_kamusalay.csv',
    encoding='latin-1',
    header=None,
).rename(columns={0: 'original', 1: 'replacement'})

# Headerless single-column list of stop words.
id_stopword_dict = pd.read_csv(
    '../input/indonesian-stoplist/stopwordbahasa.csv',
    header=None,
).rename(columns={0: 'stopword'})

# Keep only the columns whose name does not start with "Unnamed".
data2 = data2.loc[:, ~data2.columns.str.contains('^Unnamed')]
data_test = data_test.loc[:, ~data_test.columns.str.contains('^Unnamed')]



### Text Data

In [None]:
# Print the dimensions of the training frame and display its first 15 rows.
print("Shape: ", data2.shape)
data2.head(15)



In [None]:
# Print the dimensions of the test frame and display its first 15 rows.
print("Test Shape: ", data_test.shape)
data_test.head(15)

In [None]:
# Count how many training rows carry each value of the `toxic` label.
data2.toxic.value_counts()

In [None]:
# Dimensions of the training subset labelled toxic (1) and of the subset labelled non-toxic (0).
print("Toxic shape: ", data2[(data2['toxic'] == 1)].shape)
print("Non-toxic shape: ", data2[(data2['toxic'] == 0) ].shape)

### Alay Dict

In [None]:
# Print the dimensions of the alay dictionary and display its first 15 rows.
print("Shape: ", alay_dict.shape)
alay_dict.head(15)

### ID Stopword

In [None]:
# Print the dimensions of the stop-word table and display its first rows.
print("Shape: ", id_stopword_dict.shape)
id_stopword_dict.head()


# Preprocess

In [None]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build the Sastrawi stemmer once at module level; `stemming()` reuses this instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def lowercase(text):
    """Return a copy of *text* with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip tweet noise from *text*.

    Removes URLs, line breaks, user mentions, the standalone retweet marker
    ``RT``, the standalone placeholder tokens ``USER`` and ``URL``, and
    literal ``\\x..`` escape residue. Runs of spaces are then collapsed and
    characters repeated three or more times are shortened to two.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string. Leading/trailing single spaces left behind by the
        removals are not stripped.
    """
    # Raw string literals keep regex escapes such as \. and \S from being
    # treated as (invalid) Python string escapes.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)  # Remove every URL
    text = re.sub(r'[\r\n]', ' ', text)  # Replace every line break with a space
    # Mentions go before the "rt" pass so a handle such as "@artist" is
    # removed whole. The trailing blank is optional so a mention at the very
    # end of the text is removed as well.
    text = re.sub(r'@\S+[ \t]?', '', text)  # Remove every username mention
    # \b restricts the next three patterns to standalone tokens; without it
    # "rt"/"user"/"url" would also be cut out of ordinary words
    # (e.g. "kertas", "username").
    text = re.sub(r'(?i)\brt\b', ' ', text)  # Remove every retweet symbol
    text = re.sub(r'(?i)\buser\b', '', text)  # Remove every USER placeholder
    text = re.sub(r'(?i)\burl\b', ' ', text)  # Remove every URL placeholder
    text = re.sub(r'\\x..', ' ', text)  # Remove literal "\x" + two characters
    text = re.sub(r'  +', ' ', text)  # Collapse runs of spaces
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)  # Cap repeated characters at two
    return text
    
def remove_nonaplhanumeric(text):
    """Replace every run of characters other than ASCII letters and digits with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

# Lookup table built from the alay dictionary: 'original' word -> 'replacement' word.
alay_dict_map = {
    original: replacement
    for original, replacement in zip(alay_dict['original'], alay_dict['replacement'])
}
def normalize_alay(text):
    """Replace each space-separated token of *text* that has an entry in ``alay_dict_map``.

    Tokens without an entry are kept unchanged; the tokens are re-joined
    with single spaces.
    """
    normalized = []
    for token in text.split(' '):
        normalized.append(alay_dict_map.get(token, token))
    return ' '.join(normalized)

def remove_stopword(text):
    """Remove stop words from *text*.

    The text is split on single spaces; every token that appears in the
    ``stopword`` column of ``id_stopword_dict`` is blanked out. Runs of
    spaces are then collapsed and surrounding whitespace is stripped.

    Args:
        text: Space-separated string to filter.

    Returns:
        The filtered string.
    """
    # Build the lookup set once per call. Testing membership against the raw
    # array would rescan the entire stop-word list for every token.
    stopwords = set(id_stopword_dict.stopword.values)
    text = ' '.join('' if word in stopwords else word for word in text.split(' '))
    text = re.sub('  +', ' ', text)  # Remove extra spaces
    return text.strip()

def stemming(text):
    """Return the result of running *text* through the module-level ``stemmer``."""
    stemmed = stemmer.stem(text)
    return stemmed

def preprocess(text):
    """Run the full cleaning pipeline on *text* and return the result.

    Steps, in order: noise removal, lowercasing, non-alphanumeric removal,
    alay normalization, stemming, stop-word removal.
    """
    pipeline = (
        remove_unnecessary_char,
        lowercase,
        remove_nonaplhanumeric,
        normalize_alay,
        stemming,
        remove_stopword,
    )
    for step in pipeline:
        text = step(text)
    return text

def preprocess2(text):
    """Run the shortened pipeline on *text* (no stemming, no stop-word removal).

    Steps, in order: noise removal, lowercasing, non-alphanumeric removal,
    alay normalization.
    """
    cleaned = remove_nonaplhanumeric(lowercase(remove_unnecessary_char(text)))
    return normalize_alay(cleaned)

def preprocess_test(text):
    """Preprocess a test-set comment with the same steps as ``preprocess2``.

    Delegates to ``preprocess2`` (noise removal, lowercasing,
    non-alphanumeric removal, alay normalization) instead of duplicating its
    body, so the training and test pipelines cannot drift apart.
    """
    return preprocess2(text)

# Demo on a sample tweet ("Text awal" = initial text, "Hasil akhir" = final result).
# Each helper below is applied to the raw sample on its own, not chained;
# only the final preprocess() call runs the whole pipeline.
print("Text awal : RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ")
print("remove_nonaplhanumeric: ", remove_nonaplhanumeric("RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ"))
print("lowercase: ", lowercase("RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ"))
print("stemming: ", stemming("RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ"))
print("remove_unnecessary_char: ", remove_unnecessary_char("RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ"))
print("normalize_alay: ", normalize_alay("RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ"))
print("remove_stopword: ", remove_stopword(" RT Halooo,,,,, duniaa!!... Saaatnya menggambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ"))
print("Hasil akhir : " )
# Last expression of the cell, so the notebook displays its return value.
# NOTE(review): this input differs slightly from the sample above ("Text awal : " prefix, "menggaaambar").
preprocess("Text awal : RT Halooo,,,,, duniaa!!... Saaatnya menggaaambar mahatarii yang tenggelaaam... aamiin www.mataharitenggelam.com ðŸ")


In [None]:
# Try the test-set pipeline on a sample tweet containing two mentions and a URL.
preprocess_test("@usernamekl Bukti baru kerja paksa etnis Uighur di ladang kapas Xinjiang, China - BB... https://t.co/2cM3WZyOA2 via @YouTube")

In [None]:
# Overwrite the `comment_text` column of each frame with its cleaned version.
for frame, cleaner in ((data2, preprocess2), (data_test, preprocess_test)):
    frame['comment_text'] = frame['comment_text'].apply(cleaner)

In [None]:
# Print the dimensions of the preprocessed training frame and display its first 15 rows.
print("Shape: ", data2.shape)
data2.head(15)

# Save Preprocessed Data

In [None]:
# Print the dimensions of the preprocessed test frame and display its first 15 rows.
print("Shape: ", data_test.shape)
data_test.head(15)

In [None]:
# Write both preprocessed frames to CSV in the working directory, without the index column.
for out_name, frame in (
    ('preprocessed_indonesian_toxic_tweet_nostemstop.csv', data2),
    ('preprocessed_indonesian_toxic_tweet_nostemstop_test.csv', data_test),
):
    frame.to_csv(out_name, index=False)