# Indonesian Hate Speech Recognition
* 2301859650 - Cornelius Tantius
* 2301860154 - Jonathan Kristanto
* 2301865741 - Edgard Jonathan Putra Pranoto

# 1.0 Installing Required Libraries and Matching Required Library Versions

In [None]:
# !pip install PySastrawi &> /dev/null

# 1.1 Importing Necessities

In [None]:
import numpy as np
import pandas as pd 
import os
import re
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string
import random
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import os
import nltk
import spacy
from spacy.util import compounding
from spacy.util import minibatch
from sklearn.model_selection import train_test_split

# 1.2 Importing Data

In [None]:
# Load every input table from the Kaggle input directory; all files are decoded as latin-1.
# Main tweet table (later cells read its 'Tweet' and 'HS' columns).
data = pd.read_csv('../input/indonesian-abusive-and-hate-speech-twitter-text/data.csv', encoding='latin-1')
# Two-column lookup table; column names are supplied here via `names`.
alay_dict = pd.read_csv('../input/indonesian-abusive-and-hate-speech-twitter-text/new_kamusalay.csv', names = ['original', 'replacement'], encoding='latin-1')
# NOTE(review): abusive_dict is loaded but not referenced anywhere else in the visible cells.
abusive_dict = pd.read_csv('../input/indonesian-abusive-and-hate-speech-twitter-text/abusive.csv', encoding='latin-1')
# Single-column stopword list; the column is named 'stopword' via `names`.
stopword_dict = pd.read_csv('../input/indonesian-stoplist/stopwordbahasa.csv', names = ['stopword'], encoding='latin-1')

In [None]:
# Show the raw tweet table as the cell output.
data

# 2.0 Data Cleaning

In [None]:
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

def lowercase(text):
    """Return ``text`` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip Twitter boilerplate from one tweet.

    The following are each replaced by a space, then runs of spaces are
    collapsed to a single space:

    * real newlines and escaped newlines (one or more backslashes followed
      by ``n``),
    * URLs starting with ``www.``, ``http://`` or ``https://``,
    * the stand-alone tokens ``rt`` / ``RT`` (retweet marker) and
      ``user`` / ``USER`` (mention placeholder),
    * the characters ``:``, ``;`` and ``+``.

    Args:
        text: The tweet text to clean.

    Returns:
        The cleaned text. A leading or trailing single space may remain.
    """
    # Newlines first, so an escaped "\n" glued to the next token
    # (e.g. "...\nRT") cannot hide that token from the word-boundary match.
    text = re.sub(r'\\+n|\n', ' ', text)
    # URLs before the rt/user pass: otherwise a link containing those letters
    # would be split in two and its tail would survive as garbage.
    text = re.sub(r'(?:www\.|https?://)\S+', ' ', text)
    # Whole tokens only, so words that merely contain "rt" or "user"
    # (e.g. "pertama", "kartu", "username") are left intact.
    text = re.sub(r'\b(?:rt|RT|user|USER)\b', ' ', text)
    text = re.sub(r'[:;+]', ' ', text)
    text = re.sub(r'  +', ' ', text)  # collapse runs of spaces
    return text
    
def remove_nonaplhanumeric(text):
    """Replace every run of characters outside 0-9, a-z and A-Z with one space."""
    return re.sub('[^0-9a-zA-Z]+', ' ', text)

# Lookup table built once: each 'original' entry of alay_dict mapped to its 'replacement'.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))


def normalize_alay(text):
    """Replace each space-separated token found in ``alay_dict_map``.

    Tokens are obtained by splitting on single spaces; a token without an
    entry in the table is kept unchanged. The tokens are re-joined with
    single spaces.
    """
    tokens = text.split(' ')
    return ' '.join(alay_dict_map.get(token, token) for token in tokens)

def remove_stopword(text):
    """Remove every token listed in the ``stopword`` column of ``stopword_dict``.

    The text is split on single spaces, matching tokens are dropped, runs of
    spaces are collapsed to a single space, and the result is stripped of
    leading and trailing whitespace.

    Args:
        text: The text to filter.

    Returns:
        The text without stopword tokens.
    """
    # Build the lookup set once per call: testing `word in <array>` scans the
    # whole stopword array for every single token.
    stopword_set = set(stopword_dict.stopword.values)
    text = ' '.join(['' if word in stopword_set else word for word in text.split(' ')])
    text = re.sub('  +', ' ', text)  # collapse the gaps left by removed tokens
    text = text.strip()
    return text

# def stemming(text):
#     return stemmer.stem(text)

def remove_emoticon_byte(text):
    """Remove escaped byte sequences and stray backslashes from ``text``.

    Escaped bytes (one or more backslashes, ``x``, two hex digits, e.g. the
    literal text ``\\xf0``) and escaped newlines (backslashes followed by
    ``n``) are replaced by a space. Any remaining backslash and every ``+``
    is then replaced by a space, and runs of spaces are collapsed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text. A leading or trailing single space may remain.
    """
    # Match the escape sequences while the backslash is still present, so that
    # an ordinary "x" or "n" inside a word (e.g. "taxi", "maximal") is never
    # mistaken for one.
    text = re.sub(r'\\+x[0-9a-fA-F]{2}', ' ', text)
    text = re.sub(r'\\+n', ' ', text)
    text = text.replace('\\', ' ')
    text = re.sub(r'\+', ' ', text)
    text = re.sub(r'  +', ' ', text)  # collapse runs of spaces
    return text

def remove_early_space(text):
    """Drop one leading space from ``text``, if there is one.

    Only a single space is removed; any further leading spaces are kept.
    An empty string is returned unchanged.

    Args:
        text: The text to trim.

    Returns:
        ``text`` without its first character when that character is a space,
        otherwise ``text`` itself.
    """
    # startswith() is safe on "", where indexing text[0] would raise IndexError.
    return text[1:] if text.startswith(' ') else text

# print("remove_nonaplhanumeric: ", remove_nonaplhanumeric("Halooo,,,,, duniaa!!"))
# print("lowercase: ", lowercase("Halooo, duniaa!"))
# print("stemming: ", stemming("Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan"))
# print("remove_unnecessary_char: ", remove_unnecessary_char("Hehe\n\n RT USER USER apa kabs www.google.com\n  hehe"))
# print("normalize_alay: ", normalize_alay("aamiin adek abis"))
# print("remove_stopword: ", remove_stopword("ada hehe adalah huhu yang hehe"))



In [None]:
def preprocess(text):
    """Run the active cleaning steps on one tweet and return the result.

    Lower-casing, non-alphanumeric stripping, stemming and stopword removal
    are currently not part of the pipeline.
    """
    pipeline = (
        remove_unnecessary_char,
        normalize_alay,
        remove_unnecessary_char,  # second pass, after the alay replacements
        remove_emoticon_byte,
        remove_early_space,
    )
    for step in pipeline:
        text = step(text)
    return text

def classify(hs):
    """Map a hate-speech flag to a label: 'positive' when ``int(hs)`` is 1, else 'negative'."""
    return 'positive' if int(hs) == 1 else 'negative'

In [None]:
# Clean every tweet and store the result in a new 'text' column.
data['text'] = data['Tweet'].apply(preprocess)
# Turn the 'HS' flag into a 'positive' / 'negative' label.
data['hs_class'] = data['HS'].apply(classify)
# Show 10 random rows to spot-check the result.
data[['text', 'hs_class']].sample(10)

In [None]:
# Keep only the label and the cleaned text.
train = data[['hs_class', 'text']]
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
base_train = train
# Write the full (not yet balanced) preprocessed data, without the index column.
train.to_csv('train_preprocessed.csv', index = False)
# Show 5 random rows.
train.sample(5)

# 3.0 Data Visualization and WordCloud for Better Data Understanding and Exploration

In [None]:
# Count rows per class (the 'text' column holds the count), largest class first.
temp = train.groupby('hs_class').count()['text'].reset_index().sort_values(by='text',ascending=False)
# Render the counts table with a purple colour gradient.
temp.style.background_gradient(cmap='Purples')

### Data Balancing

In [None]:
# Balance the class sizes by down-sampling the 'negative' rows.
train = data[['hs_class', 'text']]
train_pos = train[train['hs_class']=='positive']
train_neg = train[train['hs_class']=='negative']
# Keep 5.1/7 of the negative rows and discard the remaining 1.9/7.
# The seed is fixed so that re-running the notebook selects the same rows
# (and therefore exports the same files further down).
train_neg, removed = train_test_split(train_neg, test_size=(1.9/7), shuffle=True, random_state=42)
train = pd.concat([train_pos, train_neg])
# Per-class row counts after balancing (the 'text' column holds the count), largest class first.
temp = train.groupby('hs_class').count()['text'].reset_index().sort_values(by='text',ascending=False)
temp.style.background_gradient(cmap='Purples')

In [None]:
# Bar chart of the number of rows per class in `train`.
plt.figure(figsize=(12,6))
sns.countplot(x='hs_class',data=train)

In [None]:
# Funnel-area chart of the per-class counts.
# Relies on `temp` still being the counts table with columns 'hs_class' and
# 'text' (the count) -- run this cell right after the cell that builds it.
fig = go.Figure(go.Funnelarea(
    text =temp['hs_class'],
    values = temp.text,
    title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
    ))
fig.show()

In [None]:
# Tokenise each text on whitespace (values are passed through str() first).
train['temp_list'] = train['text'].apply(lambda x:str(x).split())
# Count every token across all rows.
top = Counter([item for sublist in train['temp_list'] for item in sublist])
# Table of the 20 most frequent tokens; NOTE: this rebinds `temp`.
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# Rows of `train` per class label.
positive_text = train[train['hs_class']=='positive']
negative_text = train[train['hs_class']=='negative']

In [None]:
# 20 most frequent tokens among the 'positive' rows; NOTE: this rebinds `top`.
top = Counter([item for sublist in positive_text['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(20))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')

In [None]:
# Most frequent tokens among the 'negative' rows; NOTE: this rebinds `top`.
top = Counter([item for sublist in negative_text['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(20))
# Drops the single most frequent token, so 19 rows remain.
# NOTE(review): the 'positive' table keeps all 20 rows -- confirm this asymmetry is intended.
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')

In [None]:
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(14.0,8.0), color = 'white',
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud of ``text`` on a new matplotlib figure.

    Args:
        text: A string, or an iterable of items (e.g. a pandas Series of
            tweets) whose string forms are joined with spaces.
        mask: Optional mask array passed to ``WordCloud``; also used as the
            colour source when ``image_color`` is true.
        max_words: Maximum number of words in the cloud.
        max_font_size: Maximum font size in the cloud.
        figure_size: Size of the matplotlib figure.
        color: Background colour of the cloud.
        title: Figure title.
        title_size: Font size of the title.
        image_color: When true, recolour the cloud from ``mask``.

    Returns:
        None. The cloud is drawn on the current matplotlib figure.
    """
    # Local name chosen so it does not shadow the imported `stopwords` module.
    cloud_stopwords = set(STOPWORDS) | {'u', "im"}

    # str() of a pandas Series is its display repr (index labels, a "..."
    # truncation marker and a dtype footer), not the text it holds, so the
    # items are joined explicitly instead.
    if isinstance(text, str):
        corpus = text
    elif hasattr(text, '__iter__'):
        corpus = ' '.join(str(item) for item in text)
    else:
        corpus = str(text)

    wordcloud = WordCloud(background_color=color,
                    stopwords = cloud_stopwords,
                    max_words = max_words,
                    max_font_size = max_font_size,
                    random_state = 42,
                    width=400,
                    height=200,
                    mask = mask)
    wordcloud.generate(corpus)

    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                  'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                  'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()

In [None]:
# Word cloud of the 'positive' (hate-speech) tweets, shaped by the Twitter-logo mask.
pos_mask = np.array(Image.open('../input/masksforwordclouds/twitter_mask.png'))
plot_wordcloud(positive_text.text,color='white',max_font_size=100,title_size=30,title="WordCloud of Positive (Hate Speech) Tweets", mask=pos_mask)

In [None]:
# Word cloud of the 'negative' (non-hate-speech) tweets, shaped by the Twitter-logo mask.
pos_mask = np.array(Image.open('../input/masksforwordclouds/twitter_mask.png'))
plot_wordcloud(negative_text.text,color='white',max_font_size=100,title_size=30,title="WordCloud of Negative (Non-Hate Speech) Tweets", mask=pos_mask)

# 4.0 Initial Modelling Sample
Model moved to https://www.kaggle.com/corneliustantius/indo-hatespeech-classifier-model

# 5.0 Data Export
Export a 70% train / 20% validation / 10% test split, performed separately for each class so that every split keeps the same class proportions.

In [None]:
# Export the balanced data set, then a per-class 70/20/10 train/validation/test split.
exp_data = train[['hs_class', 'text']].dropna()
exp_data.to_csv('data_preprocessed.csv', index = False)

data_train_pos = exp_data[exp_data['hs_class']=='positive']
data_train_neg = exp_data[exp_data['hs_class']=='negative']

# Every split uses a fixed seed so the exported files are identical on each
# run and results obtained from them can be reproduced.
# First cut: 20% validation. Second cut: 12.5% of the remaining 80% = 10% test,
# which leaves 70% for training.
train_test_pos, validate_pos = train_test_split(data_train_pos, test_size=.2, random_state=42)
train_pos, test_pos = train_test_split(train_test_pos, test_size=.125, random_state=42)

train_test_neg, validate_neg = train_test_split(data_train_neg, test_size=.2, random_state=42)
train_neg, test_neg = train_test_split(train_test_neg, test_size=.125, random_state=42)

# Recombine the classes; each file lists the positive rows first, then the negative rows.
train_exp = pd.concat([train_pos, train_neg])
validate_exp = pd.concat([validate_pos, validate_neg])
test_exp = pd.concat([test_pos, test_neg])

train_exp.to_csv('train_split.csv', index=False)
validate_exp.to_csv('validate_split.csv', index=False)
test_exp.to_csv('test_split.csv', index=False)

