<a href="https://colab.research.google.com/github/shabri-arrahim/TELKOM_DTI_Multi-Linear-Regression/blob/master/DS0121_HateSpeechDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hate Speech Detection [Text Mining]

Berikut merupakan implementasi dari $Text Mining$ untuk melakukan proses Hate Speech Detection.

---
---
# [Part 1] Import Libraries and Load Data

---
## 1 - Import Libraries

Import requiered libraries

In [106]:
import os
import re
import nltk
import string 
import random
import pandas as pd
import sklearn.metrics as metrics

from bs4 import BeautifulSoup
from sklearn.naive_bayes import GaussianNB
from nltk.tokenize import WordPunctTokenizer
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.cuda.FloatTensor

## Uncomment out the following line if you're on a machine with a CPU set up for PyTorch!
# dtype = torch.FloatTensor 

print("Runtime device: {}\nTensor type: {}".format(device, dtype))

Runtime device: cuda
Tensor type: <class 'torch.cuda.FloatTensor'>


---
## 2 - Load IDHSD Data

The dataset is a two columns data of: label - tweet, consist of 713 tweets in Indonesian.
The label is Non_HS or HS. Non_HS for "non-hate-speech" tweet and HS for "hate-speech" tweet.

*   Number of Non_HS tweets: 453
*   Number of HS tweets: 260

Since this dataset is unbalanced, you might have to do over-sampling/down-sampling in order to create a balanced dataset.


In [3]:
!wget 'https://raw.githubusercontent.com/shabri-arrahim/TELKOM_DTI_Multi-Linear-Regression/master/datasets/IDHSD_RIO_unbalanced_713_2017.txt'

--2020-11-27 17:15:23--  https://raw.githubusercontent.com/shabri-arrahim/TELKOM_DTI_Multi-Linear-Regression/master/datasets/IDHSD_RIO_unbalanced_713_2017.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 77009 (75K) [text/plain]
Saving to: ‘IDHSD_RIO_unbalanced_713_2017.txt.3’


2020-11-27 17:15:24 (10.8 MB/s) - ‘IDHSD_RIO_unbalanced_713_2017.txt.3’ saved [77009/77009]



In [4]:
with open('IDHSD_RIO_unbalanced_713_2017.txt', encoding="utf8", errors="ignore") as f:
  df = pd.read_csv(f, sep='\t')
df.head()

Unnamed: 0,Label,Tweet
0,Non_HS,RT @spardaxyz: Fadli Zon Minta Mendagri Segera...
1,Non_HS,RT @baguscondromowo: Mereka terus melukai aksi...
2,Non_HS,Sylvi: bagaimana gurbernur melakukan kekerasan...
3,Non_HS,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Ja..."
4,Non_HS,RT @lisdaulay28: Waspada KTP palsu.....kawal P...


In [5]:
features = df[['Tweet']]
labels = df[['Label']]
print("Feature shape: {}\nLabel shape: {}".format(features.shape, labels.shape))

Feature shape: (713, 1)
Label shape: (713, 1)


In [6]:
features.head()

Unnamed: 0,Tweet
0,RT @spardaxyz: Fadli Zon Minta Mendagri Segera...
1,RT @baguscondromowo: Mereka terus melukai aksi...
2,Sylvi: bagaimana gurbernur melakukan kekerasan...
3,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Ja..."
4,RT @lisdaulay28: Waspada KTP palsu.....kawal P...


---
---
# [Part 2] Pre-Processing


---
## 1 - Data Cleaning (Text Cleaning, Tokenization, Stemming)

In [47]:
def text_clening(text):

    # Create stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # Create Tokenizer
    tok = WordPunctTokenizer()

    # Create list of stopwords
    stopwords = nltk.corpus.stopwords.words('indonesian')

    # Text Cleaning Process
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        text = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        text = souped
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    clean_text = emoji_pattern.sub(r'', text)
    clean_text = ' '.join(re.sub("([RT])|(#[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(https?:[^ ]+)|(www.[^ ])"," ", text).split()).lower()
    clean_text = ' '.join([word for word in clean_text.split() if word not in stopwords])

    # Text tokenization and remove stopword
    text_token = (" ".join([x for x in tok.tokenize(clean_text) if len(x) > 1])).strip()

    # Text stemming
    text_stem = (" ".join([stemmer.stem(x) for x in text_token.split(" ")])).strip()

    return text_stem

In [48]:
clean_text = []
for word in features.Tweet:
  clean_text.append(text_clening(word))

In [60]:
clean_features = pd.DataFrame({'tweet': clean_text})

In [56]:
enc_labels = labels['Label'].map({'HS': 1, 'Non_HS': 0}))

---
## 2 - Text Vectorization

In [66]:
# TFIDF Class
class TFIDF(object):
  
  def __init__(self):
    pass

  def fit_trasnform(self, data):
    bookOfWord = self.__bookOfWord(data)
    uniqueWords = self.__uniqueWords(data)
    N_DOC = bookOfWord.shape[0]
    IDF_DICT = dict.fromkeys(uniqueWords, 0)
    TFIDF_LIST = []
    for low in bookOfWord.to_list():
      NOW = self.__numOfWord(low, uniqueWords)
      TF = self.__computeTF(NOW, low)
      IDF = self.__computeIDF(NOW, N_DOC, IDF_DICT)
      TFIDF_LIST.append(self.__compute_TFIDF(TF, IDF))
    return TFIDF_LIST

  def __uniqueWords(self, data):
    unique_words = set(list(data.str.split(' ', expand=True).stack().unique()))
    if '' in unique_words:
      unique_words.remove('')
    return unique_words

  def __bookOfWord(self, data):
    bow = data.str.split()
    return bow

  #Compute NumOfWord
  def __numOfWord(self, bookOfWord, uniqueWords):
    numOfWord = dict.fromkeys(uniqueWords, 0)
    for word in bookOfWord:
      numOfWord[word] += 1
    return numOfWord

  #Compute TF
  def __computeTF(self, wordOfDict, bagOfWord):
    bowCount = len(bagOfWord)
    tfDict = dict.fromkeys(wordOfDict.keys(), 0)
    for word, count in wordOfDict.items(): 
      tfDict[word] = count / float(bowCount)
    return tfDict

  # Compute IDF
  def __computeIDF(self, document, n_doc, idfDict):
    for word, val in document.items():
      if val > 0:
        idfDict[word] += 1
    for word, val in idfDict.items():
      if val > 0:
        idfDict[word] = np.log(n_doc+1/float(val)+1)+1
    return idfDict

  # Compute TFIDF
  def __compute_TFIDF(self, listOftfBagOfWords, idfs):
    tfidf = dict.fromkeys(listOftfBagOfWords.keys(), 0)
    for word, val in listOftfBagOfWords.items():
      tfidf[word] = val * idfs[word]
    return tfidf

In [67]:
vectorizer = TFIDF()
features_vect = pd.DataFrame(vectorizer.fit_trasnform(clean_features['tweet']))

In [84]:
features_vect.head()

Unnamed: 0,lau,masjid2,ajah,cara2,wagub,duh,on,emak,kristen,media,ho,ahay,cuihhhhhh,mimpi,kondisi,tancap,maidah,ngumpet2,skema,sandang,bohong,bongkar,rena,maju,akh,ira,pantes,tahu,share,germo,tuips,onde,ketutunan,up,dicopo,gusur,aktif,ge,asu,waduk,...,mang,familiar,kecu,sumber,daging,abai,cnn,resmi,putra,pokok,kaga,galak,100,setia,klo,cebong,jamban,nyerang,allah,kualitas,coklat,some,puluh,ujar,kali,patung,islamophobia,gub,uips,figur,lunak,atur,enak,mirip2,menang,tunda,xoxo,metode,seba,unfollow
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
## 3 - Split Data

In [98]:
X_train, X_test, y_train, y_test = train_test_split(features_vect.values, enc_labels.values, test_size=0.2, random_state=35)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(570, 2257)
(570,)
(143, 2257)
(143,)


---
---
# [Part 3] Model

---
## 1 - Train Model

In [102]:
model = GaussianNB().fit(X_train, y_train)
labels_pred = model.predict(X_test)
labels_pred

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0])

---
## 2 - Model Validation

In [104]:
conf_mat = metrics.confusion_matrix(y_test, labels_pred)
conf_mat

array([[73, 28],
       [11, 31]])

In [105]:
print('Test model accuracy: ', metrics.accuracy_score(y_test, labels_pred))
print('Test model precision: ', metrics.precision_score(y_test, labels_pred))
print('Test model recall: ', metrics.recall_score(y_test, labels_pred))
print('Test model F1 Score: ', metrics.f1_score(y_test, labels_pred))

Test model accuracy:  0.7272727272727273
Test model precision:  0.5254237288135594
Test model recall:  0.7380952380952381
Test model F1 Score:  0.613861386138614
