# Install library

In [2]:
!pip install Sastrawi



# Import library

In [29]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
from collections import Counter
import pickle

import nltk
nltk.download('punkt')
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

pd.set_option('max_columns', 1000)
pd.set_option('max_rows', 1000)

factory = StemmerFactory()
stemmer = factory.create_stemmer()

[nltk_data] Downloading package punkt to /home/hadi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Read data

In [3]:
df_data = pd.read_excel('tweets2.xlsx')
print(df_data.shape[0])
df_data.head()

4459


Unnamed: 0,text,label
0,RT @AnakJakartaID: Kalian sok menerima The Jak...,0
1,RT @SupportPersija: Kalo emang mau usik anak t...,0
2,RT @JakmaniaBoeloes: Tetep jaga kondusifitas b...,0
3,"RT @SupportPersija: Ambil positif nya aja, Dij...",0
4,RT @SupportPersija: Selalu utamakan kepala din...,0


In [4]:
df_data.label.value_counts(dropna=False)

0    3020
1    1439
Name: label, dtype: int64

# Preprocessing

In [22]:
 def cleaning(tweet):
    # lowercase
    normal_tw = tweet.lower()
    # hapus URL
    normal_tw = re.sub(r'((www\.[^\s]*)|(https?://[^\s]*))|(pic\.twitter\.com/[^\s]*)', '', normal_tw)
    # hapus @username
    normal_tw = re.sub(r'@[^\s]+', '', normal_tw)
    # hapus hashtag
    normal_tw = re.sub(r'#[^\s]+', '', normal_tw)
    # hapus tanda baca
    normal_tw = re.sub(r'[^\w\s]', '', normal_tw) 
    # hapus angka
    normal_tw = re.sub(r'\d+', '', normal_tw)
    # remove spasi berlebih
    normal_tw = re.sub(r'\s+', ' ', normal_tw)
    # trim depan belakang
    normal_tw = normal_tw.strip()
    # regex huruf yang berulang kaya haiiii (untuk fitur unigram)
    normal_regex = re.compile(r"(.)\1{1,}")
    # buang huruf yang berulang
    normal_tw = normal_regex.sub(r"\1\1", normal_tw)
    return normal_tw

In [21]:
def remove_stopwords_and_normalize(tweet):
    stopwords = pd.read_csv('dataset/stopwordsID.csv', header=None)

    df_kamus_singkatan = pd.read_csv('dataset/kamus_singkatan.csv')
    df_kamus_alay = pd.read_csv('dataset/colloquial-indonesian-lexicon.csv')

    token = nltk.word_tokenize(tweet)
    token_new = []
    for k in token:
        if k in df_kamus_singkatan['singkatan'].values:
            k = df_kamus_singkatan.loc[df_kamus_singkatan['singkatan']
                                       == k, 'asli'].values[0]
        if k in df_kamus_alay['slang'].values:
            k = df_kamus_alay.loc[df_kamus_alay['slang']
                                  == k, 'formal'].values[0]
        if k not in stopwords[0].values:
            token_new.append(k)

    str_clean = ' '.join(token_new)
    return str_clean

In [20]:
def stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    for k in token:
        stem_kata = stemmer.stem(k)
        stem_kalimat.append(stem_kata)

    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [19]:
def preprocessing(list_tweet):
    tweet_clean = []
    for tw in list_tweet:
        normal_tweet = cleaning(tw)
        normal_tweet = remove_stopwords_and_normalize(normal_tweet)
#         normal_tweet = stemming(normal_tweet)
        tweet_clean.append(normal_tweet)
    return tweet_clean

In [18]:
raw_tweet = df_data['text']
len(raw_tweet)

4459

In [23]:
clean_tweet = preprocessing(raw_tweet)
clean_tweet[:3]

['sok menerima the jak sediakan kuota hadir bantul biar tribun noda',
 'usik anak the jak pakai atribut asli provokasi sok berani intinya',
 'jaga kondusifitas bantul hargai tuan rumah tahan menang']

In [24]:
label = df_data['label'].tolist()

# Feature extraction

In [25]:
'''
Function to extract TF (1-gram) features
'''
def tf_extraction(text, ngram_start, ngram_end):
    ngram = CountVectorizer(ngram_range=(ngram_start, ngram_end))
    ngram_matrix = ngram.fit_transform(np.array(text)).todense()
    feature_names = ngram.get_feature_names()
    return ngram_matrix, feature_names

# unigram features
ngram_feat, feature_names = tf_extraction(clean_tweet, 1, 1)
print(ngram_feat[:3])
print(feature_names[:3])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['__', '_selamat', 'aa']


In [26]:
'''
Function to extract orthography and url occurence features
'''
def orthography_and_url_extraction(text):
    all_orto_feat = []
    for t in text:
        capital_count = sum(1 for c in t if c.isupper())
        exclamation_count = sum(1 for c in t if c == "!")
        word_len = len(nltk.word_tokenize(t))
        char_len = len(t)
        url = 1 if 'http' in t.lower() or 'www' in t.lower() else 0
        orto_feat = [capital_count, exclamation_count, word_len, char_len, url]
        all_orto_feat.append(orto_feat)
    return all_orto_feat

orto_feat = orthography_and_url_extraction(raw_tweet)
orto_feat[:3]

[[11, 0, 28, 140, 0], [8, 3, 27, 140, 0], [25, 0, 26, 138, 0]]

In [27]:
'''
Function to extract TF-IDF (1-gram) features
'''
def tf_idf_extraction(text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(np.array(text)).todense()
    return tfidf_matrix

# tf-idf features
tfidf_feat = tf_idf_extraction(clean_tweet)
print(tfidf_feat[:3])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Modeling

In [37]:
# list of features combinations
feat_list = [ngram_feat, tfidf_feat, np.hstack((ngram_feat, orto_feat)), np.hstack((tfidf_feat, orto_feat))]
feat_name = ['tf', 'tf-idf', 'tf and orthography', 'tf-idf and orthography']

# list of model to do prediction
mlp = MLPClassifier(random_state=0)
model_list = [mlp]
model_name = ['Multilayer Perceptron']

# build the model and evaluate the performance of it for each feature combination
df_recap = pd.DataFrame()
for f, fn in zip(feat_list, feat_name):
    print("Features : ", fn)
    X = f
    y = label
    for m, n in zip(model_list, model_name):
        scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
        scores = cross_validate(m, X, y, cv=10, scoring=scoring)
        acc = np.mean(scores['test_accuracy'])
        f1 = np.mean(scores['test_f1_macro'])
        precision = np.mean(scores['test_precision_macro'])
        recall = np.mean(scores['test_recall_macro'])
        print("Classifier : ", n)
        print("Accuracy:", acc)
        print("F1-Score:", f1)
        print("Precision:", precision)
        print("Recall:", recall)
        df_recap = df_recap.append({
            'features': fn,
            'classifier': n,
            'accuracy': acc,
            'f1_score': f1,
            'precision': precision,
            'recall': recall
        }, ignore_index=True)
        print('='*90)

Features :  tf
Classifier :  Multilayer Perceptron
Accuracy: 0.9271159369174182
F1-Score: 0.9176366201263171
Precision: 0.9153103193501154
Recall: 0.924557598398658
Features :  tf-idf
Classifier :  Multilayer Perceptron
Accuracy: 0.9354139164609261
F1-Score: 0.9269566504615068
Precision: 0.9249255913368561
Recall: 0.933236369039349
Features :  tf and orthography
Classifier :  Multilayer Perceptron
Accuracy: 0.9398972136846879
F1-Score: 0.9325139832514504
Precision: 0.9349322088206315
Recall: 0.9369109459341247
Features :  tf-idf and orthography
Classifier :  Multilayer Perceptron
Accuracy: 0.9439366151055577
F1-Score: 0.9364854117490078
Precision: 0.9392624558039737
Recall: 0.9404360563094005


In [41]:
df_recap[['features', 'classifier', 'accuracy', 'precision', 'recall', 'f1_score']]

Unnamed: 0,features,classifier,accuracy,precision,recall,f1_score
0,tf,Multilayer Perceptron,0.927116,0.91531,0.924558,0.917637
1,tf-idf,Multilayer Perceptron,0.935414,0.924926,0.933236,0.926957
2,tf and orthography,Multilayer Perceptron,0.939897,0.934932,0.936911,0.932514
3,tf-idf and orthography,Multilayer Perceptron,0.943937,0.939262,0.940436,0.936485


In [40]:
df_recap[['features', 'classifier', 'accuracy', 'precision', 'recall', 'f1_score']].to_csv('df_recap.csv', index=False)

# Prediction

In [28]:
feat = np.hstack((tfidf_feat, orto_feat))
y = label
mlp = MLPClassifier(random_state=0)
mlp.fit(feat, y)

MLPClassifier(random_state=0)

In [32]:
# save the model to disk
pickle.dump(mlp, open('final_model.sav', 'wb'))

In [66]:
predictions = mlp.predict(np.hstack((tfidf_feat, orto_feat)))

In [68]:
df_data['prediction'] = predictions

In [69]:
df_data.to_csv('df_data_prediction.csv', index=False)

In [70]:
df_data[df_data.label != df_data.prediction]

Unnamed: 0,text,label,prediction
1009,Nah begini dong 4-4-2 ✊ #PERSIJADAY 🐅🔴⚪ https:...,0,1
1402,Emang gitu #DM4Jabar #demiz #DeddyDediAsyik #p...,0,1
1977,Belum lah. Kan masih naik\n\n#AsianGamesKita \...,0,1
4298,https://t.co/r1BXfwfbKj #PersibDay,1,0
