In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import string
import warnings
warnings.filterwarnings('ignore')

## <span style="color:cyan"> Data Analytics </span>

In [197]:
train = pd.read_csv('train_cleaned.csv')
TARGET_COLS = ['label_kenaikanbbm','label_kesehatanAPBN','label_omnibus_law','label_kenaikanppn','label_pemerintah']
target = train[TARGET_COLS]
train = train.drop(TARGET_COLS, axis=1)

In [198]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stop_factory = StopWordRemoverFactory()
more_stopword = ['dengan', 'ia','bahwa', 'oleh', 'sih']
stopwords = stop_factory.get_stop_words()+more_stopword

singkatan = {'dgn':'dengan', 'jgn': 'jangan', 'tdk':'tidak'}

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def text_processing(text):
    patterns = re.findall(r'#[a-zA-Z]*', text)
    for inx_pattern, pattern in enumerate(patterns):
        patterns[inx_pattern] = re.sub(r'(?<!^)(?=[A-Z])', ' ', pattern)[2:]

    result = text.lower()
    result = re.sub(r'http\S+', '', result)
    result = result.replace('\n', ' ')
    result = re.sub('#[a-zA-Z]', ' ', result)
    result = re.sub('[^a-z A-Z]', ' ', result)
    result = result + ' ' + ' '.join(patterns)

    final_result = ''
    for kata in result.split():
        if kata in singkatan.keys():
            kata = singkatan[kata]
        if kata in stopwords:
            continue
        final_result += ' '+stemmer.stem(kata)

    return final_result

for inx, row in train.iterrows():
    if inx == 0:
        print('Before :', row['tweet'])

    result = text_processing(row['tweet'])
    
    if inx == 0:
        print('After :', result)

    train.at[inx, 'tweet'] = result

Before : Perdebatan antara Bayu Satria Utomo (Ketua BEM.UI) dengan salah satu Perwira Polisi pada saat aksi unjuk rasa menolak kenaikan harga BBM yang digelar oleh http://BEM.SI Kerakyatan, terdiri dari perwakilan Mahasiswa dari beberapa Kampus di Indonesia pada Jumat kemarin.
After :  debat bayu satria utomo ketua bem ui salah satu perwira polisi aksi unjuk rasa tolak naik harga bbm gelar rakyat diri wakil mahasiswa beberapa kampus indonesia jumat kemarin


# <span style="color:red"> Model

### <span style="color:orange"> Vectorization </span>

#### <span style="color:lightgreen"> TF-IDF </span>

In [184]:
from sklearn.feature_extraction.text import TfidfVectorizer

train = train['tweet'].values

vectorizer = TfidfVectorizer()
train = vectorizer.fit_transform(train)
train = train.toarray()
vectorizer.get_feature_names_out()

array(['aasb', 'abdul', 'abdurrahman', ..., 'zaman', 'zona', 'zulfan'],
      dtype=object)

#### <span style="color:red"> Word2Vec </span>

In [199]:
MAX_WORD = 60
import gensim

path = 'idn_word2vec/idwiki_word2vec_100.model'
id_w2v = gensim.models.word2vec.Word2Vec.load(path)

def word2vec(text):
    result = None
    for word in text.split():
        try:
            if result is None:
                result = id_w2v.wv.get_vector(word)
            else:
                result = np.vstack((result, id_w2v.wv.get_vector(word)))
        except KeyError:
            continue
        if result.shape[0] == MAX_WORD:
            break
        
    if result is None: # Tidak ada yang bisa ditranslate
        raise ValueError('Input doens\'t contain anything that can be translated')
    # Add Zero Suffix
    # print(len(result.shape))
    if result.shape[0] < MAX_WORD or len(result.shape) == 1:
        if len(result.shape) == 1:
            result = np.vstack((result, np.zeros((59, 100))))
        else:
            result = np.vstack((result, np.zeros((MAX_WORD-result.shape[0], 100))))
    
    return result

In [200]:
train_arr = None
invalid_row = []
for inx, row in train.iterrows():
    try:
        result = word2vec(row['tweet'])
    except ValueError:
        invalid_row.append(inx)
        continue
    
    if result.shape != (60, 100):
        invalid_row.append(inx)
        continue
    if train_arr is None:
        train_arr = result.reshape(60, 100)
    else:
        train_arr = np.vstack((train_arr, result))
    train.at[inx, 'tweet'] = result
print('Cant Translate Index : ', invalid_row)
print('---------- Result Vector ----------')
print(train['tweet'].iloc[0])
print('Dimension Size :', train['tweet'].iloc[0].shape)
train = train.drop(invalid_row)
target = target.drop(invalid_row)
train_arr = train_arr.reshape(-1, 60, 100)

Cant Translate Index :  [367, 518]
---------- Result Vector ----------
[[ 0.79103231 -1.4218334   0.6991778  ...  2.06165814 -1.72711885
   3.01871514]
 [ 0.98527479  1.76514661 -2.10552144 ...  1.66446638 -0.53946501
   0.75062287]
 [ 1.17328167  0.01186099 -1.70957327 ...  1.74395716 -2.08097768
  -0.3593666 ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
Dimension Size : (60, 100)


## <span style="color:lightgreen"> Machine Learning </span>

In [44]:
import pandas as pd
import numpy as np
import copy
from tqdm.notebook import tqdm_notebook

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

def metrics_mse(actual, predicted):
    mse = mean_squared_error(actual, predicted)
    print(f"mean squared error: {mse}")

### <span style="color:lightgreen"> 1. Data Preparation

In [186]:
train_data = pd.read_csv("train_cleaned.csv")
train_data.drop(["tweet"], axis=1, inplace=True)

X = train

y1 = train_data.label_kenaikanbbm
y2 = train_data.label_kenaikanppn
y3 = train_data.label_kesehatanAPBN
y4 = train_data.label_omnibus_law
y5 = train_data.label_pemerintah

X_train, X_val, y_train, y_val = train_test_split(X, y3, random_state=0, test_size=0.2)
X.shape, y1.shape

((531, 2581), (531,))

### <span style="color:orange"> 2. Modelling </span>

In [None]:
baseline_model = RandomForestRegressor(max_depth=12, n_estimators=150)
baseline_model.fit(X_train, y_train)

y_pred_rf = baseline_model.predict(X_val)
metrics_mse(y_val, y_pred_rf)

mean squared error: 0.07371162895218446


In [None]:
model = LGBMRegressor(learning_rate=0.01, n_estimators=600).fit(X_train, y_train)
y_pred_lgb = model.predict(X_val)

metrics_mse(y_val, y_pred_lgb)

mean squared error: 0.0694670764001621


In [46]:
model = XGBRegressor(learning_rate=0.01, n_estimators=1000).fit(X_train, y_train)
y_pred_xgb = model.predict(X_val)

metrics_mse(y_val, y_pred_xgb)

mean squared error: 0.07242160913992203


In [None]:
model = CatBoostRegressor(learning_rate=0.01, n_estimators=1200).fit(X_train, y_train)
y_pred_cat = model.predict(X_val)

metrics_mse(y_val, y_pred_cat)

0:	learn: 0.2323125	total: 22.4ms	remaining: 33.6s
1:	learn: 0.2320788	total: 65.9ms	remaining: 49.4s
2:	learn: 0.2317853	total: 97.7ms	remaining: 48.7s
3:	learn: 0.2314921	total: 133ms	remaining: 49.6s
4:	learn: 0.2310690	total: 236ms	remaining: 1m 10s
5:	learn: 0.2307681	total: 404ms	remaining: 1m 40s
6:	learn: 0.2304552	total: 511ms	remaining: 1m 48s
7:	learn: 0.2301987	total: 615ms	remaining: 1m 54s
8:	learn: 0.2299393	total: 649ms	remaining: 1m 47s
9:	learn: 0.2297171	total: 717ms	remaining: 1m 46s
10:	learn: 0.2293863	total: 763ms	remaining: 1m 43s
11:	learn: 0.2290231	total: 805ms	remaining: 1m 39s
12:	learn: 0.2287586	total: 846ms	remaining: 1m 36s
13:	learn: 0.2284933	total: 873ms	remaining: 1m 32s
14:	learn: 0.2282636	total: 904ms	remaining: 1m 29s
15:	learn: 0.2279549	total: 936ms	remaining: 1m 26s
16:	learn: 0.2277531	total: 985ms	remaining: 1m 25s
17:	learn: 0.2275509	total: 1.05s	remaining: 1m 26s
18:	learn: 0.2273367	total: 1.09s	remaining: 1m 24s
19:	learn: 0.2271439	to

### <span style="color:pink"> 3. Tresholding Model </span>

In [187]:
class Model:
    LABEL = ["kenaikan bbm", "kenaikan ppn", "kesehatan apbn", "omnibus law", "pemerintah"]
    models = []
    def __init__(self, architecture, train_x=None, train_y:None|list=None, **params):
        self.architecture = architecture
        self.params = params

        if train_y is not None:
            if len(train_y) != len(self.LABEL):
                raise ValueError(f'Expected list of len {len(self.LABEL)} for label')
            self.fit(train_x, train_y)
    def fit(self, x, y):
        for inx, y in enumerate(y):
            self.models.append(self.architecture(**self.params))
            self.models[inx].fit(x, y)
    def predict(self, x, proba:bool=False) -> pd.DataFrame:
        df = pd.DataFrame()
        for inx, label in enumerate(self.LABEL):
            if proba:
                df[label] = self.models[inx].predict(x)
            else:
                pred = self.models[inx].predict(x)
                for i in range(len(pred)):
                    if pred[i] < 0.25:
                        pred[i] = 0.0
                    elif 0.25 <= pred[i] <= 0.75:
                        pred[i] = 0.5
                    else:
                        pred[i] = 1.0
                df[label] = pred
        return df
model = Model(XGBRegressor, X, [y1, y2, y3, y4, y5], learning_rate=0.01, n_estimators=1000)

## <span style="color:cyan"> Deep Learning </span>

In [201]:
target_enginered = []
for x in target.columns:
    target_enginered.append(
        pd.get_dummies(target[x]).values
    )
val_arr, train_arr = train_arr[-30:], train_arr[:-30]
val_target = []
for inx, y in enumerate(target_enginered):
    result, target_enginered[inx] = target_enginered[inx][-30:], target_enginered[inx][:-30]
    val_target.append(result)

In [207]:
import tensorflow.keras as keras
from keras import optimizers
from keras import layers, optimizers, losses, metrics

class DeepModel():
    models = []
    def __init__(self, x, ys) -> None:
        for y in ys:
            self.models.append(keras.Sequential([
                layers.LSTM(30),
                layers.Dense(25, activation='relu'),
                layers.Dense(3, activation='softmax')
            ]))
            self.models[-1].compile(loss=losses.CategoricalCrossentropy(), optimizer='adam', metrics=['accuracy', 'mae'])
            self.models[-1].fit(x.astype('float64'), y.astype('float64'), epochs=20, batch_size=8)

    def predict(self, text):
        result = []
        for x in text:
            now = []
            for y in range(5):
                now.append(self.models[y].predict(x.reshape(-1, 60, 100), verbose=0))
            result.append(now)
        return result
    
    def predict_label(self, text):
        pred = self.predict(text)
        result = []
        for single_pred in pred:
            now = []
            for y in single_pred:
                now.append(np.argmax(y))
            result.append(now)
        return result
    
    def evaluate_mae(self, text, truth):
        sum = 0
        for x in text:
            for inx, y in enumerate(self.models):
                pred = y.predict(x.reshape(-1, 60, 100), verbose=0)
                sum = (sum+np.mean((pred-truth[inx])))/2
        return sum

model = DeepModel(train_arr, target_enginered)
print('Validation MAE Loss', model.evaluate_mae(val_arr, val_target))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

## Template Testing

In [152]:
test_data = [
    'Upaya pemerintah dalam mengatasi BBM saya nilai sangat bagus. Masyarakat hanya belum mengerti saja dampak kedepannya. Rakyat miskin itu sedang dibela eh malah ngelawan',
    'Entah kenapa semakin kesini semua dipajakin. Awalnya udah seneng padahal beli game secara digital harganya lebih murah dari beli secara fisik. Udah gitu pajaknya gede lagi',
    'Aslinya omnibus law ini bagus loh dan lumayan menguntungkan buruh, kalian saja yang mudah tergiring opini sehingga menentang kebijakan ini',
    'Pemerintah makin kesini makin ngada-ngada aja ya, udah semua dipajakin masih aja uangnya dikorupsi sama DPR. Anggaran gorden sampai miliaran buat apaan coba?',
    'Katanya OPEC+ bakal mengurangi ekspor pasokan minyak ke negara-negara lain... aduh sekarang BBM naik aja udah bikin repot masa bakal naik lagi sih...',
    # 'Bagus ini, akhirnya subsidi BBM diperlonggar. Aslinya yang paling untung itu orang-orang yang mampu, yang punya mobil. Kenapa? Karena sekali ngisi mereka bisa sampai puluhan liter. Sedangkan rakyat yang kurang mampu? Mereka hanya bisa mengisi sampai belasan liter karena memakai motor.',
    # 'Tiada hari tanpa masalah... kok bisa sih APBN yang uangnya dari pajak kita digunakan seenaknya oleh DPR buat ngebeli 100 TV LED? 1,5 miliar loh, gak sedikit itu.',
    # 'Vivo cuma ingin mengambil momentum sebelum akhirnya akan menaikan harga juga. Kenaikan harga BBM adalah keharusan, kalo terus ditahan maka akan menimbulkan kerugian… dan karena vivo swasta, mereka lebih paham dgn risiko kerugian. Tdk seperti BUMN yg disupport negara',
    # 'Jd maksudnya..gak papa rakyat menjerit dgn kenaikan BBM, TDL, pajak,dsb tuk kejar surplus..? Gile benar...',
    # 'Apa Prestasi pemerintahan sekarang .. bisa meredam mengaburkan isu kenaikan BBM',
    # 'Ga ngerti lagi kenapa paket data sekarang mahal sekali dan banget, kenaikan ppn 5-10% kayaknya ga segini deh, apa iya dengan harga segini provider masih anggap ini terjangkau? Sama sekali ga relevan dan kurang masuk akal'
]
test_target = [
    [[0, 0, 1], [0, 1, 0], [0, 1, 0], [0, 1, 0], [1, 0, 0]],
    [[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 0]],
    [[0, 1, 0], [0, 1, 0], [0, 1, 0], [1, 0, 0], [1, 0, 0]],
    [[0, 1, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]],
    [[0, 0, 1], [1, 0, 0], [0, 0, 1], [1, 0, 0], [1, 0, 0]]
]
output = pd.DataFrame({'tweet':test_data})

In [195]:
result = []
for inx, data in enumerate(test_data):
    x = text_processing(data)
    if isinstance(model, DeepModel):
        x = word2vec(x)
        # print(x.reshape(-1, 60, 100))
        pred = model.predict(x.reshape(-1, 60, 100))[0]
        sum = 0
        for y in pred:
            sum = (sum+np.mean(np.abs(y-test_target)/2))/2
        result.append(sum)
    else:
        x = vectorizer.transform([x])
        x = x.toarray()
        pred = model.predict(x, True)
        sum = 0
        for y in pred.values[0]:
            sum = (sum+np.mean(np.abs(y-test_target)/2))/2
        result.append(sum)
print(result)

[0.22739795338362456, 0.2309630349650979, 0.19945578984916212, 0.22027742354199292, 0.20318512069061398]


## Input Testing

In [211]:
LABEL = ["kenaikan bbm", "kenaikan ppn", "kesehatan apbn", "omnibus law", "pemerintah"]
inp = input('Masukan text :')
print('Input \t\t:', inp)
inp = text_processing(inp)
print('Setelah text processing\t:', inp)

if isinstance(model, DeepModel):
    test = word2vec(inp)
    pred = model.predict_label(test.reshape(-1, 60, 100))[0]
    result = {
        "kenaikan bbm": 'negative' if pred[0] == 0 else 'netral' if pred[0] == 1 else 'positive',
        "kenaikan ppn": 'negative' if pred[1] == 0 else 'netral' if pred[1] == 1 else 'positive',
        "kesehatan apbn": 'negative' if pred[2] == 0 else 'netral' if pred[2] == 1 else 'positive',
        "omnibus law": 'negative' if pred[3] == 0 else 'netral' if pred[3] == 1 else 'positive',
        "pemerintah": 'negative' if pred[4] == 0 else 'netral' if pred[4] == 1 else 'positive'
    }
    print('---------- Hasil ----------')
    print(result)
else:
    test = vectorizer.transform([inp])
    test = test.toarray()
    pred = model.predict(test, True)
    result = {
        "kenaikan bbm": 'negative' if pred[0] == 0 else 'netral' if pred[0] == 1 else 'positive',
        "kenaikan ppn": 'negative' if pred[1] == 0 else 'netral' if pred[1] == 1 else 'positive',
        "kesehatan apbn": 'negative' if pred[2] == 0 else 'netral' if pred[2] == 1 else 'positive',
        "omnibus law": 'negative' if pred[3] == 0 else 'netral' if pred[3] == 1 else 'positive',
        "pemerintah": 'negative' if pred[4] == 0 else 'netral' if pred[4] == 1 else 'positive'
    }
    print('---------- Hasil ----------')
    print(result)

Input 		: Apa Prestasi pemerintahan sekarang .. bisa meredam mengaburkan isu kenaikan BBM
Setelah text processing	:  apa prestasi perintah sekarang redam abur isu naik bbm
---------- Hasil ----------
{'kenaikan bbm': 'negative', 'kenaikan ppn': 'netral', 'kesehatan apbn': 'netral', 'omnibus law': 'netral', 'pemerintah': 'negative'}
