In [2]:
%pip install --upgrade gensim
import gensim
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from keras import backend as K
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from time import time
print(gensim.__version__)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 2.1 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0
4.2.0


In [50]:
# preprocess the data

def build_alay_dict(source='https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv'):
    """Build a slang -> formal word lookup from a colloquial-lexicon CSV.

    Args:
        source: Anything ``pd.read_csv`` accepts (URL, local path or file-like
            object). The CSV must contain the columns ``slang`` and ``formal``.
            Defaults to the kamus-alay lexicon hosted on GitHub, so calling the
            function without arguments behaves exactly as before.

    Returns:
        dict mapping every value of the ``slang`` column to the value of the
        ``formal`` column on the same row. If a slang word occurs on several
        rows, the last row wins (plain ``dict`` construction semantics).
    """
    df = pd.read_csv(source)
    return dict(zip(df['slang'], df['formal']))

def translate_alay(input_string, alay_dict):
    """Replace every slang token of ``input_string`` by its formal form.

    The string is split on single spaces (so runs of spaces survive the round
    trip), each token found as a key in ``alay_dict`` is swapped for the mapped
    value, and the tokens are joined back with single spaces. Matching is exact
    and case-sensitive.
    """
    return ' '.join(
        alay_dict.get(token, token) for token in input_string.split(" ")
    )

_URLS = {
    "train" : "https://raw.githubusercontent.com/rayendito/TextClassification/main/data_worthcheck/train.csv",
    "test" : "https://raw.githubusercontent.com/rayendito/TextClassification/main/data_worthcheck/test.csv",
    "val" : "https://raw.githubusercontent.com/rayendito/TextClassification/main/data_worthcheck/dev.csv",
}

alay_dict = build_alay_dict()

# Load the three splits. Only the train frame has its "Unnamed: 0" column
# dropped (presumably a saved index column -- confirm against the CSV).
train = pd.read_csv(_URLS["train"]).drop(labels=["Unnamed: 0"], axis=1)
test = pd.read_csv(_URLS["test"])
val = pd.read_csv(_URLS["val"])

# Every split goes through the same three steps:
#   text_a -> text_aa    : slang words replaced via the alay dictionary
#   text_aa -> text_clean: tokenised with gensim's simple_preprocess
#   label                : 'yes'/'no' mapped to 1/0
label_to_int = {'yes': 1, 'no': 0}
for split in (train, test, val):
    split['text_aa'] = split['text_a'].apply(translate_alay, args=(alay_dict,))
    split['text_clean'] = split['text_aa'].apply(gensim.utils.simple_preprocess)
    split['label'] = split['label'].map(label_to_int)

# Features are the token lists, targets are the integer labels.
x_train = train["text_clean"]
y_train = train["label"]
x_test = test["text_clean"]
y_test = test["label"]
x_val = val["text_clean"]
y_val = val["label"]

# create word2vec model
vec_size = 300

# Active hyperparameters. An alternative configuration (min_count=20, window=4,
# sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20) is intentionally
# left disabled.
w2v_params = dict(
    vector_size=vec_size,
    window=3,
    min_count=10,
)
# No corpus is passed to the constructor; vocabulary building and training are
# run (and timed) as explicit separate steps below.
w2vmodel = gensim.models.Word2Vec(**w2v_params)

t = time()
w2vmodel.build_vocab(x_train, progress_per=100)
vocab_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_minutes))

t = time()
w2vmodel.train(
    x_train,
    total_examples=w2vmodel.corpus_count,
    epochs=30,
    report_delay=1,
)
train_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(train_minutes))

# test word similarity
similar_words = w2vmodel.wv.most_similar("pemerintah")
print(similar_words)
# get word index
vocabulary = w2vmodel.wv.index_to_key
print(vocabulary)

# create a list of word vectors
words = set(w2vmodel.wv.index_to_key)


def embed_sentences(token_lists):
    """Look up the Word2Vec vector of every in-vocabulary token, per sentence.

    Args:
        token_lists: iterable of token lists (one list per sentence).

    Returns:
        A Python list with one NumPy array per sentence, holding the vectors of
        the tokens that are present in ``words``. A sentence without any known
        token yields an empty array.

    The per-sentence arrays are deliberately kept in a plain list: sentences
    have different numbers of tokens, and wrapping such ragged data in
    ``np.array(...)`` raises ``ValueError`` on NumPy >= 1.24 (and emitted a
    deprecation warning on earlier releases).
    """
    return [
        np.array([w2vmodel.wv[token] for token in tokens if token in words])
        for tokens in token_lists
    ]


def average_sentence_vectors(sentence_matrices, dim):
    """Collapse each sentence to a single vector by averaging its word vectors.

    Args:
        sentence_matrices: iterable of per-sentence arrays as produced by
            ``embed_sentences``.
        dim: length of the zero vector used for sentences that have no known
            token (their array is empty, so there is nothing to average).

    Returns:
        NumPy array with one row per sentence.
    """
    return np.array([
        matrix.mean(axis=0) if matrix.size else np.zeros(dim, dtype=float)
        for matrix in sentence_matrices
    ])


x_train_vect = embed_sentences(x_train)
x_test_vect = embed_sentences(x_test)
x_val_vect = embed_sentences(x_val)
vectors = [w2vmodel.wv[word] for word in words]

# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
x_train_vect_avg = average_sentence_vectors(x_train_vect, vec_size)
x_test_vect_avg = average_sentence_vectors(x_test_vect, vec_size)
x_val_vect_avg = average_sentence_vectors(x_val_vect, vec_size)

# convert the label Series to numpy arrays
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

Time to build vocab: 0.0 mins
Time to train the model: 0.18 mins
[('berguna', 0.42950114607810974), ('efektif', 0.41681528091430664), ('lokdon', 0.4125816822052002), ('kritik', 0.3982113301753998), ('mengikuti', 0.3930703401565552), ('preventif', 0.38423076272010803), ('mendukung', 0.35873863101005554), ('perekonomian', 0.35820451378822327), ('menimbulkan', 0.35663172602653503), ('mitigasi', 0.350292831659317)]
['corona', 'nya', 'cowok', 'https', 'yang', 'enggak', 'covid', 'virus', 'ya', 'orang', 'depok', 'psbb', 'saja', 'distancing', 'indonesia', 'sudah', 'presiden', 'kalo', 'physical', 'menkes', 'jakarta', 'normal', 'masker', 'new', 'sih', 'gue', 'pemerintah', 'jokowi', 'ku', 'pakai', 'positif', 'begitu', 'banget', 'tapi', 'nih', 'warga', 'kena', 'rumah', 'menteri', 'kota', 'deh', 'kayak', 'karena', 'dari', 'kesehatan', 'dengan', 'masyarakat', 'masuk', 'gy', 'sama', 'amp', 'negara', 'di', 'gubernur', 'semoga', 'memang', 'tau', 'penyebaran', 'pasien', 'lantai', 'rakyat', 'bagaimana', 



In [51]:
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Products ``y_true * y_pred`` and the raw ``y_true`` values are clipped to
    [0, 1] and rounded, so each entry counts as 0 or 1. The result is
    (true positives) / (actual positives + K.epsilon()); the epsilon keeps the
    division defined when there are no positive targets.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    hits = K.sum(hit_mask)
    positives = K.sum(positive_mask)
    return hits / (positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Products ``y_true * y_pred`` and the raw ``y_pred`` values are clipped to
    [0, 1] and rounded, so each entry counts as 0 or 1. The result is
    (true positives) / (predicted positives + K.epsilon()); the epsilon keeps
    the division defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hits = K.sum(hit_mask)
    predicted = K.sum(predicted_mask)
    return hits / (predicted + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    K.epsilon() is added to the denominator so the expression stays defined
    when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

# create a deep learning classifier using word2vec model
# Fully connected stack on top of the averaged sentence vectors: three ReLU
# hidden layers followed by a single sigmoid unit for the binary label.
# (A Conv1D variant -- 32/64/128 filters, kernel_size=5, then Flatten -- is
# intentionally left disabled.)
hidden_units = (512, 128, 32)
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(units, activation='relu') for units in hidden_units]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', f1_m, precision_m, recall_m],
)

# Kept as the last bare expression so the cell still displays the History object.
model.fit(
    x_train_vect_avg,
    y_train,
    epochs=10,
    validation_data=(x_val_vect_avg, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3d1e591890>

In [52]:
# predict the model
# The network ends in a single sigmoid unit, so column 0 of the prediction
# holds one probability per test row.
y_prob = model.predict(x_test_vect_avg)
# Vectorised thresholding. For values in [0, 1] this yields the same labels as
# Python's round(): exactly 0.5 rounds to 0 (round-half-to-even) and is also
# not > 0.5.
y_pred = (y_prob[:, 0] > 0.5).astype(int)

# print(y_pred[-20:])
# print(y_test[-20:])
# print(y_val[-20:])

# calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Use scikit-learn's f1_score (already imported) instead of the hand-written
# 2*p*r/(p+r): the manual formula evaluates 0/0 (NaN) when precision and recall
# are both zero, e.g. when the model predicts no positive at all.
f1 = f1_score(y_test, y_pred)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)

Accuracy:  0.8592857142857143
Precision:  0.7284671532846715
Recall:  0.7057991513437057
F1:  0.7169540229885057
