@inproceedings{wilie2020indonlu,
  title={IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding},
  author={Bryan Wilie and Karissa Vincentio and Genta Indra Winata and Samuel Cahyawijaya and X. Li and Zhi Yuan Lim and S. Soleman and R. Mahendra and Pascale Fung and Syafri Bahar and A. Purwarianti},
  booktitle={Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing},
  year={2020}
}


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt

from transformers import BertTokenizer, TFAlbertModel, TFAutoModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.layers import Input, Dense

In [5]:
cwd = os.getcwd()
datapath = os.path.join(cwd, 'datasets')

df = pd.read_excel(os.path.join(datapath, 'all_cleaned.xlsx'))
df = df[['berita', 'label']]

df.drop_duplicates(inplace=True)
df.dropna(inplace = True)

df.label.value_counts()

label
0    20944
1     6474
Name: count, dtype: int64

In [6]:
X = df[['berita']]
y = df['label']
rus = RandomUnderSampler(random_state=1, replacement=True)# fit predictor and target variable
X_new, y_new = rus.fit_resample(X,y)
y_new.value_counts()

label
0    6474
1    6474
Name: count, dtype: int64

In [7]:
df_new = X_new
df_new['label'] = y_new
df_new

Unnamed: 0,berita,label
235,psi sebut banding pecat viani limardi tolak ad...,0
12172,tawar koalisi gerindra pks bilang cari teman l...,0
5192,dpr perintah kpu sepakat honor tugas tps naik ...,0
17290,megawati sebut bicara koalisi capres lihat din...,0
10955,dana milu capai rp triliun jokowi minta detail...,0
...,...,...
27427,raja salman arab saudi bawa orang orang sudah ...,1
27428,hehe selalu senyum lihat tingkah laku pak joko...,1
27429,pak jokowi jadi walikota periode pertama solo ...,1
27430,hari rabu nilai tukar rupiah puruk hingga semp...,1


In [8]:
from tensorflow.keras.callbacks import Callback
            
class LossHistory(Callback):
    def __init__(self):
        super(Callback, self).__init__()
        self.losses = []
        self.val_losses = []

    def on_train_begin(self, logs=None):
        self.losses = []
        self.val_losses = []

    def on_batch_end(self, batch, logs=None):
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
#         print(self.losses, self.val_losses)

callback = LossHistory()

In [8]:
df_train, df_test = train_test_split(df_new, test_size=0.3, random_state=42,
                                     stratify=df_new['label'])

In [10]:
model_name = 'indobenchmark/indobert-lite-base-p1'
# model_name = 'bert-base-cased'

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

tokenizer = BertTokenizer.from_pretrained(model_name)
# model = TFAutoModel.from_pretrained(model_name)



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [11]:
np.mean([len(str(df_train.sample().berita.values).split()) for i in range(1000)])

146.646

In [12]:
max_len = 70

X_train = tokenizer(
    text=df_train['berita'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

X_test = tokenizer(
    text=df_test['berita'].tolist(),
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

In [13]:
X_train.input_ids[1]

<tf.Tensor: shape=(70,), dtype=int32, numpy=
array([    2, 14025, 12006,    46,  1495, 14149,  8221, 29842,   119,
       16192, 14149,  2764,  4154,  6127,    19,  1225,  3892,  7043,
        2441, 10242, 10639, 14025, 12006,    46,  1436,  8521,  3121,
        1699,  1495, 14149,   745, 10354,  9674,  3976,  8221, 29842,
       14025, 12006,    46,   304,  9418, 12085,  1241, 14149,  8445,
        6776,  5115, 23296,   708, 18787, 21334,    63,  1800, 10165,
       24871,  1904, 12291, 21603,    50,  6101,   216, 14149,  5115,
         531,  5427,  9940,    63,   354,   728,     3])>

In [15]:
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids, attention_mask = input_mask)[0]

embeddings = model(input_ids, attention_mask = input_mask)[0] # 0 = last hidden state, 1 = poller_output
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

y = Dense(1, activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# model.layers[2].trainable = False

In [16]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 70)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 70)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_albert_model (TFAlbertM  TFBaseModelOutputWithPooli   1168358   ['input_ids[0][0]',           
 odel)                       ng(last_hidden_state=(None   4          'attention_mask[0][0]']      
                             , 70, 768),                                                      

In [17]:
EPOCHS = 2
BATCH_SIZE = 16

optimizer = Adam(
    learning_rate=5e-05, # HF recommendation
    epsilon=1e-08,
    clipnorm=1.0
)

loss = 'binary_crossentropy'
metric = 'accuracy'

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric
)


In [18]:
history = model.fit(
    x = {'input_ids':X_train['input_ids'], 'attention_mask':X_train['attention_mask']},
    y = df_train['label'],
    validation_data = ({'input_ids':X_test['input_ids'], 'attention_mask':X_test['attention_mask']},
                        df_test['label']),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[callback]
)

Epoch 1/2
Epoch 2/2


In [13]:
# model = tf.keras.models.load_model('models/model_2023-11-29', custom_objects={"TFAlbertModel": TFAlbertModel})

In [15]:
from sklearn.metrics import classification_report

predicted = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']})
y_predicted = [1 if pred >= 0.5 else 0 for pred in predicted]
print(classification_report(df_test['label'], y_predicted))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1943
           1       0.99      0.97      0.98      1942

    accuracy                           0.98      3885
   macro avg       0.98      0.98      0.98      3885
weighted avg       0.98      0.98      0.98      3885



In [20]:
from datetime import date

today = date.today()
str(today)

'2023-11-29'

In [21]:
'model{}.keras'.format(str(today))

'model2023-11-29.keras'

In [22]:
path_to_dir = 'model_{}'.format(str(today))

In [23]:
model.save('models/model_{}'.format(str(today)))

INFO:tensorflow:Assets written to: models/model_2023-11-29\assets


INFO:tensorflow:Assets written to: models/model_2023-11-29\assets


In [1]:
model = tf.keras.models.load_model('models/model_2023-11-29', custom_objects={"TFAlbertModel": TFAlbertModel})

NameError: name 'tf' is not defined

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 70)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 70)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_albert_model (TFAlbertM  TFBaseModelOutputWithPooli   1168358   ['input_ids[0][0]',           
 odel)                       ng(last_hidden_state=(None   4          'attention_mask[0][0]']      
                             , 70, 768),                                                      

In [17]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quant_model = converter.convert()
#saving converted model in "converted_quant_model.tflite" file
open("models/converted_quant_model_litebert.tflite", "wb").write(tflite_quant_model)

INFO:tensorflow:Assets written to: C:\Users\LENOVO\AppData\Local\Temp\tmp9jypmrh7\assets


INFO:tensorflow:Assets written to: C:\Users\LENOVO\AppData\Local\Temp\tmp9jypmrh7\assets


11481752

In [19]:
df_test.label

25489    1
25646    1
23126    1
17712    0
8492     0
        ..
23459    1
4478     0
17670    0
24902    1
23223    1
Name: label, Length: 3885, dtype: int64

In [35]:
tf.expand_dims(X_test['attention_mask'][0], axis = 0)

<tf.Tensor: shape=(1, 70), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]])>

In [34]:
tf.reshape(X_test['input_ids'][0], input_shape)

<tf.Tensor: shape=(1, 70), dtype=int32, numpy=
array([[    2,  9881,    55,  4425,  9319,  4007,  6101,  1871,   752,
         6845, 18532, 12368,    63,  7498,   696,    66, 12291,  1225,
         2600,   204,  2480,  1190, 20469,  9187,  2864,  3795,    39,
        18303,  3248,  2193,  9305,  2855,   920,  1127,  2641,   441,
          712,  5162,   491, 23089,  5129,  2336,  5115, 12368,    63,
         7498,   696,    66,  1397,   752, 12291,  1225,  1269, 10834,
         8259,  4849,  2600,   204,  4068, 20469,  9187,   683,  5115,
         9319,  4007,   752,  6845,   629,   562,     3]])>

In [30]:
df_test.label.values[0]

1

In [26]:
interpreter = tf.lite.Interpreter(model_path='models/converted_quant_model_litebert.tflite')
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_shape = input_details[0]['shape']

output_data = []
pred = []

for i in range(100):
    input_data = tf.expand_dims(X_test['input_ids'][i], axis = 0)
    input_intent = tf.expand_dims(X_test['attention_mask'][i], axis = 0)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.set_tensor(input_details[1]['index'], input_intent)
    interpreter.invoke()
    get_tensor = interpreter.get_tensor(output_details[0]['index'])
    print(get_tensor, 1 if get_tensor >= 0.5 else 0, df_test.label.values[i])
    output_data.append(get_tensor)
    pred.append(1 if get_tensor >= 0.5 else 0)

output_data

[[0.04697829]] 0 1
[[0.26599503]] 0 1
[[0.0907713]] 0 1
[[0.09970863]] 0 0
[[0.02740857]] 0 0
[[0.04011282]] 0 1
[[0.05974764]] 0 1
[[0.05749785]] 0 1
[[0.10511776]] 0 1
[[0.03395897]] 0 0
[[0.03713262]] 0 1
[[0.09178298]] 0 1
[[0.04636384]] 0 1
[[0.03091474]] 0 0
[[0.06431253]] 0 0
[[0.03103101]] 0 0
[[0.05176967]] 0 0
[[0.04576581]] 0 0
[[0.09994493]] 0 1
[[0.03856257]] 0 0
[[0.10511776]] 0 1
[[0.05621246]] 0 0
[[0.05186893]] 0 1
[[0.10495745]] 0 0
[[0.10517703]] 0 0
[[0.04555296]] 0 1
[[0.04520574]] 0 1
[[0.05428829]] 0 1
[[0.05757875]] 0 1
[[0.05540879]] 0 1
[[0.03419619]] 0 1
[[0.04439409]] 0 1
[[0.03032523]] 0 1
[[0.04132477]] 0 0
[[0.07640906]] 0 0
[[0.04397864]] 0 1
[[0.09097861]] 0 1
[[0.03391322]] 0 1
[[0.10627519]] 0 1
[[0.04618493]] 0 1
[[0.10544624]] 0 0
[[0.0410606]] 0 1
[[0.10932309]] 0 1
[[0.09970863]] 0 1
[[0.05978647]] 0 0
[[0.03033004]] 0 0
[[0.04959667]] 0 0
[[0.05582564]] 0 1
[[0.03168443]] 0 0
[[0.06143473]] 0 0
[[0.04697829]] 0 0
[[0.08356812]] 0 1
[[0.13643566]]

[array([[0.04697829]], dtype=float32),
 array([[0.26599503]], dtype=float32),
 array([[0.0907713]], dtype=float32),
 array([[0.09970863]], dtype=float32),
 array([[0.02740857]], dtype=float32),
 array([[0.04011282]], dtype=float32),
 array([[0.05974764]], dtype=float32),
 array([[0.05749785]], dtype=float32),
 array([[0.10511776]], dtype=float32),
 array([[0.03395897]], dtype=float32),
 array([[0.03713262]], dtype=float32),
 array([[0.09178298]], dtype=float32),
 array([[0.04636384]], dtype=float32),
 array([[0.03091474]], dtype=float32),
 array([[0.06431253]], dtype=float32),
 array([[0.03103101]], dtype=float32),
 array([[0.05176967]], dtype=float32),
 array([[0.04576581]], dtype=float32),
 array([[0.09994493]], dtype=float32),
 array([[0.03856257]], dtype=float32),
 array([[0.10511776]], dtype=float32),
 array([[0.05621246]], dtype=float32),
 array([[0.05186893]], dtype=float32),
 array([[0.10495745]], dtype=float32),
 array([[0.10517703]], dtype=float32),
 array([[0.04555296]], dty

In [29]:
pd.Series(pred).value_counts()

0    100
Name: count, dtype: int64