# **Environment Preparing**

In [1]:
!pip install transformers



In [2]:
import transformers
from transformers import TFAutoModel, AutoTokenizer

import random
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

2022-10-14 05:07:20.582706: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-10-14 05:07:20.582828: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

2022-10-14 05:07:28.002528: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-10-14 05:07:28.005242: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-10-14 05:07:28.005276: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-14 05:07:28.005306: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (58b71104ec62): /proc/driver/nvidia/version does not exist
2022-10-14 05:07:28.007908: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

# **Data Preparing**

In [4]:
df = pd.read_csv('../input/indonesia-multi-emotion-prediction/data.csv')
df.sample(10)

Unnamed: 0,Text,Anger,Disgusted,Afraid,Happy,Sad,Shocked
77,dapatkah jika ada yg lupa tidak pakai masker p...,1,0,0,0,0,0
12550,rt anies larang rokok dipajang di rak ritel ka...,0,0,0,0,0,0
17461,salut tidak mudah baginya untuk menjalani kari...,0,0,0,0,0,0
28557,rt boleh request selitkan notes untuk orang te...,0,0,0,0,0,0
19766,rt when black cats prowl and pumpkins gleam ma...,0,0,0,0,0,0
15040,tolong dong jangan sebar2 video jatuh dari got...,0,0,0,0,0,0
1614,mama en ppamu pasti bangga,0,0,0,0,0,0
17435,rt eh mas gk usah sok cari muka sm rakyat buat...,0,0,0,0,0,0
26703,rt the malay word for aroma is raksi,0,0,0,0,0,0
27619,rt ojolali malam jumat berlimpah sholawat doak...,0,0,0,0,0,0


In [5]:
text = df.Text.values
label = df[['Anger', 'Disgusted', 'Afraid', 'Happy', 'Sad', 'Shocked']]

In [6]:
X_train, X_test = train_test_split(np.array(list(range(df.shape[0]))), 
                                                    test_size=0.1,
                                                    random_state=1)
    
X_test, X_val = train_test_split(X_test,test_size=0.5,random_state=1)

In [7]:
train_df = df.iloc[X_train,:].reset_index(drop=True)
test_df = df.iloc[X_test,:].reset_index(drop=True)
val_df = df.iloc[X_val,:].reset_index(drop=True)

In [8]:
train_df.sample(10)

Unnamed: 0,Text,Anger,Disgusted,Afraid,Happy,Sad,Shocked
5806,rt mukanya ngeselin mondar mandir melulu di tl,0,0,0,0,0,0
21263,2 makanya kalimat hak segala bangsa dlm preamb...,0,0,0,1,0,0
8935,berusaha menerimamu apa adanya meskipun kamu k...,0,0,0,0,0,0
12474,rt selamat di dunia sengsara di akhiratlolos d...,0,0,0,0,0,0
24912,rt istri dan anak saya berusia 2 tahun 3 bulan...,0,0,0,0,0,0
8977,hujan lebat pagi hari,0,0,0,0,1,0
16777,kita kaga bisa liat apa siapa yg retweet anyg,0,0,0,0,0,0
15749,mudahan suami dari bisa menyadari kesalahannya...,0,0,0,0,0,0
16677,rt donda glory to god,0,0,0,0,0,0
5100,8 tahun yang kau pinjamkan belum cukup tolongl...,0,0,0,0,0,0


# **Helper Function**

In [9]:
def regular_encode(texts, tokenizer, maxlen=128):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [10]:
def callback():
    cb = []

    reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss',  
                                       factor=0.5, patience=0, 
                                       verbose=1, mode='min', 
                                       epsilon=0.0001, min_lr=0,
                                       restore_best_weights=True)
    cb.append(reduceLROnPlat)
    
    es = EarlyStopping(monitor='val_loss', patience=4, verbose=0,
                       mode='min', restore_best_weights=True)
    
    cb.append(es)
    
    return 

In [11]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.
        Only computes a batch-wise average of recall.
        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.
        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [12]:
def build_model(transformer, learning_rate=1e-5, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    cls_token = Dropout(0.2)(cls_token)
    out = Dense(6, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=learning_rate), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(), tf.keras.metrics.BinaryAccuracy()])
    
    return model

# **Dataset Preparing**

In [13]:
RANDOM_SEED=1
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
STEPS = X_train.shape[0] // BATCH_SIZE

In [14]:
tokenizer = AutoTokenizer.from_pretrained('jplu/tf-xlm-roberta-large')

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

In [15]:
X_train_t, y_train = regular_encode(list(train_df.Text.values), tokenizer), train_df[['Anger', 'Disgusted', 'Afraid', 'Happy', 'Sad', 'Shocked']]
X_test_t, y_test  = regular_encode(list(test_df.Text.values), tokenizer), test_df[['Anger', 'Disgusted', 'Afraid', 'Happy', 'Sad', 'Shocked']]
X_val_t , y_val  = regular_encode(list(val_df.Text.values), tokenizer), val_df[['Anger', 'Disgusted', 'Afraid', 'Happy', 'Sad', 'Shocked']]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train_t, y_train))
    .repeat()
    .shuffle(len(X_train_t),
             seed=RANDOM_SEED)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test_t, y_test))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val_t, y_val))
    .batch(BATCH_SIZE)
)

# **Model Preparing**

In [17]:
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained('jplu/tf-xlm-roberta-large')
    model = build_model(transformer_layer, max_len=128, learning_rate=5e-6)

Downloading:   0%|          | 0.00/3.27G [00:00<?, ?B/s]

Some layers from the model checkpoint at jplu/tf-xlm-roberta-large were not used when initializing TFXLMRobertaModel: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at jplu/tf-xlm-roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [18]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 128)]             0         
_________________________________________________________________
tfxlm_roberta_model (TFXLMRo TFBaseModelOutputWithPool 559890432 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 1024)              0         
_________________________________________________________________
dropout_73 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 6150      
Total params: 559,896,582
Trainable params: 559,896,582
Non-trainable params: 0
_________________________________________________________________


# **Training**

In [19]:
tf.keras.backend.clear_session()

In [20]:
train_history = model.fit(
    train_dataset,
    steps_per_epoch=STEPS,
    validation_data=test_dataset,
    callbacks=callback(), 
    epochs=5
)

Epoch 1/5


  num_elements)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
pred = model.evaluate(valid_dataset)



In [22]:
text = np.array(['jelas sekali ada yang salah dalam sistem bernegara kita pr besar bagaimana merumuskannya kedepan ket video harus korupsi disini bisa didefinisikan langkah mensiasati anggarant',
                'bukan semua agama itu benar melainkan engkau meyakini bahwa agamamu benar dibarengi sikap saling menghormati jika ada pemeluk agama lain yg menyatakan bahwa agama yg dianutnya adalah benar',
                'saat konstantinopel takluk ayasofya tetap dijaga bahkan saat perang pun rumah ibadah tak boleh diganggu pengrusakan rumah ibadah di kab sintang kalimantan barat sangat disayangkan saya turut mengecam dan berduka cita semoga pelakunya dapat diproses hukum dengan adil',
                'sudah dinyatakan bersalah ya harus dihukum dari sudut pandang fadli zon yg mana yang menggelikan yg menggelikan sudah dinyatakan bersalah tidak hukum fadli zon vonis 4 tahun bui untuk habib rizieq menggelikan',
                'dia nih yang bikin umbrella academy tebel coba aja dia mau nurutin apa kata bapaknya buat gausah aneh aneh pasti umbrella academy tipis banget wkwk'], dtype='object')

In [23]:
text_encode = regular_encode(list(text), tokenizer)



In [24]:
result = model.predict(text_encode)

In [25]:
print('Emosi 1 : Anger')
print(f'Anger     : {round(result[0][0]*100)}%')
print(f'Disgusted : {round(result[0][1]*100)}%')
print(f'Afraid    : {round(result[0][2]*100)}%')
print(f'Happy     : {round(result[0][3]*100)}%')
print(f'Sad       : {round(result[0][4]*100)}%')
print(f'Shocked   : {round(result[0][5]*100)}%')

print('\nEmosi 2 : Happy')
print(f'Anger     : {round(result[1][0]*100)}%')
print(f'Disgusted : {round(result[1][1]*100)}%')
print(f'Afraid    : {round(result[1][2]*100)}%')
print(f'Happy     : {round(result[1][3]*100)}%')
print(f'Sad       : {round(result[1][4]*100)}%')
print(f'Shocked   : {round(result[1][5]*100)}%')

print('\nEmosi 3 : Anger, Sad')
print(f'Anger     : {round(result[2][0]*100)}%')
print(f'Disgusted : {round(result[2][1]*100)}%')
print(f'Afraid    : {round(result[2][2]*100)}%')
print(f'Happy     : {round(result[2][3]*100)}%')
print(f'Sad       : {round(result[2][4]*100)}%')
print(f'Shocked   : {round(result[2][5]*100)}%')

print('\nEmosi 3 : Anger, Disgusted')
print(f'Anger     : {round(result[3][0]*100)}%')
print(f'Disgusted : {round(result[3][1]*100)}%')
print(f'Afraid    : {round(result[3][2]*100)}%')
print(f'Happy     : {round(result[3][3]*100)}%')
print(f'Sad       : {round(result[3][4]*100)}%')
print(f'Shocked   : {round(result[3][5]*100)}%')

print('\nEmosi 3 : Anger, Shocked')
print(f'Anger     : {round(result[4][0]*100)}%')
print(f'Disgusted : {round(result[4][1]*100)}%')
print(f'Afraid    : {round(result[4][2]*100)}%')
print(f'Happy     : {round(result[4][3]*100)}%')
print(f'Sad       : {round(result[4][4]*100)}%')
print(f'Shocked   : {round(result[4][5]*100)}%')

Emosi 1 : Anger
Anger     : 20%
Disgusted : 0%
Afraid    : 0%
Happy     : 1%
Sad       : 12%
Shocked   : 0%

Emosi 2 : Happy
Anger     : 7%
Disgusted : 0%
Afraid    : 0%
Happy     : 35%
Sad       : 3%
Shocked   : 0%

Emosi 3 : Anger, Sad
Anger     : 9%
Disgusted : 0%
Afraid    : 3%
Happy     : 1%
Sad       : 41%
Shocked   : 0%

Emosi 3 : Anger, Disgusted
Anger     : 34%
Disgusted : 0%
Afraid    : 0%
Happy     : 0%
Sad       : 2%
Shocked   : 0%

Emosi 3 : Anger, Shocked
Anger     : 19%
Disgusted : 0%
Afraid    : 0%
Happy     : 1%
Sad       : 2%
Shocked   : 0%
