# IndoXTC - Fine-tune Full Toxic [XLM-R] Comparable
Exploring Indonesian hate speech/abusive & sentiment text classification using a multilingual language model.

This kernel is a part of my undergraduate final year project.
Check out the full GitHub repository:
https://github.com/ilhamfp/indonesian-text-classification-multilingual

In [None]:
#############################
# Experiment configurations #
#############################

# Dataset names are forwarded to load_experiment_dataset(); the *_PATH
# values are only defined here and are not referenced by the visible cells.
INDO_DATA_NAME = 'toxic'
DATA_PATH_INDO = '../input/simpler-preprocess-indonesian-hate-abusive-text'

FOREIGN_DATA_NAME = 'toxic'
DATA_PATH_FOREIGN = '../input/jigsaw-multilingual-toxic-comment-classification'

# Selects the pretrained checkpoint: 'XLM_R' or 'mBERT'.
MODEL_NAME = 'XLM_R'

# A: train and validate on Indonesian data only.
# B: train on foreign data, validate on Indonesian data.
# C: train on Indonesian + foreign data, validate on Indonesian data.
EXPERIMENT_TYPE = 'A' # A / B / C
# Number of examples kept after splitting (training + validation combined).
TOTAL_DATA = 11852 # 500 / 1000 / 2500 / 5000 / 7500 / 11852
# Type C only: the foreign set is sized as int(TOTAL_DATA * multiplier).
FOREIGN_LANG_DATA_MULT = 0.5 # 0.5 / 1 / 1.5 / 2 / 3
RANDOM_SEED = 1
# Fraction of the data held out for validation.
VALIDATION_DATA = 0.1
EPOCHS = 25
LEARNING_RATE = 5e-6
# When True, the notebook tries to detect a TPU and builds the model inside
# the resulting distribution strategy scope.
USE_TPU = True

In [None]:
######
## model_full
######

import pandas as pd
import os
import random
import numpy as np
import torch
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, EarlyStopping
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, SentencePieceBPETokenizer
from tensorflow.keras import backend as K
    
def set_seed(seed=1):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, PyTorch
            (CPU and all CUDA devices) and TensorFlow.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The model is built and trained with tf.keras, so TensorFlow's global
    # seed has to be fixed as well; seeding only PyTorch leaves weight
    # initialisation and dropout unseeded.
    tf.random.set_seed(seed)
    
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize ``texts`` in one batch and return the input ids as an array.

    Args:
        texts: Sequence of strings handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Object exposing ``batch_encode_plus`` (here a Hugging Face
            tokenizer).
        maxlen: Value forwarded as ``max_length``; padding to that length is
            requested via ``pad_to_max_length=True``.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the encoding.
    """
    # Only the input ids are needed; attention masks and token type ids are
    # explicitly switched off.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

def f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``.

    Values are clipped to [0, 1] and rounded to hard 0/1 decisions before
    counting, and the score is computed over exactly the tensors passed in
    (i.e. batch-wise when Keras calls it per batch).

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        Scalar tensor ``2 * P * R / (P + R + epsilon)``.
    """
    eps = K.epsilon()

    def _count_ones(tensor):
        # Clip into [0, 1], round to 0/1, then count the ones.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    # Precision: hits over everything predicted positive.
    precision_value = hits / (_count_ones(y_pred) + eps)
    # Recall: hits over everything that is actually positive.
    recall_value = hits / (_count_ones(y_true) + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))

def build_model(transformer, learning_rate=1e-5, max_len=512):
    """Wrap ``transformer`` in a compiled two-output Keras classifier.

    Args:
        transformer: Callable Keras model/layer; element 0 of its output is
            indexed as ``[:, 0, :]`` to take the first token's vector.
        learning_rate: Learning rate for the Adam optimizer.
        max_len: Length of the integer token-id input sequence.

    Returns:
        A compiled ``tf.keras`` ``Model`` mapping ``input_word_ids`` to two
        independent sigmoid outputs (used by this notebook as the HS and
        Abusive scores), trained with binary cross-entropy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the representation of the first token as the sentence embedding.
    cls_token = sequence_output[:, 0, :]
    cls_token = Dropout(0.2)(cls_token)
    out = Dense(2, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

def callback():
    """Build the list of Keras callbacks used during training.

    Returns:
        A list with, in order: a ``ReduceLROnPlateau`` that halves the
        learning rate after 2 epochs without ``val_loss`` improvement, a
        ``CSVLogger`` writing to ``log.csv``, and an ``EarlyStopping`` that
        stops after 4 epochs without improvement and restores the best
        weights.
    """
    # `min_delta` replaces the deprecated `epsilon` keyword.
    # `restore_best_weights` is not a ReduceLROnPlateau parameter, so it is
    # passed to EarlyStopping only.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5, patience=2,
                                  verbose=1, mode='min',
                                  min_delta=0.0001, min_lr=0)

    csv_log = CSVLogger('log.csv')

    early_stop = EarlyStopping(monitor='val_loss', patience=4, verbose=0,
                               mode='min', restore_best_weights=True)

    return [reduce_lr, csv_log, early_stop]

In [None]:
######
## load_data
######

# This source code is part of a final year undergraduate project
# on exploring Indonesian hate speech/abusive & sentiment text 
# classification using a multilingual language model
# 
# Check out the full GitHub repository:
# https://github.com/ilhamfp/indonesian-text-classification-multilingual

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader

# Keep a RANDOM_SEED that was already set (e.g. by the experiment
# configuration cell) instead of silently resetting it; fall back to 1 when
# this code runs on its own.
RANDOM_SEED = globals().get('RANDOM_SEED', 1)

def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z, A-Z with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)

def remove_unnecessary_char(text):
    """Strip newlines, URLs and redundant whitespace from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with every newline and every URL (``www.…``, ``http://…``,
        ``https://…``) replaced by a space, runs of spaces collapsed to one,
        and leading/trailing whitespace removed.
    """
    # Raw strings: `\.` and `\s` are regex escapes, not Python string escapes;
    # in a plain literal they are invalid escape sequences.
    text = re.sub(r'\n', ' ', text)  # Remove every '\n'
    # The third alternative (`http?://`) also matches "htt://"; it is kept so
    # the matched set stays exactly what it was.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)  # Remove every URL
    text = re.sub(r'  +', ' ', text)  # Remove extra spaces
    return text.strip()

def preprocess_text(text):
    """Normalise raw text: lower-case, drop URLs, keep only alphanumerics.

    Args:
        text: Raw input string.

    Returns:
        Lower-cased text with newlines and URLs removed, every run of
        non-alphanumeric characters replaced by a single space, and no
        leading or trailing whitespace.
    """
    text = lowercase(text)
    # URLs must be removed BEFORE punctuation is stripped: once '.', ':' and
    # '/' have become spaces the URL pattern can no longer match, and
    # "https://t.co/abc" would survive as "https t co abc".
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)
    # Replacing punctuation at either end leaves a stray space behind.
    return text.strip()

def load_dataset_indonesian(data_name='prosa', data_path=None, data_path_test=None):
    """Load one Indonesian dataset and return it as ``(train, test)`` frames.

    Args:
        data_name: ``'prosa'``, ``'trip_advisor'`` or ``'toxic'``.
        data_path: Optional CSV path overriding the default training file
            (``'trip_advisor'``) or the default full data file (``'toxic'``).
            Not used for ``'prosa'``.
        data_path_test: Optional CSV path overriding the default test file;
            only used for ``'trip_advisor'``.

    Returns:
        Tuple ``(train, test)`` of DataFrames with ``text`` and ``label``
        columns. For the two sentiment sets the label is 1 for positive and
        0 otherwise (``'prosa'`` drops neutral rows first). For ``'toxic'``
        each label is an array holding the row's ``HS`` and ``Abusive``
        values, and 10% of the rows are split off as the test set.

    Raises:
        ValueError: If ``data_name`` is not one of the supported names.
    """
    if data_name == 'prosa':
        train = pd.read_csv('../input/dataset-prosa/data_train_full.tsv', sep='\t', header=None)
        train = train.rename(columns={0: "text", 1: "label"})
        train = train[train['label'] != 'neutral']
        train['label'] = train['label'].apply(lambda x: 1 if x == 'positive' else 0)
        train['text'] = train['text'].apply(preprocess_text)

        test = pd.read_csv('../input/dataset-prosa/data_testing_full.tsv', sep='\t', header=None)
        test = test.rename(columns={0: "text", 1: "label"})
        test = test[test['label'] != 'neutral']
        test['label'] = test['label'].apply(lambda x: 1 if x == 'positive' else 0)
        test['text'] = test['text'].apply(preprocess_text)

    elif data_name == 'trip_advisor':
        if data_path is None:
            train = pd.read_csv('../input/dataset-tripadvisor/train_set.csv')
        else:
            train = pd.read_csv(data_path)

        train = train.rename(columns={"content": "text", "polarity": "label"})
        train['label'] = train['label'].apply(lambda x: 1 if x == "positive" else 0)
        train['text'] = train['text'].apply(preprocess_text)

        if data_path_test is None:
            test = pd.read_csv('../input/dataset-tripadvisor/test_set.csv')
        else:
            test = pd.read_csv(data_path_test)

        test = test.rename(columns={"content": "text", "polarity": "label"})
        test['label'] = test['label'].apply(lambda x: 1 if x == "positive" else 0)
        test['text'] = test['text'].apply(preprocess_text)

    elif data_name == 'toxic':
        if data_path is None:
            data = pd.read_csv('../input/simpler-preprocess-indonesian-hate-abusive-text/preprocessed_indonesian_toxic_tweet.csv')
        else:
            data = pd.read_csv(data_path)

        # Pack the two target columns into one array-valued label per row.
        temp = pd.DataFrame({
                   'HS': data['HS'].values,
                   'Abusive': data['Abusive'].values})

        data['label'] = temp.apply(lambda r: tuple(r), axis=1).apply(np.array)

        data = data[['Tweet', 'label']]
        data = data.rename(columns={'Tweet': 'text'})

        # No preprocess_text() here; the file name suggests the tweets are
        # already preprocessed (NOTE(review): confirm against the dataset).
        X_train, X_test, y_train, y_test = train_test_split(data.text.values,
                                                            data.label.values,
                                                            test_size=0.1,
                                                            random_state=RANDOM_SEED)
        train = pd.DataFrame({'text': X_train,
                              'label': y_train})

        test = pd.DataFrame({'text': X_test,
                             'label': y_test})

    else:
        # Previously an unknown name fell through and crashed on the first
        # use of the never-assigned `train`.
        raise ValueError(
            "Unknown Indonesian dataset {!r}; expected 'prosa', "
            "'trip_advisor' or 'toxic'".format(data_name))

    # The label distribution is not printed; presumably value_counts() was
    # disabled because the 'toxic' labels are arrays (NOTE(review): confirm).
    print("~~~Train Data~~~")
    print('Shape: ', train.shape)
    print(train[0:2])
    print("\nLabel:")

    print("\n~~~Test Data~~~")
    print('Shape: ', test.shape)
    print(test[0:4])
    print("\nLabel:")
    return train, test
    
def load_dataset_foreign(data_name='yelp'):
    """Load one foreign-language training set.

    Args:
        data_name: ``'yelp'`` or ``'toxic'``.

    Returns:
        DataFrame with a preprocessed ``text`` column and a binary ``label``
        column. For ``'yelp'`` the label is 1 where the raw value is 2. For
        ``'toxic'`` the label is 1 where the ``toxic`` score is >= 0.5, and
        at most the first 152111 rows of each class are kept.

    Raises:
        ValueError: If ``data_name`` is not one of the supported names.
    """
    if data_name == 'yelp':
        train = pd.read_csv('../input/yelp-review-dataset/yelp_review_polarity_csv/train.csv', header=None)
        train = train.rename(columns={0: "label", 1: "text"})
        train['label'] = train['label'].apply(lambda x: 1 if x == 2 else 0)
        train['text'] = train['text'].apply(preprocess_text)

    elif data_name == 'toxic':
        data = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv')
        data['toxic'] = data['toxic'].apply(lambda x: 1 if x >= 0.5 else 0)

        data = data[['comment_text', 'toxic']]
        data = data.rename(columns={'comment_text': 'text',
                                    'toxic': 'label'})

        # Take the same number of rows from each class; 152111 is presumably
        # the size of the positive class (NOTE(review): confirm).
        data_pos = data[data['label'] == 1]
        data_neg = data[data['label'] == 0]
        train = pd.concat([data_pos[0:152111],
                           data_neg[0:152111]]).reset_index(drop=True)

        train['text'] = train['text'].apply(preprocess_text)

    else:
        # Previously `train` stayed None and the summary prints below failed
        # with an AttributeError.
        raise ValueError(
            "Unknown foreign dataset {!r}; expected 'yelp' or 'toxic'".format(data_name))

    print("~~~Data~~~")
    print('Shape: ', train.shape)
    print(train[0:2])
    print("\nLabel:")
    print(train.label.value_counts())
    return train

def split_train_test(train_x, train_y, total_data=50, valid_size=0.2):
    """Split into train/validation parts and cap both at ``total_data`` rows.

    Args:
        train_x: Features to split.
        train_y: Labels aligned with ``train_x``.
        total_data: Combined number of rows to keep across both parts.
        valid_size: Fraction passed to ``train_test_split`` and used to size
            the validation part.

    Returns:
        Tuple ``(train_x, train_y, valid_x, valid_y)`` where the validation
        part has at most ``floor(valid_size * total_data)`` rows and the
        training part at most the remainder of ``total_data``.
    """
    n_valid = int(np.floor(valid_size * total_data))
    n_train = total_data - n_valid

    # The whole input is split first (seeded), then each part is truncated.
    x_tr, x_va, y_tr, y_va = train_test_split(train_x,
                                              train_y,
                                              test_size=valid_size,
                                              random_state=RANDOM_SEED)

    return x_tr[:n_train], y_tr[:n_train], x_va[:n_valid], y_va[:n_valid]
    
def load_features(data_path, total_data=50, valid_size=0.2):
    """Load saved training features/labels and split them.

    Args:
        data_path: Directory containing ``train_text.npy`` and
            ``train_label.csv`` (the latter with a ``label`` column).
        total_data: Forwarded to ``split_train_test``.
        valid_size: Forwarded to ``split_train_test``.

    Returns:
        The ``(train_x, train_y, valid_x, valid_y)`` tuple produced by
        ``split_train_test``.
    """
    text_file = '{}/train_text.npy'.format(data_path)
    label_file = '{}/train_label.csv'.format(data_path)

    # NOTE: allow_pickle=True unpickles the file contents; only load files
    # from a trusted source.
    stored_rows = np.load(text_file, allow_pickle=True)
    train_x = np.array(list(stored_rows))
    train_y = pd.read_csv(label_file).label.values

    return split_train_test(train_x,
                            train_y,
                            total_data=total_data,
                            valid_size=valid_size)
    

def load_experiment_features(data_path_indo,
                             data_path_foreign,
                             tipe='A',
                             total_data=50,
                             foreign_mult=1,
                             valid_size=0.2,
                             ):
    """Load precomputed features for one experiment as PyTorch loaders.

    Args:
        data_path_indo: Directory with the Indonesian ``train_text.npy``,
            ``train_label.csv``, ``test_text.npy`` and ``test_label.csv``.
        data_path_foreign: Directory with the foreign training files.
        tipe: Experiment type. ``'A'`` trains and validates on Indonesian
            data; ``'B'`` trains on foreign data and validates on Indonesian
            data; ``'C'`` trains on both and validates on Indonesian data.
        total_data: Number of examples requested from ``load_features``.
        foreign_mult: Type C only; the foreign set is sized as
            ``int(total_data * foreign_mult)``.
        valid_size: Validation fraction.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)`` of
        ``DataLoader`` objects with batch size 32; only the training loader
        shuffles. The test set is always the Indonesian one.

    Raises:
        ValueError: If ``tipe`` is not ``'A'``, ``'B'`` or ``'C'``.
    """
    # Fail early: an unknown type used to surface later as an unrelated
    # error once the never-assigned training arrays were used.
    if tipe not in ('A', 'B', 'C'):
        raise ValueError("tipe must be 'A', 'B' or 'C', got {!r}".format(tipe))

    ##########################
    # Load Preprocessed Data #
    ##########################
    if tipe == 'A':
        train_x, train_y, valid_x, valid_y = load_features(data_path_indo,
                                                           total_data=total_data,
                                                           valid_size=valid_size)

    elif tipe == 'B':
        train_x, train_y, _, _ = load_features(data_path_foreign,
                                               total_data=total_data,
                                               valid_size=valid_size)

        _, _, valid_x, valid_y = load_features(data_path_indo,
                                               total_data=total_data,
                                               valid_size=valid_size)

    else:  # tipe == 'C'
        train_x_indo, train_y_indo, valid_x, valid_y = load_features(data_path_indo,
                                                                     total_data=total_data,
                                                                     valid_size=valid_size)

        # The foreign validation part is not used.
        train_x_foreign, train_y_foreign, _, _ = load_features(data_path_foreign,
                                                               total_data=int(total_data*foreign_mult),
                                                               valid_size=valid_size)

        train_x = np.concatenate([train_x_indo, train_x_foreign])
        train_y = np.concatenate([train_y_indo, train_y_foreign])

    # NOTE: allow_pickle=True unpickles the file contents; only load files
    # from a trusted source.
    test_x = np.array([x for x in np.load('{}/test_text.npy'.format(data_path_indo), allow_pickle=True)])
    test_y = pd.read_csv('{}/test_label.csv'.format(data_path_indo)).label.values

    #########################
    # Convert to dataloader #
    #########################
    batch_size = 32

    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
    test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    return train_loader, valid_loader, test_loader

def load_train_dataset(data_name, total_data=50, valid_size=0.2, is_foreign=False):
    """Load a raw dataset by name and split it into train/validation parts.

    Args:
        data_name: Dataset name forwarded to ``load_dataset_foreign`` or
            ``load_dataset_indonesian``.
        total_data: Forwarded to ``split_train_test``.
        valid_size: Forwarded to ``split_train_test``.
        is_foreign: Load the foreign dataset when True, the Indonesian one
            otherwise.

    Returns:
        Tuple ``(train_x, train_y, valid_x, valid_y)``; the two text parts
        are rebuilt as fresh NumPy arrays.
    """
    # The 'sampai*' prints are progress markers emitted between the steps.
    print('sampai')

    if is_foreign:
        frame = load_dataset_foreign(data_name)
    else:
        # The Indonesian loader also returns a test frame, unused here.
        frame, _ = load_dataset_indonesian(data_name)

    print('sampai2')

    x_tr, y_tr, x_va, y_va = split_train_test(frame.text.values,
                                              frame.label.values,
                                              total_data=total_data,
                                              valid_size=valid_size)
    print('sampai3')

    return np.array(list(x_tr)), y_tr, np.array(list(x_va)), y_va

def load_experiment_dataset(data_name_indo,
                            data_name_foreign,
                            tipe='A',
                            total_data=50,
                            foreign_mult=1,
                            valid_size=0.2):
    """Load the raw text data for one experiment configuration.

    Args:
        data_name_indo: Name of the Indonesian dataset.
        data_name_foreign: Name of the foreign dataset.
        tipe: Experiment type. ``'A'`` trains and validates on Indonesian
            data; ``'B'`` trains on foreign data and validates on Indonesian
            data; ``'C'`` trains on both and validates on Indonesian data.
        total_data: Number of examples requested from ``load_train_dataset``.
        foreign_mult: Type C only; the foreign set is sized as
            ``int(total_data * foreign_mult)``.
        valid_size: Validation fraction.

    Returns:
        ``((train_x, train_y), (valid_x, valid_y), (test_x, test_y))``.
        The training pair is shuffled with ``RANDOM_SEED``; the test pair
        always comes from the Indonesian dataset.

    Raises:
        ValueError: If ``tipe`` is not ``'A'``, ``'B'`` or ``'C'``.
    """
    # Fail early: an unknown type used to load the whole test set and then
    # crash on the never-assigned training arrays.
    if tipe not in ('A', 'B', 'C'):
        raise ValueError("tipe must be 'A', 'B' or 'C', got {!r}".format(tipe))

    #################
    # Load Raw Data #
    #################
    if tipe == 'A':
        train_x, train_y, valid_x, valid_y = load_train_dataset(data_name_indo,
                                                                total_data=total_data,
                                                                valid_size=valid_size,
                                                                is_foreign=False)

    elif tipe == 'B':
        train_x, train_y, _, _ = load_train_dataset(data_name_foreign,
                                                    total_data=total_data,
                                                    valid_size=valid_size,
                                                    is_foreign=True)

        _, _, valid_x, valid_y = load_train_dataset(data_name_indo,
                                                    total_data=total_data,
                                                    valid_size=valid_size,
                                                    is_foreign=False)

    else:  # tipe == 'C'
        train_x_indo, train_y_indo, valid_x, valid_y = load_train_dataset(data_name_indo,
                                                                          total_data=total_data,
                                                                          valid_size=valid_size,
                                                                          is_foreign=False)

        # The foreign validation part is not used.
        train_x_foreign, train_y_foreign, _, _ = load_train_dataset(data_name_foreign,
                                                                    total_data=int(total_data*foreign_mult),
                                                                    valid_size=valid_size,
                                                                    is_foreign=True)

        train_x = np.concatenate([train_x_indo, train_x_foreign])
        train_y = np.concatenate([train_y_indo, train_y_foreign])

    _, test = load_dataset_indonesian(data_name=data_name_indo)
    test_x = np.array([x for x in test.text.values])
    test_y = test.label.values

    # Shuffle the training pair with one shared permutation. This reseeds
    # NumPy's global generator, exactly as before.
    indices = np.arange(len(train_x))
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(indices)
    train_x = train_x[indices]
    train_y = train_y[indices]

    return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)
    


In [None]:
import os
import random
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import transformers
from transformers import TFAutoModel, AutoTokenizer
# Fix the random number generators before any data is loaded or shuffled.
set_seed(seed=RANDOM_SEED)

## TPU Configs

In [None]:
if USE_TPU:
    # Detect hardware, return appropriate distribution strategy
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # No TPU found: fall back to the default strategy below.
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()

    print("REPLICAS: ", strategy.num_replicas_in_sync)
    # Global batch size: 8 examples per replica.
    BATCH_SIZE = 8 * strategy.num_replicas_in_sync

else:
    # No strategy object is created here; the model cell builds the model
    # without a strategy scope in this case.
    BATCH_SIZE = 8 * 8

AUTO = tf.data.experimental.AUTOTUNE
MAX_LEN = 512

if MODEL_NAME == 'XLM_R':
    MODEL = 'jplu/tf-xlm-roberta-large'
elif MODEL_NAME == 'mBERT':
    MODEL = 'bert-base-multilingual-cased'
else:
    # Without this, MODEL stays undefined and the failure only shows up
    # later as a NameError when the tokenizer is loaded.
    raise ValueError(
        "Unknown MODEL_NAME {!r}; expected 'XLM_R' or 'mBERT'".format(MODEL_NAME))

## Load Data

In [None]:
# Load the raw texts and labels for the configured experiment; the test pair
# always comes from the Indonesian dataset.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = load_experiment_dataset(INDO_DATA_NAME,
                                                                                   FOREIGN_DATA_NAME,
                                                                                   tipe=EXPERIMENT_TYPE, 
                                                                                   total_data=TOTAL_DATA, 
                                                                                   foreign_mult=FOREIGN_LANG_DATA_MULT, 
                                                                                   valid_size=VALIDATION_DATA)

In [None]:
# Each label is a two-element array, stored one per row in an object array.
# Rebuild the labels as dense (n_rows, 2) arrays that Keras can consume.
# NOTE(review): this indexing assumes every label has two entries; the
# foreign 'toxic' labels are scalars, so types B and C may fail here - confirm.
y_train = np.array([np.array([label[0], label[1]]) for label in y_train])
y_valid = np.array([np.array([label[0], label[1]]) for label in y_valid])
y_test = np.array([np.array([label[0], label[1]]) for label in y_test])

In [None]:
# Display the shape of the training label array.
y_train.shape

## Tokenizing

In [None]:
# Load the tokenizer that belongs to the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
%%time 

# Replace the raw texts of each split with padded token-id arrays.
x_train = regular_encode(texts=x_train, tokenizer=tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(texts=x_valid, tokenizer=tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(texts=x_test, tokenizer=tokenizer, maxlen=MAX_LEN)

## Build Datasets Objects

In [None]:
# Training pipeline: repeat endlessly, shuffle with a buffer as large as the
# training set, batch, and prefetch. Each epoch's length is set by
# steps_per_epoch at fit time.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.shuffle(len(x_train), seed=RANDOM_SEED)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(AUTO)

# Validation pipeline: fixed order, batched, cached and prefetched.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE)
valid_dataset = valid_dataset.cache()
valid_dataset = valid_dataset.prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(BATCH_SIZE)

## Load Model

In [None]:
%%time
if not USE_TPU:
    # No distribution strategy was created: build the model directly.
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN, learning_rate=LEARNING_RATE)

else:
    # Create the model inside the strategy scope.
    with strategy.scope():
        transformer_layer = TFAutoModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=MAX_LEN, learning_rate=LEARNING_RATE)

model.summary()

## Train

In [None]:
# Number of full batches in one pass over the training data.
n_steps = len(x_train) // BATCH_SIZE
print(n_steps)

In [None]:
# train_dataset repeats indefinitely, so steps_per_epoch defines where each
# epoch ends. The callbacks lower the learning rate, log to CSV and stop
# early based on val_loss.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    callbacks = callback(), 
    epochs=EPOCHS
)

## Save

In [None]:
# Save only the model weights, to 'model.h5'.
model.save_weights('model.h5') 

In [None]:
# Predict on the test inputs; column 0 is later saved as the HS score and
# column 1 as the Abusive score.
test_prediction = model.predict(test_dataset, verbose=1)

In [None]:
# Pair the predicted and true HS values (column 0) and write them to a CSV
# whose name encodes the experiment configuration.
result_HS = pd.DataFrame({
    'y_pred': test_prediction[:, 0],
    'y_true': y_test[:, 0],
})
result_HS.to_csv(
    'result_HS_{}_{}_{}_{}_{}_{}_full.csv'.format(
        INDO_DATA_NAME,
        FOREIGN_DATA_NAME,
        MODEL_NAME,
        EXPERIMENT_TYPE,
        TOTAL_DATA,
        FOREIGN_LANG_DATA_MULT,
    ),
    index=False,
)

In [None]:
# Preview the first rows of the HS results.
result_HS.head()

In [None]:
# Pair the predicted and true Abusive values (column 1) and write them to a
# CSV whose name encodes the experiment configuration.
result_Abusive = pd.DataFrame({
    'y_pred': test_prediction[:, 1],
    'y_true': y_test[:, 1],
})
result_Abusive.to_csv(
    'result_Abusive_{}_{}_{}_{}_{}_{}_full.csv'.format(
        INDO_DATA_NAME,
        FOREIGN_DATA_NAME,
        MODEL_NAME,
        EXPERIMENT_TYPE,
        TOTAL_DATA,
        FOREIGN_LANG_DATA_MULT,
    ),
    index=False,
)

In [None]:
# Preview the first rows of the Abusive results.
result_Abusive.head()