# Library

In [8]:
pip install tensorflow-gpu==2.1



In [9]:
pip install keras==2.3.1



In [10]:
pip install transformers==4.3.0



In [11]:
pip install pyvi



In [12]:
pip install sentencepiece



In [13]:
pip install vncorenlp



In [14]:
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

--2022-02-12 02:46:46--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2022-02-12 02:46:46 (307 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2022-02-12 02:46:46--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-voc

# Load data

In [21]:
import pandas as pd 
import numpy as np 

TEST_DATA = 'drive/My Drive/CODE/ViSocial/dataset/test_out_vihsd.csv'

# read data
test_data = pd.read_csv(TEST_DATA)

X_test = test_data['Comment']
y_test = test_data['label_id'].values

In [22]:
import pandas as pd 
import numpy as np 
import re 

from pyvi.ViTokenizer import ViTokenizer

from vncorenlp import VnCoreNLP

vncorenlp = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 

STOPWORDS = 'drive/My Drive/CODE/ViSocial/vietnamese-stopwords-dash.txt'
with open(STOPWORDS, "r") as ins:
    stopwords = []
    for line in ins:
        dd = line.strip('\n')
        stopwords.append(dd)
    stopwords = set(stopwords)

def filter_stop_words(train_sentences, stop_words):
    new_sent = [word for word in train_sentences.split() if word not in stop_words]
    train_sentences = ' '.join(new_sent)
        
    return train_sentences

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def preprocess(text, tokenized = True, lowercased = True):
    text = ViTokenizer.tokenize(text) if tokenized else text
    text = filter_stop_words(text, stopwords)
    text = deEmojify(text)
    text = text.lower() if lowercased else text
    return text

def preprocess_phobert(text, tokenized=True, lowercased=True):
    # text = ViTokenizer.tokenize(text)
    # text = ' '.join(vncorenlp.tokenize(text)[0])
    text = filter_stop_words(text, stopwords)
    text = deEmojify(text)
    text = text.lower() if lowercased else text
    if tokenized:
        pre_text = ""
        sentences = vncorenlp.tokenize(text)
        for sentence in sentences:
            pre_text += " ".join(sentence)
        text = pre_text
    return text
# --------------TRICH XUAT DAC TRUNG -------------------------
def pre_process_features(X, y, tokenized = True, lowercased = True):
    X = [preprocess(str(p), tokenized = tokenized, lowercased = lowercased) for p in list(X)]
    for idx, ele in enumerate(X):
        if not ele:
            np.delete(X, idx)
            np.delete(y, idx)
    return X, y

def pre_process_features_phobert(X, y, tokenized = True, lowercased = True):
    X = [preprocess_phobert(str(p), tokenized = tokenized, lowercased = lowercased) for p in list(X)]
    for idx, ele in enumerate(X):
        if not ele:
            np.delete(X, idx)
            np.delete(y, idx)
    return X, y

def make_featues(X, y, tokenizer, is_one_hot_label=True, sequence_length=100):
    X = tokenizer.texts_to_sequences(X)
    X = sequence.pad_sequences(X, maxlen=sequence_length)
    if is_one_hot_label: 
        y = to_categorical(y, num_classes=3)

    return X, y

# Load saved models

In [25]:
import pickle
import torch

from keras.utils import to_categorical
from keras.preprocessing import text, sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification

from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

from keras.models import load_model

class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


phobert = AutoModelForSequenceClassification.from_pretrained("drive/My Drive/CODE/ViSocial/transformer_model/vihsd_ensemble/phobert", num_labels = 3)
tokenizer_phobert = AutoTokenizer.from_pretrained("vinai/phobert-base",use_fast=False)

xlm_r_aug = AutoModelForSequenceClassification.from_pretrained("drive/My Drive/CODE/ViSocial/transformer_model/vihsd_ensemble/xlm-r", num_labels = 3)
tokenizer_xlm_r_aug = AutoTokenizer.from_pretrained("drive/My Drive/CODE/ViSocial/transformer_model/vihsd_ensemble/xlm-r",use_fast=False)

bert4news = AutoModelForSequenceClassification.from_pretrained("drive/My Drive/CODE/ViSocial/transformer_model/vihsd_ensemble/bert4news", num_labels = 3)
tokenizer_bert4news = AutoTokenizer.from_pretrained("NlpHUST/vibert4news-base-cased",use_fast=False)

Using TensorFlow backend.


Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some weights of the model checkpoint at drive/My Drive/CODE/ViSocial/transformer_model/vihsd_ensemble/xlm-r were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [27]:
test_X, test_y = pre_process_features(X_test, y_test, tokenized=False, lowercased = False)
test_encodings_xlm_r_aug = tokenizer_xlm_r_aug(test_X, truncation=True, padding=True, max_length=100)
test_dataset_xlm_r_aug = BuildDataset(test_encodings_xlm_r_aug, test_y)
pred_xlm_r_aug = Trainer(model=xlm_r_aug).predict(test_dataset_xlm_r_aug)

test_X, test_y = pre_process_features_phobert(X_test, y_test, tokenized=True, lowercased = False)
test_encodings_phobert = tokenizer_phobert(test_X, truncation=True, padding=True, max_length=100)
test_dataset_phobert = BuildDataset(test_encodings_phobert, test_y)
pred_phobert = Trainer(model=phobert).predict(test_dataset_phobert)

test_X, test_y = pre_process_features(X_test, y_test, tokenized=False, lowercased = False)
test_encodings_bert4news = tokenizer_bert4news(test_X, truncation=True, padding=True, max_length=100)
test_dataset_bert4news = BuildDataset(test_encodings_bert4news, test_y)
pred_bert4news = Trainer(model=bert4news).predict(test_dataset_bert4news)

# Hard voting

In [None]:
data_pred = [
#    np.argmax(pred_gru, axis=-1),
#    np.argmax(pred_cnn, axis=-1),
   np.argmax(pred_m_bert_cased.predictions, axis=-1),
#    np.argmax(pred_m_bert_uncased.predictions, axis=-1),
   np.argmax(pred_xlm_r.predictions, axis=-1),
   np.argmax(pred_distilbert.predictions, axis=-1),
   np.argmax(pred_phobert.predictions, axis=-1),
   np.argmax(pred_bert4news.predictions, axis=-1),          
]

In [None]:
final = []
for i in range(0, len(y_test)):
    tmp = [0,0,0]
    for d in data_pred:
        tmp[d[i]] +=1
    final.append(np.argmax(tmp, axis=-1))

In [None]:
cf = confusion_matrix(test_y, final)
print(cf)

evaluation = f1_score(test_y, final, average='micro')

print("F1 - micro: " + str(evaluation))

evaluation = f1_score(test_y, final, average='macro')
print("F1 - macro: " + str(evaluation))

evaluation = accuracy_score(test_y, final)
print("Accuracy: " + str(evaluation))

[[5365   60  123]
 [ 217  131   96]
 [ 260   47  381]]
F1 - micro: 0.8797904191616767
F1 - macro: 0.6392778544733275
Accuracy: 0.8797904191616767


# Soft voting

In [28]:
import numpy as np

def sigmoid_array(x): 
    return 1 / (1 + np.exp(-x))

data_pred = [
    sigmoid_array(pred_xlm_r_aug.predictions),
    sigmoid_array(pred_phobert.predictions),
    sigmoid_array(pred_bert4news.predictions)         
]

In [29]:
final = []
for i in range(0, len(y_test)):
    tmp = [0,0,0]
    for d in data_pred:
        tmp = tmp + d[i]
    tmp = tmp / len(data_pred)
    final.append(np.argmax(tmp, axis=-1))

In [30]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

cf = confusion_matrix(test_y, final)
print(cf)

evaluation = f1_score(test_y, final, average='micro')

print("F1 - micro: " + str(evaluation))

evaluation = f1_score(test_y, final, average='macro')
print("F1 - macro: " + str(evaluation))

evaluation = accuracy_score(test_y, final)
print("Accuracy: " + str(evaluation))

report = classification_report(test_y, final)
print(report)

[[131   1   0]
 [  4   6  18]
 [  9   6  27]]
F1 - micro: 0.8118811881188119
F1 - macro: 0.6208826481068409
Accuracy: 0.8118811881188119
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       132
           1       0.46      0.21      0.29        28
           2       0.60      0.64      0.62        42

    accuracy                           0.81       202
   macro avg       0.66      0.62      0.62       202
weighted avg       0.78      0.81      0.79       202



In [31]:
new_test = test_data
new_test['predicted_label_id'] = final 

new_test.to_csv('drive/My Drive/CODE/ViSocial/results/vihsd_test_out.csv')