# Library 

In [19]:
pip install tensorflow-gpu==2.1



In [20]:
pip install keras==2.3.1



In [21]:
pip install transformers==4.3.0



In [22]:
pip install h5py==2.10.0



In [23]:
pip install pyvi



In [24]:
pip install sentencepiece



# Load model and support functions

In [25]:
import re 

from pyvi.ViTokenizer import ViTokenizer

STOPWORDS = 'drive/My Drive/CODE/HSD/vietnamese-stopwords-dash.txt'
# Load the stop-word list (one entry per line) into a set for O(1) membership tests.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(STOPWORDS, "r", encoding="utf-8") as stopword_file:
    # Only the newline is stripped; any other surrounding whitespace stays part of the entry.
    stopwords = {line.strip('\n') for line in stopword_file}

def filter_stop_words(train_sentences, stop_words):
    """Remove stop words from a sentence.

    Args:
        train_sentences: The text to filter; it is split on whitespace.
        stop_words: Container of tokens to drop (membership is tested with ``in``).

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    kept_tokens = []
    for token in train_sentences.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def deEmojify(text):
    """Return ``text`` with every run of characters from the listed emoji ranges removed.

    Characters outside the four code-point ranges below are left untouched.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class matching one or more consecutive characters in those ranges.
    emoji_run = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_run.sub(r'', text)

def preprocess(text, tokenized = True, lowercased = True):
    """Normalise a piece of text before it is handed to a model.

    Args:
        text: The input string.
        tokenized: When true, pass the text through ``ViTokenizer.tokenize`` first.
        lowercased: When true, lower-case the (possibly tokenized) text.

    Returns:
        The processed string; with both flags false the input is returned unchanged.
    """
    if tokenized:
        text = ViTokenizer.tokenize(text)
    # Stop-word filtering and emoji stripping are currently switched off:
    # text = filter_stop_words(text, stopwords)
    # text = deEmojify(text)
    if lowercased:
        text = text.lower()
    return text

In [26]:
import pickle
import torch

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import load_model

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification

from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

class BuildDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings and labels as per-example tensor dicts."""

    def __init__(self, encodings, labels):
        """Store the inputs as given.

        Args:
            encodings: Mapping of field name to an indexable of per-example values.
            labels: Indexable of per-example labels; its length is the dataset length.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``: every encoding field plus ``'labels'``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# with open('drive/My Drive/CODE/HSD/tokenizer_no_pp.pickle', 'rb') as handle:
#     tokenizer_dnn = pickle.load(handle)

# cnn_aug = load_model('drive/My Drive/CODE/HSD/model_aug/Text_CNN_model_v4_no_pp.h5')

def _load_classifier(checkpoint_dir, tokenizer_name, num_labels=3):
    """Load one sequence-classification model together with its tokenizer.

    Args:
        checkpoint_dir: Path (or name) passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``; the
            slow (non-fast) tokenizer is requested.
        num_labels: Number of output classes for the classification head.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False)
    return model, tokenizer


# NOTE(review): within the visible code, xlm_r / tokenizer_xlm_r are referenced
# only from commented-out lines in predict_ensembles — confirm whether this
# load is still needed.
xlm_r, tokenizer_xlm_r = _load_classifier(
    "drive/My Drive/CODE/HSD/transformer_model/xlm-r-v4-no-pp", "xlm-roberta-base")

phobert, tokenizer_phobert = _load_classifier(
    "drive/My Drive/CODE/HSD/transformer_model/phobert-v4-no-pp", "vinai/phobert-base")

bert4news, tokenizer_bert4news = _load_classifier(
    "drive/My Drive/CODE/HSD/transformer_model/bert4news-v9-no-pp", "NlpHUST/vibert4news-base-cased")

# Checkpoint from the "transformer_model_aug" directory; same base tokenizer as xlm_r.
xlm_r_aug, tokenizer_xlm_r_aug = _load_classifier(
    "drive/My Drive/CODE/HSD/transformer_model_aug/xlm-r-v4-no-pp", "xlm-roberta-base")

Some weights of the model checkpoint at drive/My Drive/CODE/HSD/transformer_model/xlm-r-v4-no-pp were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some weights of the model checkpoint at drive/My Drive/CODE/HSD/transformer_model_aug/xlm-r-v4-no-pp were not used when initializing XLMRobertaForSeque

In [27]:
import numpy as np

def sigmoid_array(x):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-x))``, of a NumPy array or scalar."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def _ensemble_member_scores(model, tokenizer, text, max_length=100):
    """Score ``text`` with one model and return the sigmoid of its raw predictions."""
    encodings = tokenizer([text], truncation=True, padding=True, max_length=max_length)
    # BuildDataset takes its length from the labels, so a single placeholder
    # label (0) is supplied for the single input text.
    dataset = BuildDataset(encodings, [0])
    # NOTE(review): a new Trainer is constructed for every model on every call —
    # consider reusing one per model if this is called in a loop.
    raw_predictions = Trainer(model=model).predict(dataset).predictions
    return sigmoid_array(raw_predictions)


def predict_ensembles(text, lowercased=False, tokenized=False):
    """Classify one piece of text by averaging the scores of three models.

    The text is passed through ``preprocess`` and scored independently by
    ``xlm_r_aug``, ``phobert`` and ``bert4news`` (each with its own tokenizer).
    Each model's raw predictions go through an element-wise sigmoid, the three
    score arrays are averaged, and the highest-scoring class is returned.

    Args:
        text: The input string to classify.
        lowercased: Forwarded to ``preprocess``.
        tokenized: Forwarded to ``preprocess``.

    Returns:
        One of ``"CLEAN"``, ``"OFFENSIVE"`` or ``"HATE"``.
    """
    labels = {
        0: "CLEAN",
        1: "OFFENSIVE",
        2: "HATE"
    }
    p_text = preprocess(text, lowercased=lowercased, tokenized=tokenized)

    # (model, tokenizer) pairs that vote; looked up at call time, as before.
    members = (
        (xlm_r_aug, tokenizer_xlm_r_aug),
        (phobert, tokenizer_phobert),
        (bert4news, tokenizer_bert4news),
    )
    member_scores = [
        _ensemble_member_scores(model, tokenizer, p_text)
        for model, tokenizer in members
    ]

    # Plain mean of the members' scores, accumulated in member order.
    y_pred = sum(member_scores) / len(member_scores)

    return labels[int(np.argmax(y_pred, axis=-1)[0])]

# Test

## Example 

In [12]:
# Classify a sample sentence with the ensemble (defaults: lowercased=False, tokenized=False) and print the label.
print(predict_ensembles("Chời ơi cái quần què dì đây"))

OFFENSIVE


In [13]:
# Classify a sample sentence with the ensemble (defaults: lowercased=False, tokenized=False) and print the label.
print(predict_ensembles("Con bóng Long Xiên"))

HATE


In [14]:
# Classify a sample sentence with the ensemble (defaults: lowercased=False, tokenized=False) and print the label.
print(predict_ensembles("Cô ba vàng mã đi dựt nợ, xạo l"))

HATE


In [15]:
# Classify a sample sentence with the ensemble (defaults: lowercased=False, tokenized=False) and print the label.
print(predict_ensembles("Grrr, ngủ đi. Cắn lon bây giờ"))

OFFENSIVE


In [16]:
# Classify a sample sentence with the ensemble (defaults: lowercased=False, tokenized=False) and print the label.
print(predict_ensembles("Dĩ dãng dơ dáy dễ dì dấu diếm"))

HATE


## Manual input

In [46]:
# Read one line of text from the user and print the ensemble's predicted label for it.
a = input()
print(predict_ensembles(a))

csgt rách 


CLEAN
