In [None]:
# Dataset analysis
import pandas as pd 
from pyvi.ViTokenizer import ViTokenizer

DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'
DATA_NEW_10k = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset_new_10k.csv'
data = pd.read_csv(DATA, index_col=False)
data_new = pd.read_csv(DATA_NEW_10k, index_col=False)

data = pd.concat([data, data_new])

data['label_id'] = data['label_id'].fillna(-1).astype(int)

label0 = data.loc[data['label_id']==0]
label1 = data.loc[data['label_id']==1]
label2 = data.loc[data['label_id']==2]

print("Total data:", len(data))
print("Total data in label 0: ", len(label0))
print("Total data in label 1: ", len(label1))
print("Total data in label 2: ", len(label2))

def get_total_words(dt):
    texts = dt['free_text']
    count = 0
    for t in texts:
        tokenized_t = ViTokenizer.tokenize(str(t))
        list_words = tokenized_t.split()
        count = count + len(list_words)

    return count


print("Vocabulary size in labels 0:", get_total_words(label0))
print("Vocabulary size in labels 1:", get_total_words(label1))
print("Vocabulary size in labels 2:", get_total_words(label2))
print("Total Vocabulary size:", get_total_words(data))

print("Average words in labels 0:", get_total_words(label0)/len(label0))
print("Average words in labels 1:", get_total_words(label1)/len(label1))
print("Average words in labels 2:", get_total_words(label2)/len(label2))
print("Average words:", get_total_words(data)/len(data))

data.to_csv('drive/My Drive/CODE/Hate speech detection/data/hsd_data_new.csv', index=False)

Total data: 31337
Total data in label 0:  27045
Total data in label 1:  2463
Total data in label 2:  1829
Vocabulary size in labels 0: 425002
Vocabulary size in labels 1: 28350
Vocabulary size in labels 2: 37025
Total Vocabulary size: 490377
Average words in labels 0: 15.7146237751895
Average words in labels 1: 11.510353227771011
Average words in labels 2: 20.243302351011483
Average words: 15.64849857995341


In [None]:
# GRU - Gate recurrent units

import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Bidirectional, GRU
from keras.layers import Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import f1_score, confusion_matrix

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode

# pre-process function
def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    text = text.lower()
    return text

# Configuration - please change theses setting compatible with yours
EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/model_social/GRU_model_ccSC.h5'
DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'

max_features = 11221
maxlen = 1000
embed_size = 300
batch_size = 1024
epochs = 10

# read data
data = pd.read_csv(DATA, index_col=False)

O_X = data['free_text']
O_y = data['label_id']

train_set = O_X
target_set = O_y

# --------------TRICH XUAT DAC TRUNG -------------------------
tokenizer = text.Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(train_set.astype(str))


#--------------END TRICH XUAT DAC TRUNG -------------------------

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embed_size))
max_features = num_words

for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# ------------------- XAY DUNG MO HINH MANG NEURAL -----------------------
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(80, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
outp = Dense(3, activation="sigmoid")(conc)

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# ------------------- END XAY DUNG MO HINH MANG NEURAL -----------------------

# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0

for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in list(X[train])]
    X_train_fold = tokenizer.texts_to_sequences(X_train_fold)
    X_train_fold = sequence.pad_sequences(X_train_fold, maxlen=maxlen)

    X_test_fold = [preprocess(str(p)) for p in list(X[test])]
    X_test_fold = tokenizer.texts_to_sequences(X_test_fold)
    X_test_fold = sequence.pad_sequences(X_test_fold, maxlen=maxlen)

    y_train_fold = to_categorical(y_train_fold, num_classes=3)
    y_test_fold = y_test_fold

    model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=1)
    prediction = model.predict(X_test_fold, batch_size=batch_size, verbose=1)
    test_pred = prediction.argmax(axis=-1)
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        model.save('drive/My Drive/CODE/Hate speech detection/model_social/GRU/gru_model.h5')
        acc = evaluate

    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))

Using TensorFlow backend.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 1: 0.6535180730154556
[[3684   22   17]
 [ 121   62   21]
 [  56   13   73]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 2: 0.7064977830679461
[[3675   34   14]
 [  87   88   29]
 [  40   17   85]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 3: 0.7761957165200927
[[3690   24    9]
 [  76  107   21]
 [  23   18  101]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 4: 0.8431994987285479
[[3697   18    8]
 [  43  144   17]
 [  19   15  108]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 5: 0.8780400718404611
[[3705   15    3]
 [  37  153   14]
 [   9   15  118]]
average acc: 0.7714902286345007
ave

In [None]:
# Text CNN 
import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Bidirectional, GRU, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout
from keras.layers import Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import f1_score, confusion_matrix

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode

# pre-process function
def preprocess(text):
    text = text.split(" ")
    text = unidecode(text)
    text = text.lower()
    return text


# configuration 
EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/model/TextCNN_model_ccSC_new10k.h5'
DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'

max_features = 20987
maxlen = 1000

embed_size = 300
batch_size = 1024
epochs = 10

# read data
data = pd.read_csv(DATA)
print(len(data))


O_X = data['free_text']
O_y = data['label_id']

train_set = O_X
target_set = O_y

# --------------TRICH XUAT DAC TRUNG -------------------------
# tokenizer = text.Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer = text.Tokenizer(num_words=None, lower=True)
tokenizer.fit_on_texts(train_set.astype(str))


# --------------END TRICH XUAT DAC TRUNG -------------------------
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embed_size))

max_features = num_words

for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


# ------------------- XAY DUNG MO HINH MANG NEURAL -----------------------
filter_sizes = [1,2,3,5]
num_filters = 32

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.4)(x)
x = Reshape((maxlen, embed_size, 1))(x)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
                activation='elu')(x)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
                activation='elu')(x)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
                activation='elu')(x)
conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size), kernel_initializer='normal',
                activation='elu')(x)

maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
maxpool_3 = MaxPool2D(pool_size=(maxlen - filter_sizes[3] + 1, 1))(conv_3)

z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])
z = Flatten()(z)
z = Dropout(0.1)(z)


outp = Dense(3, activation="sigmoid")(z)

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# ------------------- END XAY DUNG MO HINH MANG NEURAL -----------------------

# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0
for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in list(X[train])]
    X_train_fold = tokenizer.texts_to_sequences(X_train_fold)
    X_train_fold = sequence.pad_sequences(X_train_fold, maxlen=maxlen)

    X_test_fold = [preprocess(str(p)) for p in list(X[test])]
    X_test_fold = tokenizer.texts_to_sequences(X_test_fold)
    X_test_fold = sequence.pad_sequences(X_test_fold, maxlen=maxlen)

    y_train_fold = to_categorical(y_train_fold, num_classes=3)
    y_test_fold = y_test_fold

    model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=1)
    prediction = model.predict(X_test_fold, batch_size=batch_size, verbose=1)
    test_pred = prediction.argmax(axis=-1)
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        model.save('drive/My Drive/CODE/Hate speech detection/model_social/TextCNN/textcnn_model_new_10k.h5')
        acc = evaluate
    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))

20345


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 1: 0.6339495489825886
[[3706   12    5]
 [ 125   59   20]
 [  62   20   60]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 2: 0.7238596985331908
[[3686   21   16]
 [  87   97   20]
 [  46   16   80]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 3: 0.8425979096643826
[[3708   11    4]
 [  57  135   12]
 [  30    9  103]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 4: 0.904294585995897
[[3704   13    6]
 [  41  157    6]
 [  14    4  124]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 5: 0.9450676018053382
[[3711   11    1]
 [  25  177    2]
 [   6    5  131]]
average acc: 0.8099538689962793
aver

In [None]:
# Bi LSTM 

import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Bidirectional, GRU, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, \
    GlobalMaxPool1D, LSTM
from keras.layers import Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import f1_score

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode

# Pre-process function
def preprocess(text):
    text = ViTokenizer.tokenize(text)
    text = unidecode(text)
    text = text.lower()
    return text


EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/model/BiLSTM_model_ccSC.h5'

DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'

max_features = 11221
maxlen = 1000
embed_size = 300
batch_size = 1024
epochs = 10

# read data
data = pd.read_csv(DATA)

O_X = data['free_text']
O_y = data['label_id']

train_set = O_X
target_set = O_y

# --------------TRICH XUAT DAC TRUNG -------------------------
# Vectorize text + Prepare GloVe Embedding
tokenizer = text.Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(train_set.astype(str))


# --------------END TRICH XUAT DAC TRUNG -------------------------

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embed_size))
max_features = num_words

for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# ------------------- XAY DUNG MO HINH MANG NEURAL -----------------------
inp = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(3, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# ------------------- END XAY DUNG MO HINH MANG NEURAL -----------------------

# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0
for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in list(X[train])]
    X_train_fold = tokenizer.texts_to_sequences(X_train_fold)
    X_train_fold = sequence.pad_sequences(X_train_fold, maxlen=maxlen)

    X_test_fold = [preprocess(str(p)) for p in list(X[test])]
    X_test_fold = tokenizer.texts_to_sequences(X_test_fold)
    X_test_fold = sequence.pad_sequences(X_test_fold, maxlen=maxlen)

    y_train_fold = to_categorical(y_train_fold, num_classes=3)
    y_test_fold = y_test_fold

    model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=1)
    prediction = model.predict(X_test_fold, batch_size=batch_size, verbose=1)
    test_pred = prediction.argmax(axis=-1)
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        model.save('drive/My Drive/CODE/Hate speech detection/model_social/BiLSTM/bilstm_model.h5')
        acc = evaluate
    
    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 1: 0.5884989887249847
[[5141  179   89]
 [ 269  161   63]
 [ 113   91  162]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 2: 0.6731338404908982
[[5219  122   68]
 [ 192  209   92]
 [  96   52  218]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 3: 0.7690390288716608
[[5286   88   35]
 [ 152  297   44]
 [  82   46  238]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 4: 0.825934514607532
[[5297   80   32]
 [ 106  349   38]
 [  45   43  278]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 5: 0.8572682289639184
[[5342   41   26]
 [ 115  347   31]
 [  46   23  297]]
average acc: 0.7427749203317988
aver

In [None]:
# LSTM 

import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Bidirectional, GRU, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, \
    GlobalMaxPool1D, LSTM
from keras.layers import Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.utils import to_categorical

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation
from keras.layers.normalization import BatchNormalization
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

from sklearn.metrics import f1_score

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode

# support function
def preprocess(text):
    text = ViTokenizer.tokenize(text)
    text = unidecode(text)
    text = text.lower()
    return text

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim

EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/model/LSTM_model_ccSC.h5'

DATA = 'drive/My Drive/CODE/Hate speech detection/data/hsd_data_new.csv'
# DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'

max_features = 11221
maxlen = 1000
embed_size = 300
batch_size = 1024
epochs = 10

num_lstm = 300
num_dense = 256
rate_drop_lstm = 0.25
rate_drop_dense = 0.25

act = 'relu'

# read data
data = pd.read_csv(DATA)

O_X = data['free_text']
O_y = data['label_id']

train_set = O_X
target_set = O_y

# --------------TRICH XUAT DAC TRUNG -------------------------
# Vectorize text + Prepare GloVe Embedding
tokenizer = text.Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(train_set.astype(str))


# --------------END TRICH XUAT DAC TRUNG -------------------------

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embed_size))
max_features = num_words

for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# ------------------- XAY DUNG MO HINH MANG NEURAL -----------------------
inp = Input(shape=(maxlen, ))
embedded_sequences = Embedding(max_features, embed_size)(inp)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,return_sequences=True)
x = lstm_layer(embedded_sequences)
x = Dropout(rate_drop_dense)(x)
merged = Attention(maxlen)(x)
merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)
preds = Dense(6, activation='sigmoid')(merged)

model = Model(inputs=inp, outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# ------------------- END XAY DUNG MO HINH MANG NEURAL -----------------------

# Kfold
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0
for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in list(X[train])]
    X_train_fold = tokenizer.texts_to_sequences(X_train_fold)
    X_train_fold = sequence.pad_sequences(X_train_fold, maxlen=maxlen)

    X_test_fold = [preprocess(str(p)) for p in list(X[test])]
    X_test_fold = tokenizer.texts_to_sequences(X_test_fold)
    X_test_fold = sequence.pad_sequences(X_test_fold, maxlen=maxlen)

    y_train_fold = to_categorical(y_train_fold, num_classes=3)
    y_test_fold = y_test_fold

    model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=1)
    prediction = model.predict(X_test_fold, batch_size=batch_size, verbose=1)
    test_pred = prediction.argmax(axis=-1)
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        model.save('drive/My Drive/CODE/Hate speech detection/model_social/LSTM/bilstm_model.h5')
        acc = evaluate
    
    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))

In [None]:
# SVM
import pandas as pd

# read data
from pyvi.ViTokenizer import ViTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import FeatureUnion
import numpy as np
from unidecode import unidecode
from joblib import dump
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import FunctionTransformer


def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    text = text.lower()
    return text


BADWORDS = 'drive/My Drive/CODE/Hate speech detection/bad_words.txt'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/MODEL/SVM.joblib'
STOPWORDS = 'drive/My Drive/CODE/Hate speech detection/stopwords.txt'

original_data = pd.read_csv('drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv')

O_X = original_data['free_text']
O_y = original_data['label_id']

with open(BADWORDS, "r") as ins:
    badwords = []
    for line in ins:
        dd = line.strip('\n')
        badwords.append(preprocess(dd))

# O_X = [preprocess(str(t)) for t in O_X]

# features extraction
with open(STOPWORDS, "r") as ins:
    stop_words = []
    for line in ins:
        dd = line.strip('\n')
        stop_words.append(dd)


vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words=stop_words, ngram_range=(1, 3), dtype=np.float32)

vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words=stop_words, ngram_range=(3, 6), dtype=np.float32)

bad_words_count = TfidfVectorizer(vocabulary=set(badwords), lowercase=True)

features_extractor = FeatureUnion([                              
    ("vect_word", vect_word),
    ("vect_char", vect_char),
    ("bad_words_count", bad_words_count), 
])

# Train and test division
train_set = O_X
target_set = O_y

features_extractor.fit([preprocess(str(t)) for t in train_set])

# Build Model
model = svm.SVC(kernel='linear', C=1)

# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0
for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(t)) for t in X_train_fold]
    X_train_fold = features_extractor.transform(X_train_fold)

    X_test_fold = [preprocess(str(t)) for t in X_test_fold]
    X_test_fold = features_extractor.transform(X_test_fold)

    model.fit(X_train_fold, y_train_fold)
    prediction = model.predict(X_test_fold)

    test_pred = prediction

    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        dump(model, 'drive/My Drive/CODE/Hate speech detection/model_social/SVM/svm_model.h5')
        acc = evaluate
        
    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print(confuses)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))



FOLD 1: 0.6572873941816839
[[3703   10   10]
 [ 121   68   15]
 [  59   20   63]]
FOLD 2: 0.6580362774100347
[[3683   24   16]
 [ 119   65   20]
 [  47   21   74]]
FOLD 3: 0.647673545712958
[[3688   21   14]
 [ 121   66   17]
 [  45   30   67]]
FOLD 4: 0.6722246266202124
[[3678   27   18]
 [ 118   70   16]
 [  50   16   76]]
FOLD 5: 0.6866230291881834
[[3681   24   18]
 [ 111   72   21]
 [  44   15   83]]
average acc: 0.6643689746226145
[array([[3703,   10,   10],
       [ 121,   68,   15],
       [  59,   20,   63]]), array([[3683,   24,   16],
       [ 119,   65,   20],
       [  47,   21,   74]]), array([[3688,   21,   14],
       [ 121,   66,   17],
       [  45,   30,   67]]), array([[3678,   27,   18],
       [ 118,   70,   16],
       [  50,   16,   76]]), array([[3681,   24,   18],
       [ 111,   72,   21],
       [  44,   15,   83]])]
average conf mat: [[3686.6   21.2   15.2]
 [ 118.    68.2   17.8]
 [  49.    20.4   72.6]]
Best accuracy: 0.6866230291881834


In [None]:
# Logistic 

import pandas as pd
from pyvi.ViTokenizer import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import FeatureUnion
import numpy as np
from unidecode import unidecode
from joblib import dump


def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    text = text.lower()
    return text


MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/MODEL/Logistic.joblib'
STOPWORDS = 'drive/My Drive/CODE/Hate speech detection/stopwords.txt'
original_data = pd.read_csv('drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv')
BADWORDS = 'drive/My Drive/CODE/Hate speech detection/bad_words.txt'

with open(BADWORDS, "r") as ins:
    badwords = []
    for line in ins:
        dd = line.strip('\n')
        badwords.append(preprocess(dd))

O_X = original_data['free_text']
O_y = original_data['label_id']

# features extraction
with open(STOPWORDS, "r") as ins:
    stop_words = []
    for line in ins:
        stop_words.append(line.strip('\n'))


vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words=stop_words, ngram_range=(1, 3), dtype=np.float32)

vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words=stop_words, ngram_range=(3, 6), dtype=np.float32)

bad_words_count = TfidfVectorizer(vocabulary=set(badwords), lowercase=True)

features_extractor = FeatureUnion([
    ("vect_word", vect_word),
    ("vect_char", vect_char),
    ("bad_words_count", bad_words_count)
])

# Train and test division
train_set = O_X
target_set = O_y

features_extractor.fit([preprocess(str(t)) for t in train_set])

# Build Model
model = LogisticRegression(C=2, class_weight='balanced')

# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0

for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in X_train_fold]
    X_train_fold = features_extractor.transform(X_train_fold)
    
    X_test_fold = [preprocess(str(p)) for p in X_test_fold]
    X_test_fold = features_extractor.transform(X_test_fold)
    

    model.fit(X_train_fold, y_train_fold)
    prediction = model.predict(X_test_fold)

    test_pred = prediction
    
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        dump(model, 'drive/My Drive/CODE/Hate speech detection/model_social/Logistic/logistic_model.h5')
        acc = evaluate

    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print(confuses)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


FOLD 1: 0.6351474258640382
[[3535  138   50]
 [  82   91   31]
 [  28   29   85]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


FOLD 2: 0.6765822217757322
[[3556  118   49]
 [  86   94   24]
 [  23   19  100]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


FOLD 3: 0.6418723416760023
[[3543  131   49]
 [  84  100   20]
 [  29   35   78]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


FOLD 4: 0.6542610247540747
[[3550  128   45]
 [  76   95   33]
 [  26   26   90]]
FOLD 5: 0.6685020794405822
[[3577   93   53]
 [  86   87   31]
 [  23   20   99]]
average acc: 0.655273018702086
[array([[3535,  138,   50],
       [  82,   91,   31],
       [  28,   29,   85]]), array([[3556,  118,   49],
       [  86,   94,   24],
       [  23,   19,  100]]), array([[3543,  131,   49],
       [  84,  100,   20],
       [  29,   35,   78]]), array([[3550,  128,   45],
       [  76,   95,   33],
       [  26,   26,   90]]), array([[3577,   93,   53],
       [  86,   87,   31],
       [  23,   20,   99]])]
average conf mat: [[3552.2  121.6   49.2]
 [  82.8   93.4   27.8]
 [  25.8   25.8   90.4]]
Best accuracy: 0.6765822217757322


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Naive Bayes

import pandas as pd
from pyvi.ViTokenizer import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import FeatureUnion
import numpy as np
from unidecode import unidecode
from joblib import dump
from sklearn.model_selection import StratifiedShuffleSplit


def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    text = text.lower()
    return text


MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/MODEL/Logistic.joblib'
STOPWORDS = 'drive/My Drive/CODE/Hate speech detection/stopwords.txt'
BADWORDS = 'drive/My Drive/CODE/Hate speech detection/bad_words.txt'

with open(BADWORDS, "r") as ins:
    badwords = []
    for line in ins:
        dd = line.strip('\n')
        badwords.append(preprocess(dd))

original_data = pd.read_csv('drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv')

O_X = original_data['free_text']
O_y = original_data['label_id']

# features extraction
with open(STOPWORDS, "r") as ins:
    stop_words = []
    for line in ins:
        stop_words.append(line.strip('\n'))


vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words=stop_words, ngram_range=(1, 3), dtype=np.float32)

vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words=stop_words, ngram_range=(3, 6), dtype=np.float32)

bad_words_count = TfidfVectorizer(vocabulary=(set(badwords)), lowercase=True)

features_extractor = FeatureUnion([
    ("vect_word", vect_word),
    ("vect_char", vect_char),
    ("bad_words_count", bad_words_count)
])

# Train and test division
train_set = O_X
target_set = O_y


features_extractor.fit([preprocess(str(t)) for t in train_set])


# Build Model
model = MultinomialNB(alpha=0.1)

# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0
for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in X_train_fold]
    X_train_fold = features_extractor.transform(X_train_fold)
    
    X_test_fold = [preprocess(str(p)) for p in X_test_fold]
    X_test_fold = features_extractor.transform(X_test_fold)
    

    model.fit(X_train_fold, y_train_fold)
    prediction = model.predict(X_test_fold)

    test_pred = prediction
    
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        dump(model, 'drive/My Drive/CODE/Hate speech detection/model_social/Naive Bayes/naive_bayes_model.h5')
        acc = evaluate

    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print(confuses)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))
print("Best accuracy: {}".format(acc))



FOLD 1: 0.6228704416847504
[[3476  154   93]
 [  66   93   45]
 [  26   18   98]]
FOLD 2: 0.6371886559944698
[[3460  171   92]
 [  66  101   37]
 [  21   18  103]]
FOLD 3: 0.6054809730792691
[[3437  188   98]
 [  71   89   44]
 [  25   19   98]]
FOLD 4: 0.6255473053590049
[[3438  172  113]
 [  59  107   38]
 [  25   19   98]]
FOLD 5: 0.5858524487501354
[[3397  206  120]
 [  74   84   46]
 [  20   22  100]]
average acc: 0.6153879649735259
[array([[3476,  154,   93],
       [  66,   93,   45],
       [  26,   18,   98]]), array([[3460,  171,   92],
       [  66,  101,   37],
       [  21,   18,  103]]), array([[3437,  188,   98],
       [  71,   89,   44],
       [  25,   19,   98]]), array([[3438,  172,  113],
       [  59,  107,   38],
       [  25,   19,   98]]), array([[3397,  206,  120],
       [  74,   84,   46],
       [  20,   22,  100]])]
average conf mat: [[3441.6  178.2  103.2]
 [  67.2   94.8   42. ]
 [  23.4   19.2   99.4]]
Best accuracy: 0.6371886559944698


In [None]:
# BiLSTM + CNN
import numpy as np
import pandas as pd
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import f1_score, confusion_matrix

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode

# pre-process function
def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    text = text.lower()
    return text


EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/BiLST_TextCNN_model_ccSC.h5'
DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'

max_features = 11221
maxlen = 1000
embed_size = 300
batch_size = 1024
epochs = 10

# read data
data = pd.read_csv(DATA)

O_X = data['free_text']
O_y = data['label_id']

train_set = O_X
target_set = O_y


# --------------TRICH XUAT DAC TRUNG -------------------------
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(train_set.astype(str))


# --------------END TRICH XUAT DAC TRUNG -------------------------

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# ------------------- XAY DUNG MO HINH MANG NEURAL -----------------------
sequence_input = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool]) 
preds = Dense(3, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
# ------------------- END XAY DUNG MO HINH MANG NEURAL -----------------------


# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

acc = 0

for train, test in kfold.split(X, y):
    X_train_fold = X[train]
    y_train_fold = y[train]

    X_test_fold = X[test]
    y_test_fold = y[test]

    X_train_fold = [preprocess(str(p)) for p in list(X[train])]
    X_train_fold = tokenizer.texts_to_sequences(X_train_fold)
    X_train_fold = sequence.pad_sequences(X_train_fold, maxlen=maxlen)

    X_test_fold = [preprocess(str(p)) for p in list(X[test])]
    X_test_fold = tokenizer.texts_to_sequences(X_test_fold)
    X_test_fold = sequence.pad_sequences(X_test_fold, maxlen=maxlen)

    y_train_fold = to_categorical(y_train_fold, num_classes=3)
    y_test_fold = y_test_fold

    model.fit(X_train_fold, y_train_fold, batch_size=batch_size, epochs=epochs, verbose=1)
    prediction = model.predict(X_test_fold, batch_size=batch_size, verbose=1)
    test_pred = prediction.argmax(axis=-1)
    evaluate = f1_score(y_test_fold, test_pred, average='macro')
    confuse = confusion_matrix(y_test_fold, test_pred, labels=[0, 1, 2])

    print('===============================================')
    print("FOLD {}: {}".format(count, evaluate))
    print(confuse)
    results.append(evaluate)
    confuses.append(confuse)
    print('===============================================')

    if evaluate > acc:
        model.save('drive/My Drive/CODE/Hate speech detection/model_social/BiLSTM-CNN/bilstm_cnn_model.h5')
        acc = evaluate
    count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 1: 0.6208266103807035
[[3714    5    4]
 [ 134   50   20]
 [  75    9   58]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 2: 0.6847656340632519
[[3671   28   24]
 [ 108   72   24]
 [  36   18   88]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 3: 0.7071129362416483
[[3683   34    6]
 [ 109   86    9]
 [  42   24   76]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
FOLD 4: 0.7437169312169312
[[3693   12   18]
 [  98   94   12]
 [  46   10   86]]
Epoch

In [None]:
# SVM + CNN 
import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Bidirectional, GRU, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout
from keras.layers import Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import f1_score, confusion_matrix

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import FeatureUnion
from joblib import dump

# support function
def preprocess_deep(text):
    text = ViTokenizer.tokenize(text)
    text = unidecode(text)
    return text

def preprocess(text):
    text = ViTokenizer.tokenize(text)
    text = unidecode(text)
    text = text.lower()
    return text


#=========================== ENV ========================================
DATA = 'drive/My Drive/CODE/Hate speech detection/data/hatespeech_data.csv'
STOPWORDS = 'drive/My Drive/CODE/Hate speech detection/stopwords.txt'
EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
max_features = 11221
maxlen = 1000
embed_size = 300
batch_size = 1024
epochs = 10

#=========================== TRAIN DAT ==================================
data = pd.read_csv(DATA)

O_X = data['free_text']
O_y = data['label_id']

train_set = O_X
target_set = O_y

#=========================== SVM feature extractor =====================
with open(STOPWORDS, "r") as ins:
    stop_words = []
    for line in ins:
        stop_words.append(line.strip('\n'))

BADWORDS = 'drive/My Drive/CODE/Hate speech detection/bad_words.txt'
with open(BADWORDS, "r") as ins:
    badwords = []
    for line in ins:
        dd = line.strip('\n')
        badwords.append(preprocess(dd))


vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words=stop_words, ngram_range=(1, 3), dtype=np.float32)

vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words=stop_words, ngram_range=(3, 6), dtype=np.float32)

bad_words_count = TfidfVectorizer(vocabulary=list(set(badwords)), lowercase=True)

features_extractor = FeatureUnion([
    ("vect_word", vect_word),
    ("vect_char", vect_char),
    ("bad_words_count", bad_words_count)
])
features_extractor.fit([preprocess(str(t)) for t in train_set])

#============================== TextCNN feature extractor =================
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(train_set.astype(str))

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


#==================== Text CNN neural model =============================
filter_sizes = [1,2,3,5]
num_filters = 32

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.4)(x)
x = Reshape((maxlen, embed_size, 1))(x)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
                activation='elu')(x)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
                activation='elu')(x)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
                activation='elu')(x)
conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size), kernel_initializer='normal',
                activation='elu')(x)

maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
maxpool_3 = MaxPool2D(pool_size=(maxlen - filter_sizes[3] + 1, 1))(conv_3)

z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])
z = Flatten()(z)
z = Dropout(0.1)(z)


outp = Dense(3, activation="sigmoid")(z)

text_cnn_model = Model(inputs=inp, outputs=outp)
text_cnn_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# ========================== SVM model =====================================
svm_model = svm.SVC(kernel='linear', C=1, probability=True)


# Kfold cross validation
results = []
confuses = []
kfold = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

count = 1
X = train_set
y = target_set

for train, test in kfold.split(X, y):
  X_train_fold = X[train]
  y_train_fold = y[train]

  X_test_fold = X[test]
  y_test_fold = y[test].values
  
  # svm
  svm_X_train_fold = [preprocess(str(p)) for p in X_train_fold]
  svm_X_train_fold = features_extractor.transform(svm_X_train_fold)
  
  svm_X_test_fold = [preprocess(str(p)) for p in X_test_fold]
  svm_X_test_fold = features_extractor.transform(svm_X_test_fold)

  svm_model.fit(svm_X_train_fold, y_train_fold)
  svm_prediction = svm_model.predict_proba(svm_X_test_fold)
  svm_prediction_label = svm_model.predict(svm_X_test_fold)

  # TextCNN 
  cnn_X_train_fold = [preprocess_deep(str(p)) for p in list(X[train])]
  cnn_X_train_fold = tokenizer.texts_to_sequences(cnn_X_train_fold)
  cnn_X_train_fold = sequence.pad_sequences(cnn_X_train_fold, maxlen=maxlen)

  cnn_X_test_fold = [preprocess_deep(str(p)) for p in list(X[test])]
  cnn_X_test_fold = tokenizer.texts_to_sequences(cnn_X_test_fold)
  cnn_X_test_fold = sequence.pad_sequences(cnn_X_test_fold, maxlen=maxlen)

  cnn_y_train_fold = to_categorical(y_train_fold, num_classes=3)
  cnn_y_test_fold = cnn_y_train_fold

  text_cnn_model.fit(cnn_X_train_fold, cnn_y_train_fold, batch_size=batch_size, epochs=epochs, verbose=1)
  cnn_prediction = text_cnn_model.predict(cnn_X_test_fold, batch_size=batch_size, verbose=1)
  cnn_prediction_label = cnn_prediction.argmax(axis=-1)

  cnn_evaluate = f1_score(y_test_fold, cnn_prediction_label, average='macro')
  svm_evaluate = f1_score(y_test_fold, svm_prediction_label, average='macro')

  print(cnn_evaluate, svm_evaluate)
  full_predict = []
  for i in range(0, len(svm_prediction)):
    k = np.mean(np.array([svm_prediction[i]*svm_evaluate] + [cnn_prediction[i]*cnn_evaluate]), axis=0)
    full_predict.append(k.argmax(axis=-1))

#   for i in range(0, len(svm_prediction)):
#     if svm_prediction[i] == 0:
#       if cnn_prediction[i] == 1:
#         full_predict.append(cnn_prediction[i])
#       else:
#         full_predict.append(svm_prediction[i])
#     else:
#       full_predict.append(cnn_prediction[i])

  evaluate = f1_score(y_test_fold, full_predict, average='macro')
  confuse = confusion_matrix(y_test_fold, full_predict, labels=[0, 1, 2])

  print('===============================================')
  print("FOLD {}: {}".format(count, evaluate))
  print(confuse)
  results.append(evaluate)
  confuses.append(confuse)
  print('===============================================')

  count = count + 1

print("average acc: {}".format(str(np.mean(results))))
print(confuses)
print("average conf mat: {}".format(np.mean(confuses, axis=-3)))


Using TensorFlow backend.














Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6617138433428232 0.6578158831230843
FOLD 1: 0.664375754525667
[[3699   11   13]
 [ 116   69   19]
 [  56   18   68]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7026724297405482 0.6388371364965847
FOLD 2: 0.6610236677780791
[[3689   22   12]
 [ 121   69   14]
 [  51   23   68]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8369881139781733 0.6916607006719065
FOLD 3: 0.7876391115820739
[[3694   17   12]
 [  90  107    7]
 [  34   11   97]]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 

In [None]:
# example feature

import pandas as pd

# read data
from pyvi.ViTokenizer import ViTokenizer
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import FeatureUnion
import numpy as np
from unidecode import unidecode
from joblib import dump
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import FunctionTransformer


def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    text = text.lower()
    return text


BADWORDS = 'drive/My Drive/HSD/bad_words.txt'
with open(BADWORDS, "r") as ins:
    badwords = []
    for line in ins:
        dd = line.strip('\n')
        badwords.append(preprocess(dd))

print(badwords)

MODEL_FILE = 'drive/My Drive/HSD/MODEL/SVM.joblib'
STOPWORDS = 'drive/My Drive/HSD/stopwords.txt'

original_data = pd.read_csv('drive/My Drive/HSD/data/hatespeech_data.csv')

O_X = original_data['free_text']
O_y = original_data['label_id']

# O_X = [preprocess(str(t)) for t in O_X]

# features extraction
with open(STOPWORDS, "r") as ins:
    stop_words = []
    for line in ins:
        stop_words.append(line.strip('\n'))


vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words=stop_words, ngram_range=(1, 3), dtype=np.float32)

vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words=stop_words, ngram_range=(3, 6), dtype=np.float32)

# bad_words_count = CountVectorizer(max_df=0.85,stop_words=badwords,max_features=10000)
bad_words_count = TfidfVectorizer(vocabulary=list(set(badwords)), lowercase=True)
len_of_text = FunctionTransformer(len)

features_extractor = FeatureUnion([
    ("vect_word", vect_word),
    ("vect_char", vect_char),
    ("bad_words_count", bad_words_count),
])

# THEM VO FEATURE VE DO DAI CAU

# Train and test division
train_set = O_X
target_set = O_y

# OX_train, OX_test, Oy_train, Oy_test = train_test_split(train_set, target_set, test_size=0.1, random_state=42)

training_data = [preprocess(str(t)) for t in train_set]

features_extractor.fit(training_data)
len_of_text.fit(training_data)

print(len_of_text.transform(training_data))

transformed_text = features_extractor.transform(training_data)
# print((transformed_text.get_shape()))
# def get_new_coeff(p_text, badwords, transformed_text):
#     coeff = []
#     # p_text = [preprocess(str(t)) for t in text]
#     for t in p_text:
#         num_of_bad_word = 1
#         for w in t.split():
#             if w in badwords:
#                 num_of_bad_word = num_of_bad_word + 1
#         coeff.append(num_of_bad_word/len(t))

#     for i in range(0, len(coeff)):
#         transformed_text[i] = transformed_text[i] * coeff[i]
#     # return np.array(coeff)
#     return transformed_text

# coeff = get_new_coeff(train_set, badwords)


# print(transformed_text)

In [None]:
# merge 2 dataset
import pandas as pd

a = pd.read_csv('drive/My Drive/CODE/Hate speech detection/raw_data/02_train_text.csv')
b = pd.read_csv('drive/My Drive/CODE/Hate speech detection/raw_data/03_train_label.csv')

c = a.merge(b, on='id')
c.to_csv('drive/My Drive/CODE/Hate speech detection/dataset/hsd_data.csv', index=False)

In [None]:
# error analysis Traditional model
import numpy as np
import pandas as pd
from pyvi.ViTokenizer import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.externals import joblib

MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/model_social/SVM/svm_model.h5'

DATA = 'drive/My Drive/CODE/Hate speech detection/clean_dataset.csv'

def preprocess(text):
    text = ViTokenizer.tokenize(str(text))
    # text = unidecode(text)
    text = text.lower()
    return text

STOPWORDS = 'drive/My Drive/CODE/Hate speech detection/stopwords.txt'
data = pd.read_csv('drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv')

label0 = data.loc[data['label_id']==0]
label1 = data.loc[data['label_id']==1]
label2 = data.loc[data['label_id']==2]

frames = [label0.head(300), label1.head(300), label2.head(100)]
original_data = pd.concat(frames)

O_X = original_data['free_text']
O_y = original_data['label_id']

BADWORDS = 'drive/My Drive/CODE/Hate speech detection/bad_words.txt'
with open(BADWORDS, "r") as ins:
    badwords = []
    for line in ins:
        dd = line.strip('\n')
        badwords.append(preprocess(dd))

with open(STOPWORDS, "r") as ins:
    stop_words = []
    for line in ins:
        dd = line.strip('\n')
        stop_words.append(dd)

# features extraction
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words=stop_words, ngram_range=(1, 3), dtype=np.float32)

vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
                        stop_words=stop_words, ngram_range=(3, 6), dtype=np.float32)

bad_words_count = TfidfVectorizer(vocabulary=list(set(badwords)), lowercase=True)

features_extractor = FeatureUnion([                              
    ("vect_word", vect_word),
    ("vect_char", vect_char),
    ("bad_words_count", bad_words_count), 
])

training_data = [preprocess(t) for t in O_X]
features_extractor.fit([preprocess(t) for t in data['free_text']])
train_data = features_extractor.transform(training_data)

loaded_model = joblib.load(MODEL_FILE)
O_yp=loaded_model.predict(train_data)

original_data['new_label_id'] = O_yp
original_data.to_csv("drive/My Drive/CODE/Hate speech detection/result_data/result_tradional_model.csv", index=False)



In [None]:
# error analysis Deep neural model
import numpy as np
import pandas as pd
from keras.layers import Dense, Input, Bidirectional, GRU, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout
from keras.layers import Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import f1_score, confusion_matrix

from pyvi.ViTokenizer import ViTokenizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from unidecode import unidecode
from keras.models import load_model

# support function
def preprocess(text):
    text = ViTokenizer.tokenize(text)
    # text = unidecode(text)
    # text = text.lower()
    return text


EMBEDDING_FILE = 'drive/My Drive/CODE/Hate speech detection/embedding/cc.vi.300.vec'
MODEL_FILE = 'drive/My Drive/CODE/Hate speech detection/model_social/TextCNN/textcnn_model.h5'

DATA = 'drive/My Drive/CODE/Hate speech detection/data/clean_dataset.csv'

max_features = 11221
maxlen = 1000
embed_size = 300
batch_size = 1024
epochs = 10

# read data
data = pd.read_csv(DATA)

label0 = data.loc[data['label_id']==0]
label1 = data.loc[data['label_id']==1]
label2 = data.loc[data['label_id']==2]

frames = [label0.head(300), label1.head(300), label2.head(200)]
original_data = pd.concat(frames)

O_X = original_data['free_text']
O_y = original_data['label_id']


# --------------TRICH XUAT DAC TRUNG -------------------------
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(data['free_text'].astype(str))

# --------------END TRICH XUAT DAC TRUNG -------------------------

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue

    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


train_x = tokenizer.texts_to_sequences(O_X)
train_x = sequence.pad_sequences(train_x, maxlen=maxlen)
train_y = to_categorical(O_y, num_classes=3)

model = load_model(MODEL_FILE)

pred = model.predict(train_x, batch_size=batch_size, verbose=1)
O_yp = pred.argmax(axis=-1)

original_data['new_label_id'] = O_yp
original_data.to_csv("drive/My Drive/CODE/Hate speech detection/result_data/result_deep_model.csv", index=False)

Using TensorFlow backend.






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.










Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |█▍                              | 10kB 22.0MB/s eta 0:00:01[K     |██▊                             | 20kB 3.0MB/s eta 0:00:01[K     |████▏                           | 30kB 4.0MB/s eta 0:00:01[K     |█████▌                          | 40kB 4.2MB/s eta 0:00:01[K     |██████▉                         | 51kB 3.5MB/s eta 0:00:01[K     |████████▎                       | 61kB 3.9MB/s eta 0:00:01[K     |█████████▋                      | 71kB 4.1MB/s eta 0:00:01[K     |███████████                     | 81kB 4.5MB/s eta 0:00:01[K     |████████████▍                   | 92kB 4.8MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 4.8MB/s eta 0:00:01[K     |███████████████▏                | 112kB 4.8MB/s eta 0:00:01[K     |████████████████▌               | 122kB 4.8MB/

In [None]:
pip install pyvi

Collecting pyvi
[?25l  Downloading https://files.pythonhosted.org/packages/10/e1/0e5bc6b5e3327b9385d6e0f1b0a7c0404f28b74eb6db59a778515b30fd9c/pyvi-0.1-py2.py3-none-any.whl (8.5MB)
[K     |████████████████████████████████| 8.5MB 2.8MB/s 
Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 42.1MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.7 pyvi-0.1 sklearn-crfsuite-0.3.6
