In [None]:
import pandas as pd
import logging
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import regularizers
from keras import metrics
from keras.layers import Dense, LSTM, Conv1D, Dropout, MaxPooling1D, Flatten, Flatten, GRU, Average
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import emoji
import re
import glob
logging.basicConfig(format='%(levelname)s %(asctime)s: %(message)s', level=logging.INFO)

In [None]:
# Load the labelled corpus, then discard every row that lacks either the
# cleaned text or the class label; the shape is printed before and after.
data = pd.read_csv("../input/sinhala-hate-speech/final.csv")
print("Before", data.shape)
data = data.dropna(subset=['full_text_without_emoji', 'label'])
print("After:", data.shape)
data.head()

In [None]:
_TOKEN_SPLIT_PATTERN = None  # compiled on the first tokenize() call, then reused


def _emoji_characters() -> str:
    """Return every emoji known to the installed ``emoji`` package as one string.

    The package has exposed its table under different layouts over time, so
    the lookup goes from the newest layout to the oldest:

    * ``emoji.EMOJI_DATA`` (newer releases): ``{emoji: metadata}``
    * ``emoji.UNICODE_EMOJI`` keyed by language code: ``{'en': {emoji: name}, ...}``
    * ``emoji.UNICODE_EMOJI`` flat: ``{emoji: name}``

    Iterating the language-keyed layout directly would yield 'en', 'es', ...
    and make the tokenizer split on those Latin letters, hence the unwrapping.
    """
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        table = emoji.UNICODE_EMOJI
        if isinstance(table.get('en'), dict):
            table = table['en']
    return ''.join(table)


def tokenize(text: str) -> list:
    """Split ``text`` into tokens on punctuation, whitespace, digits and emoji.

    Args:
        text: the raw string to split.

    Returns:
        The non-empty fragments between separator characters, in order.
    """
    # text characters to split is from: https://github.com/madurangasiriwardena/corpus.sinhala.tools
    global _TOKEN_SPLIT_PATTERN
    if _TOKEN_SPLIT_PATTERN is None:
        # Build the (large) character class once instead of on every call.
        # re.escape keeps any regex-significant code point inside an emoji
        # sequence (e.g. '#', '*') literal within the class.
        _TOKEN_SPLIT_PATTERN = re.compile(
            r'[.…,‌ ¸‚\"/|—¦”‘\'“’´!@#$%^&*+\-£?˜()\[\]{\}:;–Ê  �‪‬‏0123456789'
            + re.escape(_emoji_characters()) + ']')
    return [token for token in _TOKEN_SPLIT_PATTERN.split(text) if token != ""]
tokenize("අනේ හුකන්න කියපන් උන්ට")

In [None]:
def tokenize_corpus(corpus: list) -> list:
    """Tokenize every text in ``corpus``; returns one token list per text, in order."""
    return list(map(tokenize, corpus))
tokenized_corpus = tokenize_corpus(["අනේ හුකන්න කියපන් උන්ට", "පාහර බැල්ලි එනෝ මෙතන මට වැඩ කියාදෙන්න හෙළුවෙන්"])
tokenized_corpus

In [None]:
# Length every id sequence is padded/truncated to; also reused as the width of
# the dense layers in get_lstm_model.
MAX_WORD_COUNT = 60
# One-hot target row for each raw class label: 0 -> [0, 1], 1 -> [1, 0].
DATA_SET_CLASSES = {0: [0, 1], 1: [1, 0]}

In [None]:
def transform_class_to_one_hot_representation(classes: list):
    """Map raw class labels to their one-hot rows from ``DATA_SET_CLASSES``.

    Returns a NumPy array with one row per input label. A label that is not a
    key of ``DATA_SET_CLASSES`` raises ``KeyError``.
    """
    one_hot_rows = []
    for cls in classes:
        one_hot_rows.append(DATA_SET_CLASSES[cls])
    return np.array(one_hot_rows)
transform_class_to_one_hot_representation([1,0,1])

In [None]:
def build_dictionary(corpus_token: list) -> dict:
    """Assign an integer id to every distinct token, most frequent first.

    Args:
        corpus_token: iterable of token lists (one list per tweet).

    Returns:
        ``{token: id}`` where ids start at 2 and grow as frequency decreases.
    """
    counts = {}
    for tweet in corpus_token:
        for token in tweet:
            counts[token] = counts.get(token, 0) + 1

    words = list(counts)
    # ascending argsort, reversed to obtain descending frequency
    descending_positions = np.argsort(list(counts.values()))[::-1]

    # 0 is not used and 1 is for UNKNOWN, so real ids start at 2
    return {words[position]: word_id
            for word_id, position in enumerate(descending_positions, start=2)}
# Smoke test on the two-sentence toy corpus: show the tokens, then the
# vocabulary built from them. `dictionary` is rebound to the full-corpus
# vocabulary in a later cell.
print(tokenized_corpus)
dictionary = build_dictionary(tokenized_corpus)
dictionary

In [None]:
def transform_to_dictionary_values(corpus_token: list, dictionary: dict) -> list:
    """Replace every token by its id from ``dictionary``.

    Args:
        corpus_token: iterable of token lists (one list per tweet).
        dictionary: ``{token: id}`` mapping.

    Returns:
        A list of id lists with the same nesting as ``corpus_token``.
    """
    unknown_id = 1  # 1 is for unknown (not in dictionary)
    return [[dictionary.get(token, unknown_id) for token in tweet]
            for tweet in corpus_token]
# Smoke test: encode the toy corpus with the toy vocabulary built from it.
transform_to_dictionary_values(tokenized_corpus, dictionary)

In [None]:
# Array view of the filtered frame; later cells select its columns by position.
data_set = data.values

In [None]:
# Positional index of the text column inside `data_set`.
# NOTE(review): presumably column 40 is 'full_text_without_emoji' (the column
# that was null-filtered when the CSV was loaded) — confirm against the CSV
# header; a different column could still contain nulls.
DATA_SET_TEXT = 40
logging.info("Tokenizing the corpus")
corpus_token = tokenize_corpus(data_set[:, DATA_SET_TEXT])

In [None]:
logging.info("Building the dictionary")
# Rebinds `dictionary` (until now the toy-corpus vocabulary) to the vocabulary
# of the full corpus.
dictionary = build_dictionary(corpus_token)
# Word ids start at 2, so the id space (used as the Embedding input_dim by the
# model builders) is the vocabulary size plus the two reserved ids.
dictionary_length = len(dictionary) + 2  # 0 is not used and 1 is for UNKNOWN
dictionary_length

In [None]:
logging.info("Transforming the corpus to dictionary values")
# Replace every token with its integer id (1 would mark an out-of-vocabulary token).
x_corpus = transform_to_dictionary_values(corpus_token, dictionary)

In [None]:
# Positional index of the class-label column inside `data_set`.
# NOTE(review): presumably column 29 is 'label' (the column that was
# null-filtered when the CSV was loaded) — confirm against the CSV header.
DATA_SET_CLASS = 29
# One-hot targets, one row per sample, built via DATA_SET_CLASSES.
y_corpus = transform_class_to_one_hot_representation(data_set[:, DATA_SET_CLASS])
y_corpus

In [None]:
# Bring every id sequence to exactly MAX_WORD_COUNT entries using Keras'
# default padding/truncating modes. This rebinds `x_corpus` from the
# list-of-lists form to the value returned by pad_sequences.
x_corpus = sequence.pad_sequences(x_corpus, maxlen=MAX_WORD_COUNT )
print(len(x_corpus))

In [None]:
# ################## Deep Neural Network ###################### #
FOLDS_COUNT = 10
MAX_EPOCHS = 3
VALIDATION_TEST_SIZE = 0.12
max_word_count = MAX_WORD_COUNT

from keras.models import Model, Input
# NOTE(review): `inpiut_shape` (sic) and `model_input` are referenced only by the
# commented-out functional-API model variants; 17532 is an unexplained constant.
inpiut_shape = (17532,)
model_input = Input(shape=inpiut_shape)

# splitting data for FOLDS_COUNT-fold (10-fold, not 5-fold) stratified cross validation
# NOTE(review): `k_fold` is not used by any active code in the visible notebook;
# the evaluation cell builds its own `kfold` with random_state=7.
k_fold = StratifiedKFold(n_splits=FOLDS_COUNT, shuffle=True, random_state=18)
# to split, raw format (integer) is required
# one-hot [0, 1] -> 0 and anything else -> 1, i.e. the inverse of DATA_SET_CLASSES
y_corpus_raw = [0 if cls[1] == 1 else 1 for cls in y_corpus]

In [None]:
def get_lstm_model():
    """Build and compile the LSTM classifier.

    Stack: 60-dimensional embedding over ``dictionary_length`` ids ->
    LSTM(600) -> two L2-regularised dense layers of ``max_word_count`` units
    (tanh, then relu) -> 2-unit softmax output. Compiled with Adam and
    binary cross-entropy.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    layer_stack = [
        Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count),
        LSTM(600),
        Dense(units=max_word_count, activation='tanh',
              kernel_regularizer=regularizers.l2(0.04),
              activity_regularizer=regularizers.l2(0.015)),
        Dense(units=max_word_count, activation='relu',
              kernel_regularizer=regularizers.l2(0.01),
              bias_regularizer=regularizers.l2(0.01)),
        Dense(2, activation='softmax', kernel_regularizer=regularizers.l2(0.001)),
    ]
    model = Sequential(layer_stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.001, decay=0.0001),
        metrics=[metrics.CategoricalAccuracy(), metrics.AUC(),
                 metrics.Precision(), metrics.Recall()],
    )
    return model
    
    
#     x = Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count)(model_input)
#     x = LSTM(600)(x)
#     x = Dense(units=max_word_count, activation='tanh', kernel_regularizer=regularizers.l2(0.04),
#                     activity_regularizer=regularizers.l2(0.015))(x)
#     x = Dense(units=max_word_count, activation='relu', kernel_regularizer=regularizers.l2(0.01),
#                     bias_regularizer=regularizers.l2(0.01))(x)
#     x = Dense(2, activation='softmax', kernel_regularizer=regularizers.l2(0.001))(x)
#     return Model(model_input, x, name='lstm_model')

In [None]:
# Notebook-level LSTM instance; the cross-validation cell rebinds `lstm_model`
# to a freshly built model in every fold.
lstm_model = get_lstm_model()

In [None]:
def get_cnn_model():
    """Build and compile the 1-D convolutional classifier.

    Stack: 60-dimensional embedding over ``dictionary_length`` ids ->
    Conv1D(32 filters, kernel 3, 'same' padding, relu) -> MaxPooling1D ->
    Flatten -> Dense(250, relu) -> 2-unit sigmoid output. Compiled with Adam
    and binary cross-entropy.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    layer_stack = [
        Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count),
        Conv1D(32, 3, padding='same', activation='relu'),
        MaxPooling1D(),
        Flatten(),
        Dense(250, activation='relu'),
        Dense(2, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.001, decay=0.0001),
        metrics=[metrics.CategoricalAccuracy(), metrics.AUC(),
                 metrics.Precision(), metrics.Recall()],
    )
    return model

#     x = Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count)(model_input)
#     x = Conv1D(32, 3, padding='same', activation='relu')(x)
#     x = MaxPooling1D()(x)
#     x = Flatten()(x)
#     x = Dense(250, activation='relu')(x)
#     x = Dense(2, activation='sigmoid')(x)
#     return Model(model_input, x, name='cnn_model')

In [None]:
# Notebook-level CNN instance; the cross-validation cell rebinds `cnn_model`
# to a freshly built model in every fold.
cnn_model = get_cnn_model()

In [None]:
def get_bi_gru_model():
    """Build and compile the GRU classifier.

    Stack: 60-dimensional embedding over ``dictionary_length`` ids ->
    GRU(256, returning the full sequence) -> MaxPooling1D -> Flatten ->
    Dense(250, relu) -> 2-unit sigmoid output. Compiled with Adam and binary
    cross-entropy.

    NOTE(review): despite the "bi" in the name, the GRU layer is not wrapped
    in a bidirectional layer here — confirm whether that is intended.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    layer_stack = [
        Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count),
        GRU(units=256, return_sequences=True),
        MaxPooling1D(),
        Flatten(),
        Dense(250, activation='relu'),
        Dense(2, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.001, decay=0.0001),
        metrics=[metrics.CategoricalAccuracy(), metrics.AUC(),
                 metrics.Precision(), metrics.Recall()],
    )
    return model
#     x = Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count)(model_input)
#     x = GRU(units=256, return_sequences=True)(x)
#     x = MaxPooling1D()(x)
#     x = Flatten()(x)
#     x = Dense(250, activation='relu')(x)
#     x = Dense(2, activation='sigmoid')(x)
#     return Model(model_input, x, name='gru_model')

In [None]:
# Notebook-level GRU instance; the cross-validation cell rebinds `gru_model`
# to a freshly built model in every fold.
gru_model = get_bi_gru_model()

In [None]:
# x_train, x_test, y_train, y_test = train_test_split(x_corpus, y_corpus,
#                                                           test_size=VALIDATION_TEST_SIZE, random_state=94)

In [None]:
NUM_EPOCHS = 3


def compile_and_train(model, num_epochs, x=None, y=None):
    """Fit an already-compiled ``model`` and return the result of ``fit``.

    Despite the name, nothing is compiled here: the ``get_*_model`` builders
    compile their models themselves.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        num_epochs: number of training epochs.
        x: training inputs. When omitted, the notebook-level ``x_train`` is
            used, as in the earlier version of this function.
        y: training targets. When omitted, the notebook-level ``y_train`` is
            used.

    Returns:
        Whatever ``model.fit`` returns (the training history for Keras models).

    Raises:
        NameError: if ``x``/``y`` are omitted and ``x_train``/``y_train`` do
            not exist (the cell that would create them is commented out).
    """
    if x is None or y is None:
        try:
            x = x_train if x is None else x
            y = y_train if y is None else y
        except NameError as err:
            raise NameError(
                "compile_and_train needs training data: pass x= and y=, "
                "or define x_train / y_train first") from err
    # A fifth of the supplied samples is held out by fit() for validation.
    return model.fit(x=x, y=y, batch_size=32,
                     epochs=num_epochs, verbose=1, validation_split=0.2)

In [None]:
# compile_and_train(lstm_model, NUM_EPOCHS)

In [None]:
# compile_and_train(cnn_model, NUM_EPOCHS)

In [None]:
# compile_and_train(gru_model, NUM_EPOCHS)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

def evaluate_perf(pred, actual):
    """Score predicted labels against the reference rows in ``actual``.

    Args:
        pred: predicted labels, one per sample.
        actual: sequence of rows; element ``[1]`` of each row is taken as the
            reference label.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f_score``
        (the last three macro-averaged) and ``auc``.
    """
    reference = [row[1] for row in actual]
    return {
        "accuracy": accuracy_score(reference, pred),
        "precision": precision_score(reference, pred, average='macro'),
        "recall": recall_score(reference, pred, average='macro'),
        "f_score": f1_score(reference, pred, average='macro'),
        'auc': roc_auc_score(reference, pred),
    }

In [None]:
# print(evaluate_accuracy(lstm_model))
# print(evaluate_accuracy(cnn_model))
# print(evaluate_accuracy(gru_model))

In [None]:
# LSTMN_WEIGHT_FILE = os.path.join(os.getcwd(), 'weights', 'sequential_4.{epoch:02d}-{loss:.2f}.hdf5')
# CNN_WEIGHT_FILE = os.path.join(os.getcwd(), 'weights', 'sequential_5.{epoch:02d}-{loss:.2f}.hdf5')
# GRU_WEIGHT_FILE = os.path.join(os.getcwd(), 'weights', 'sequential_6.{epoch:02d}-{loss:.2f}.hdf5')

# conv_pool_cnn_model = conv_pool_cnn(model_input)
# all_cnn_model = all_cnn(model_input)
# nin_cnn_model = nin_cnn(model_input)

# conv_pool_cnn_model.load_weights(CONV_POOL_CNN_WEIGHT_FILE)
# all_cnn_model.load_weights(ALL_CNN_WEIGHT_FILE)
# nin_cnn_model.load_weights(NIN_CNN_WEIGHT_FILE)

# NOTE(review): `models` is not referenced by any active code in the visible
# notebook; the cross-validation cell passes its own list to ensemble_pred.
models = [lstm_model, cnn_model, gru_model]

In [None]:
import numpy as np
from scipy import stats
def ensemble_pred(models, test_set):
    """Majority-vote class prediction over several models.

    Each model's ``predict`` output is reduced to a class index per sample
    with ``argmax``; the class chosen by most models wins. On a tie the
    smallest class index is returned.

    The vote is counted with NumPy rather than ``scipy.stats.mode`` because
    the shape of ``mode``'s result depends on the installed SciPy version,
    and indexing it with ``[0]`` can yield a single scalar instead of the
    per-sample vector.

    Args:
        models: iterable of objects exposing ``predict(test_set, batch_size=...)``
            that returns one row of class scores per sample.
        test_set: inputs forwarded unchanged to every model.

    Returns:
        A list of ints, one predicted class index per sample (empty when
        there are no models or no samples).
    """
    votes = np.array([
        np.argmax(model.predict(test_set, batch_size = 32), axis=1)
        for model in models
    ])
    if votes.size == 0:
        return []
    class_count = int(votes.max()) + 1
    # tallies[c, j] = number of models that voted class c for sample j
    tallies = np.stack([(votes == cls).sum(axis=0) for cls in range(class_count)])
    # argmax keeps the first maximum, i.e. the smallest class index on ties
    return tallies.argmax(axis=0).tolist()

In [None]:
# ens_pred = ensemble_pred(models, x_test)
# ens_pred
# type(ens_pred)

In [None]:
# evaluate_perf(ens_pred, y_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import numpy
from sklearn.metrics import classification_report

# Stratified k-fold evaluation of the majority-vote ensemble. The fold count
# and epoch count come from the shared FOLDS_COUNT / MAX_EPOCHS constants
# instead of repeating the literals here.
cvscores = []
kfold = StratifiedKFold(n_splits=FOLDS_COUNT, shuffle=True, random_state=7)
# StratifiedKFold needs integer class labels rather than one-hot rows
y_corpus_raw = [0 if cls[1] == 1 else 1 for cls in y_corpus]
model_builders = (get_lstm_model, get_cnn_model, get_bi_gru_model)

for train_n_validation_indexes, test_indexes in kfold.split(x_corpus, y_corpus_raw):
    x_train_n_validation = x_corpus[train_n_validation_indexes]
    y_train_n_validation = y_corpus[train_n_validation_indexes]
    x_test = x_corpus[test_indexes]
    y_test = y_corpus[test_indexes]

    # Fresh, untrained models for every fold so no weights carry over.
    fold_models = [build_model() for build_model in model_builders]
    # Keep the notebook-level names pointing at the current fold's models.
    lstm_model, cnn_model, gru_model = fold_models
    for fold_model in fold_models:
        fold_model.fit(x_train_n_validation, y_train_n_validation,
                       epochs=MAX_EPOCHS, batch_size=10, verbose=1)

    # Score the ensemble on the held-out fold.
    prediction = ensemble_pred(fold_models, x_test)
    print(classification_report(list(np.argmax(y_test, axis=1)), prediction, labels=[0, 1]))
    scores = evaluate_perf(prediction, y_test)
    print("score: ", scores)
    cvscores.append(scores)

# evaluate_accuracy(lstm_model)
# def evaluate_accuracy(model):
# #     print(DATA_SET_CLASSES)
#     pred = model.predict(x_test, batch_size = 32)
#     pred = list(np.argmax(pred, axis=1))
# #     print(pred)
# #     pred = [DATA_SET_CLASSES[i][0] for i in pred]
# #     print(pred)
#     y_test_list = [i[1] for i in y_test]
# #     print(y_test_list)
#     return accuracy_score(y_test_list, pred)

In [None]:
# Gather each metric across the folds, then report its arithmetic mean.
acc = [fold_scores['accuracy'] for fold_scores in cvscores]
pre = [fold_scores['precision'] for fold_scores in cvscores]
recall = [fold_scores['recall'] for fold_scores in cvscores]
fscore = [fold_scores['f_score'] for fold_scores in cvscores]
auc = [fold_scores['auc'] for fold_scores in cvscores]

for label, values in (("Accuracy:", acc), ("Precision:", pre), ("Recall:", recall),
                      ("fScore:", fscore), ("auc:", auc)):
    print(label, sum(values) / len(values))