In [None]:
import pandas as pd
import logging
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import regularizers
from keras import metrics
from keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import emoji
import re
# Prefix every log line with its level and a timestamp; messages at INFO and above are shown.
logging.basicConfig(format='%(levelname)s %(asctime)s: %(message)s', level=logging.INFO)

In [None]:
# Load the annotated corpus, then discard rows that lack either the text or the label.
data = pd.read_csv("../input/sinhala-hate-speech/final.csv")
print("Before", data.shape)
data = data.dropna(subset=['full_text_without_emoji', 'label'])
print("After:", data.shape)
data.head()

In [None]:
# Compiled lazily on the first tokenize() call and reused afterwards, so the (large) emoji
# character class is not rebuilt for every single document.
_TOKEN_SPLIT_PATTERN = None


def _emoji_characters() -> str:
    """Return the concatenation of every emoji known to the installed `emoji` package."""
    # emoji >= 2.0 exposes EMOJI_DATA; older releases expose UNICODE_EMOJI instead.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        # emoji 1.x nests the table per language ({'en': {...}, 'es': {...}, ...}); flatten it so
        # that the keys are emoji again instead of language codes.
        if emoji_table and all(isinstance(value, dict) for value in emoji_table.values()):
            emoji_table = {emj: None for per_language in emoji_table.values() for emj in per_language}
    return ''.join(emoji_table)


def _token_split_pattern():
    """Return the compiled separator pattern, building it on first use."""
    global _TOKEN_SPLIT_PATTERN
    if _TOKEN_SPLIT_PATTERN is None:
        # text characters to split is from: https://github.com/madurangasiriwardena/corpus.sinhala.tools
        # re.escape keeps any emoji code point from being read as character-class syntax.
        _TOKEN_SPLIT_PATTERN = re.compile(
            r'[.…,‌ ¸‚\"/|—¦”‘\'“’´!@#$%^&*+\-£?˜()\[\]{\}:;–Ê  �‪‬‏0123456789'
            + re.escape(_emoji_characters()) + ']')
    return _TOKEN_SPLIT_PATTERN


def tokenize(text: str) -> list:
    """Split `text` into tokens.

    The text is cut at punctuation, whitespace, digits and emoji characters; the separators
    themselves and any empty fragments between adjacent separators are dropped.

    :param text: raw text of one document
    :return: list of non-empty token strings, in their original order
    """
    return [token for token in _token_split_pattern().split(text) if token != ""]
# Smoke test: tokenize a single sample sentence and display the resulting tokens.
tokenize("අනේ හුකන්න කියපන් උන්ට")

In [None]:
def tokenize_corpus(corpus: list) -> list:
    """Tokenize every document of `corpus`, keeping the documents in their original order.

    :param corpus: iterable of raw text documents
    :return: one token list per document
    """
    tokenized_documents = []
    for document in corpus:
        tokenized_documents.append(tokenize(document))
    return tokenized_documents
# Two sample sentences; the tokenized result is reused by the dictionary demo cells further down.
tokenized_corpus = tokenize_corpus(["අනේ හුකන්න කියපන් උන්ට", "පාහර බැල්ලි එනෝ මෙතන මට වැඩ කියාදෙන්න හෙළුවෙන්"])
tokenized_corpus

In [None]:
# Number of token ids every document is padded / truncated to before it is fed to the network.
MAX_WORD_COUNT = 60
# One-hot target row for each raw class label.
DATA_SET_CLASSES = {0: [0, 1], 1: [1, 0]}

In [None]:
def transform_class_to_one_hot_representation(classes: list):
    """Convert raw class labels into their one-hot rows.

    :param classes: iterable of labels; each one must be a key of DATA_SET_CLASSES
    :return: numpy array with one one-hot row per label
    :raises KeyError: if a label is not a key of DATA_SET_CLASSES
    """
    one_hot_rows = []
    for label in classes:
        one_hot_rows.append(DATA_SET_CLASSES[label])
    return np.array(one_hot_rows)
# Example: the labels 1, 0, 1 become the rows [1, 0], [0, 1], [1, 0].
transform_class_to_one_hot_representation([1,0,1])

In [None]:
def build_dictionary(corpus_token: list) -> dict:
    """Assign an integer id to every distinct token, most frequent token first.

    Ids start at 2: 0 is not used and 1 is for UNKNOWN tokens. Tokens with the same frequency
    are ordered deterministically (the one first seen latest gets the smaller id), so the same
    corpus always produces the same dictionary regardless of NumPy's default sort algorithm.

    :param corpus_token: tokenized corpus, i.e. an iterable of token lists
    :return: mapping token -> id (empty for an empty corpus)
    """
    word_frequency = {}
    for tweet in corpus_token:
        for token in tweet:
            word_frequency[token] = word_frequency.get(token, 0) + 1

    unique_words = list(word_frequency.keys())
    frequencies = list(word_frequency.values())

    # Ascending stable sort, then reversed for descending frequency. The default argsort kind
    # is not stable, which would leave the order of equal-frequency tokens unspecified.
    frequency_indexes = np.argsort(frequencies, kind='stable')[::-1]

    # 0 is not used and 1 is for UNKNOWN, hence the offset of 2
    return {unique_words[frequency_index]: index + 2
            for index, frequency_index in enumerate(frequency_indexes)}
# Demo on the sample sentences. Note that this binds `dictionary` to the sample vocabulary;
# a later cell rebinds it to the dictionary of the full corpus.
print(tokenized_corpus)
dictionary = build_dictionary(tokenized_corpus)
dictionary

In [None]:
def transform_to_dictionary_values(corpus_token: list, dictionary: dict) -> list:
    """Encode a tokenized corpus as lists of dictionary ids.

    :param corpus_token: tokenized corpus, i.e. an iterable of token lists
    :param dictionary: mapping token -> id
    :return: one id list per document; tokens absent from `dictionary` are encoded as 1 (unknown)
    """
    unknown_id = 1
    return [[dictionary.get(token, unknown_id) for token in tweet] for tweet in corpus_token]
# Demo: encode the sample sentences with the sample dictionary built in the previous cell.
transform_to_dictionary_values(tokenized_corpus, dictionary)

In [None]:
# Array view of the frame; the following cells address its columns by position, not by name.
data_set = data.values

In [None]:
# Position of the text column inside data_set.
# NOTE(review): hard-coded index — presumably the 'full_text_without_emoji' column that was
# null-filtered when the CSV was loaded; confirm against the CSV header.
DATA_SET_TEXT = 40
logging.info("Tokenizing the corpus")
corpus_token = tokenize_corpus(data_set[:, DATA_SET_TEXT])

In [None]:
logging.info("Building the dictionary")
# Rebinds `dictionary` (previously the sample-sentence demo dictionary) to the full-corpus one.
dictionary = build_dictionary(corpus_token)
# Token ids start at 2, so the size of the id space (used as the Embedding input_dim) is len + 2.
dictionary_length = len(dictionary) + 2  # 0 is not used and 1 is for UNKNOWN
dictionary_length

In [None]:
logging.info("Transforming the corpus to dictionary values")
# Every document becomes a list of integer token ids (1 for tokens missing from the dictionary).
x_corpus = transform_to_dictionary_values(corpus_token, dictionary)

In [None]:
# Position of the label column inside data_set.
# NOTE(review): hard-coded index — presumably the 'label' column that was null-filtered when
# the CSV was loaded; confirm against the CSV header.
DATA_SET_CLASS = 29
# Every label must be a key of DATA_SET_CLASSES (0 or 1); any other value raises KeyError.
y_corpus = transform_class_to_one_hot_representation(data_set[:, DATA_SET_CLASS])
y_corpus

In [None]:
# Bring every id sequence to a length of MAX_WORD_COUNT (shorter ones are padded, longer ones
# truncated) so the corpus can be indexed as one 2-D array by the cross-validation loop.
# Note: this rebinds x_corpus from the list of id lists to the padded array.
x_corpus = sequence.pad_sequences(x_corpus, maxlen=MAX_WORD_COUNT )
len(x_corpus)

In [None]:
# ################## Deep Neural Network ###################### #
# Number of cross-validation folds, training epochs per fold, and the share of each fold's
# training portion passed to train_test_split as the validation set.
FOLDS_COUNT = 10
MAX_EPOCHS = 3
VALIDATION_TEST_SIZE = 0.12
# Lower-case alias; used as the Embedding input_length when the model is built.
max_word_count = MAX_WORD_COUNT

# splitting data for FOLDS_COUNT-fold stratified cross validation (shuffled, fixed seed)
k_fold = StratifiedKFold(n_splits=FOLDS_COUNT, shuffle=True, random_state=18)
# to split, raw format (integer) is required:
# the one-hot row [0, 1] maps back to label 0, every other row to label 1
y_corpus_raw = [0 if cls[1] == 1 else 1 for cls in y_corpus]

In [None]:
# Stratified k-fold cross-validation: for every fold a fresh model is built, trained on the
# fold's train+validation portion and evaluated on the held-out test portion. The list returned
# by model.evaluate() for each fold is collected in cv_scores.
fold = 0
cv_scores = []
for train_n_validation_indexes, test_indexes in k_fold.split(x_corpus, y_corpus_raw):
    x_train_n_validation = x_corpus[train_n_validation_indexes]
    y_train_n_validation = y_corpus[train_n_validation_indexes]
    x_test = x_corpus[test_indexes]
    y_test = y_corpus[test_indexes]

    # train and validation data sets
    # NOTE(review): x_train / x_valid / y_train / y_valid are only consumed by the commented-out
    # per-epoch loop below; the active fit() call trains on the full x_train_n_validation.
    x_train, x_valid, y_train, y_valid = train_test_split(x_train_n_validation, y_train_n_validation,
                                                          test_size=VALIDATION_TEST_SIZE, random_state=94)

    # NOTE(review): n_timesteps and n_features are not referenced anywhere else in this cell.
    n_timesteps = len(x_train)
    n_features = len(x_train[0])
    # ################## Deep Neural Network Model ###################### #
    # Embedding -> Conv1D -> MaxPooling1D -> Flatten -> Dense(250) -> Dense(2) output
    model = Sequential()
    model.add(Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count))
    model.add(Conv1D(32, 3, padding='same', activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    model.add(Dense(2, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    adam_optimizer = Adam(lr=0.001, decay=0.0001)
    # The order of the metrics here fixes their positions (after the loss) in the evaluate()
    # result that the summary cell indexes by position.
    # NOTE(review): Precision() and Recall() are created without a class_id, so they presumably
    # aggregate over both output units rather than over one class — confirm this is intended.
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=[metrics.CategoricalAccuracy(), metrics.AUC(), metrics.Precision(), metrics.Recall()])
    
    # model.summary() is wrapped in print(), so its return value is printed as well.
    print(model.summary())
    # ################## Deep Neural Network Model ###################### #

    # Bookkeeping for the per-epoch training variant that is commented out below; the active
    # code path never updates these values.
    best_accuracy = 0
    best_loss = 100000
    best_epoch = 0

    epoch_history = {
        'accuracy': [],
        'val_accuracy': [],
        'loss': [],
        'val_loss': [],
        'auc': [],
        'val_auc': []
    }

    # Train for MAX_EPOCHS on the whole train+validation portion (no validation data is passed),
    # then score the model on the fold's held-out test portion.
    history = model.fit(x=x_train_n_validation, y=y_train_n_validation, epochs=MAX_EPOCHS, batch_size=32,
                            verbose=0, shuffle=True)
    scores = model.evaluate(x_test, y_test, verbose=0)
    print(model.metrics_names)
    print("scores:", scores)
    cv_scores.append(scores)
    # --- disabled: per-epoch training with validation tracking and best-model checkpointing ---
    # for each epoch
#     for epoch in range(MAX_EPOCHS):
#         logging.info("Fold: %d/%d" % (fold, FOLDS_COUNT))
#         logging.info("Epoch: %d/%d" % (epoch, MAX_EPOCHS))
#         history = model.fit(x=x_train, y=y_train, epochs=1, batch_size=1, validation_data=(x_valid, y_valid),
#                             verbose=1, shuffle=True)
#         print("history:", history.history)
        # get validation (test) accuracy and loss
#         accuracy = history.history['val_auc'][0]
#         loss = history.history['val_loss'][0]

#         # set epochs' history
#         epoch_history['auc'].append(history.history['auc'][0])
#         epoch_history['val_auc'].append(history.history['val_auc'][0])
#         epoch_history['loss'].append(history.history['loss'][0])
#         epoch_history['val_loss'].append(history.history['val_loss'][0])

        # select best epoch and save to disk
#         if accuracy >= best_accuracy and loss < best_loss + 0.01:
#             logging.info("Saving model")
#             model.save("%s/model_fold_%d.h5" % (directory, fold))

#             best_accuracy = accuracy
#             best_loss = loss
#             best_epoch = epoch
        # end of epoch

    # --- disabled: plots of the per-epoch history gathered by the disabled loop above ---
    # Plot training & validation accuracy values
#     plt.plot(epoch_history['auc'])
#     plt.plot(epoch_history['val_auc'])
#     plt.title('Model AUC')
#     plt.ylabel('AUC')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Validation'], loc='upper left')
# #     plt.savefig("%s/plot_model_accuracy_%d" % (directory, fold))
#     plt.show()

#     # Plot training & validation loss values
#     plt.plot(epoch_history['loss'])
#     plt.plot(epoch_history['val_loss'])
#     plt.title('Model loss')
#     plt.ylabel('Loss')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Validation'], loc='upper left')
# #     plt.savefig("%s/plot_model_loss_%d" % (directory, fold))
#     plt.show()
    # Fold counter; only read by the disabled logging / checkpointing code above.
    fold += 1

In [None]:
# model.fit(x=x_train, y=y_train, epochs=1 validation_data=(x_test, y_test),
#                             verbose=1, shuffle=True)

In [None]:
# Average the per-fold evaluation results collected in cv_scores.
# Every entry is the list returned by model.evaluate(): position 0 (the loss, per
# model.metrics_names) is skipped and positions 1-4 hold the metrics in the order they were
# passed to compile(): categorical accuracy, AUC, precision, recall.
if not cv_scores:
    raise ValueError("cv_scores is empty; run the cross-validation cell before summarising")

# Divide by the number of folds that were actually scored instead of FOLDS_COUNT, so the means
# stay correct when the constant was edited, or the loop interrupted, after scores were collected.
scored_folds = len(cv_scores)


def _mean_metric(position: int) -> float:
    """Return the mean, over all scored folds, of the evaluate() value stored at `position`."""
    total = 0
    for fold_scores in cv_scores:
        total += fold_scores[position]
    return total / scored_folds


mean_accuracy = _mean_metric(1)
mean_auc = _mean_metric(2)
mean_precision = _mean_metric(3)
mean_recall = _mean_metric(4)

# F-score of the averaged precision and recall (not the average of per-fold F-scores);
# reported as 0.0 when both are 0 instead of failing on 0/0.
precision_recall_sum = mean_precision + mean_recall
mean_f_score = 2*mean_precision*mean_recall/precision_recall_sum if precision_recall_sum else 0.0

print("mean_accuracy", mean_accuracy)
print("mean_auc", mean_auc)
print("mean_precision", mean_precision)
print("mean_recall", mean_recall)
print("mean_f_score", mean_f_score)