In [31]:
import pandas as pd
import logging
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import regularizers
from keras import metrics
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import emoji
import re
logging.basicConfig(format='%(levelname)s %(asctime)s: %(message)s', level=logging.INFO)

In [2]:
data = pd.read_csv("../input/sinhala-hate-speech/final.csv")
print("Before", data.shape)
data = data[pd.notnull(data['full_text_without_emoji'])]
data = data[pd.notnull(data['label'])]
print("After:", data.shape)
data.head()

Before (4508, 43)
After: (4491, 43)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,place,in_reply_to_screen_name,id,retweeted,supplemental_language,entities,source,coordinates,...,user_id,favorited,favorite_count,geo,extended_entities,quote_count,in_reply_to_status_id_str,full_text_without_emoji,emoji_count,emoji_meanings
0,0,144,,,1.175814e+18,False,,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,...,2503150696.0,False,11,,,0,,අනේ හුකන්න කියපන් උන්ට,0,[]
1,1,30,,_NayanatharaJ,9.545702e+17,False,,"{'hashtags': [{'text': 'හෙළුවෙන්මවනමු', 'indic...","<a href=""http://twitter.com/download/android"" ...",,...,150624162.0,False,2,,,1,9.545627e+17,පාහර බැල්ලි එනෝ මෙතන මට වැඩ කියාදෙන්න හෙළුවෙන්...,0,[]
2,2,200,,,1.353329e+18,False,,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,23873458.0,False,23,,"{'media': [{'id': 1353328788826951682, 'id_str...",0,,ඉස්සෝ කියන්නෙ කොච්චර උෂ්ණ වුනත් කොහොම හැදුවත් ...,0,[]
3,3,8,,,8.343548e+17,False,,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/android"" ...",,...,7.27767441892315e+17,False,1,,,0,,මහ බැල්ලි,4,"['dog', 'weary face', 'dog face']"
4,4,150,,IsuriUdeshika,1.332201e+18,False,,"{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/android"" ...",,...,1.31388435850831e+18,False,0,,,0,1.33201e+18,වේසි වේසි,0,[]


In [3]:
def tokenize(text: str) -> list:
    # text characters to split is from: https://github.com/madurangasiriwardena/corpus.sinhala.tools
    emojis = ''.join(emj for emj in emoji.UNICODE_EMOJI.keys())
    return [token for token in
            re.split(r'[.…,‌ ¸‚\"/|—¦”‘\'“’´!@#$%^&*+\-£?˜()\[\]{\}:;–Ê  �‪‬‏0123456789' + emojis + ']', text)
            if token != ""]
tokenize("අනේ හුකන්න කියපන් උන්ට")

['අනේ', 'හුකන්න', 'කියපන්', 'උන්ට']

In [4]:
def tokenize_corpus(corpus: list) -> list:
    return [tokenize(text) for text in corpus]
tokenized_corpus = tokenize_corpus(["අනේ හුකන්න කියපන් උන්ට", "පාහර බැල්ලි එනෝ මෙතන මට වැඩ කියාදෙන්න හෙළුවෙන්"])
tokenized_corpus

[['අනේ', 'හුකන්න', 'කියපන්', 'උන්ට'],
 ['පාහර', 'බැල්ලි', 'එනෝ', 'මෙතන', 'මට', 'වැඩ', 'කියාදෙන්න', 'හෙළුවෙන්']]

In [5]:
MAX_WORD_COUNT = 60 
DATA_SET_CLASSES = {
    0: [0, 1],
    1: [1, 0]
}

In [6]:
def transform_class_to_one_hot_representation(classes: list):
    return np.array([DATA_SET_CLASSES[cls] for cls in classes])
transform_class_to_one_hot_representation([1,0,1])

array([[1, 0],
       [0, 1],
       [1, 0]])

In [7]:
def build_dictionary(corpus_token: list) -> dict:
    word_frequency = {}
    dictionary = {}

    for tweet in corpus_token:
        for token in tweet:
            if token in word_frequency:
                word_frequency[token] += 1
            else:
                word_frequency[token] = 1

    frequencies = list(word_frequency.values())
    unique_words = list(word_frequency.keys())

    # sort words by its frequency
    frequency_indexes = np.argsort(frequencies)[::-1]  # reverse for descending
    for index, frequency_index in enumerate(frequency_indexes):
        # 0 is not used and 1 is for UNKNOWN
        dictionary[unique_words[frequency_index]] = index + 2

    return dictionary
print(tokenized_corpus)
dictionary = build_dictionary(tokenized_corpus)
dictionary

[['අනේ', 'හුකන්න', 'කියපන්', 'උන්ට'], ['පාහර', 'බැල්ලි', 'එනෝ', 'මෙතන', 'මට', 'වැඩ', 'කියාදෙන්න', 'හෙළුවෙන්']]


{'හෙළුවෙන්': 2,
 'කියාදෙන්න': 3,
 'වැඩ': 4,
 'මට': 5,
 'මෙතන': 6,
 'එනෝ': 7,
 'බැල්ලි': 8,
 'පාහර': 9,
 'උන්ට': 10,
 'කියපන්': 11,
 'හුකන්න': 12,
 'අනේ': 13}

In [8]:
def transform_to_dictionary_values(corpus_token: list, dictionary: dict) -> list:
    x_corpus = []
    for tweet in corpus_token:
        # 1 is for unknown (not in dictionary)
        x_corpus.append([dictionary[token] if token in dictionary else 1 for token in tweet])

    return x_corpus
transform_to_dictionary_values(tokenized_corpus, dictionary)

[[13, 12, 11, 10], [9, 8, 7, 6, 5, 4, 3, 2]]

In [9]:
data_set = data.values

In [10]:
DATA_SET_TEXT = 40
logging.info("Tokenizing the corpus")
corpus_token = tokenize_corpus(data_set[:, DATA_SET_TEXT])

In [11]:
logging.info("Building the dictionary")
dictionary = build_dictionary(corpus_token)
dictionary_length = len(dictionary) + 2  # 0 is not used and 1 is for UNKNOWN
dictionary_length

17532

In [12]:
logging.info("Transforming the corpus to dictionary values")
x_corpus = transform_to_dictionary_values(corpus_token, dictionary)

In [13]:
DATA_SET_CLASS = 29
y_corpus = transform_class_to_one_hot_representation(data_set[:, DATA_SET_CLASS])
y_corpus

array([[1, 0],
       [1, 0],
       [0, 1],
       ...,
       [1, 0],
       [0, 1],
       [0, 1]])

In [14]:
x_corpus = sequence.pad_sequences(x_corpus, maxlen=MAX_WORD_COUNT )

In [26]:
# ################## Deep Neural Network ###################### #
FOLDS_COUNT = 3
MAX_EPOCHS = 5
VALIDATION_TEST_SIZE = 0.12
max_word_count = MAX_WORD_COUNT

# splitting data for 5-fold cross validation
k_fold = StratifiedKFold(n_splits=FOLDS_COUNT, shuffle=True, random_state=18)
# to split, raw format (integer) is required
y_corpus_raw = [0 if cls[1] == 1 else 1 for cls in y_corpus]

In [41]:
fold = 0
for train_n_validation_indexes, test_indexes in k_fold.split(x_corpus, y_corpus_raw):
    x_train_n_validation = x_corpus[train_n_validation_indexes]
    y_train_n_validation = y_corpus[train_n_validation_indexes]
    x_test = x_corpus[test_indexes]
    y_test = y_corpus[test_indexes]

    # train and validation data sets
    x_train, x_valid, y_train, y_valid = train_test_split(x_train_n_validation, y_train_n_validation,
                                                          test_size=VALIDATION_TEST_SIZE, random_state=94)

    # ################## Deep Neural Network Model ###################### #
    model = Sequential()
    model.add(Embedding(input_dim=dictionary_length, output_dim=60, input_length=max_word_count))
    model.add(LSTM(600))
    model.add(Dense(units=max_word_count, activation='tanh', kernel_regularizer=regularizers.l2(0.04),
                    activity_regularizer=regularizers.l2(0.015)))
    model.add(Dense(units=max_word_count, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                    bias_regularizer=regularizers.l2(0.01)))
    model.add(Dense(2, activation='softmax', kernel_regularizer=regularizers.l2(0.001)))
    adam_optimizer = Adam(lr=0.001, decay=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=[metrics.Accuracy(), metrics.AUC(), metrics.Precision(), metrics.Recall()])

    print(model.summary())
    # ################## Deep Neural Network Model ###################### #

    best_accuracy = 0
    best_loss = 100000
    best_epoch = 0

    epoch_history = {
        'accuracy': [],
        'val_accuracy': [],
        'loss': [],
        'val_loss': [],
        'auc': [],
        'val_auc': []
    }

    # for each epoch
    for epoch in range(MAX_EPOCHS):
        logging.info("Fold: %d/%d" % (fold, FOLDS_COUNT))
        logging.info("Epoch: %d/%d" % (epoch, MAX_EPOCHS))
        history = model.fit(x=x_train, y=y_train, epochs=1, batch_size=1, validation_data=(x_valid, y_valid),
                            verbose=1, shuffle=True)
        print("history:", history.history)
        # get validation (test) accuracy and loss
#         accuracy = history.history['val_auc'][0]
#         loss = history.history['val_loss'][0]

#         # set epochs' history
#         epoch_history['auc'].append(history.history['auc'][0])
#         epoch_history['val_auc'].append(history.history['val_auc'][0])
#         epoch_history['loss'].append(history.history['loss'][0])
#         epoch_history['val_loss'].append(history.history['val_loss'][0])

        # select best epoch and save to disk
#         if accuracy >= best_accuracy and loss < best_loss + 0.01:
#             logging.info("Saving model")
#             model.save("%s/model_fold_%d.h5" % (directory, fold))

#             best_accuracy = accuracy
#             best_loss = loss
#             best_epoch = epoch
        # end of epoch

    # Plot training & validation accuracy values
#     plt.plot(epoch_history['auc'])
#     plt.plot(epoch_history['val_auc'])
#     plt.title('Model AUC')
#     plt.ylabel('AUC')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Validation'], loc='upper left')
# #     plt.savefig("%s/plot_model_accuracy_%d" % (directory, fold))
#     plt.show()

#     # Plot training & validation loss values
#     plt.plot(epoch_history['loss'])
#     plt.plot(epoch_history['val_loss'])
#     plt.title('Model loss')
#     plt.ylabel('Loss')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Validation'], loc='upper left')
# #     plt.savefig("%s/plot_model_loss_%d" % (directory, fold))
#     plt.show()
    fold += 1

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 60, 60)            1051920   
_________________________________________________________________
lstm_19 (LSTM)               (None, 600)               1586400   
_________________________________________________________________
dense_57 (Dense)             (None, 60)                36060     
_________________________________________________________________
dense_58 (Dense)             (None, 60)                3660      
_________________________________________________________________
dense_59 (Dense)             (None, 2)                 122       
Total params: 2,678,162
Trainable params: 2,678,162
Non-trainable params: 0
_________________________________________________________________
None
history: {'loss': [0.5349509716033936], 'accuracy': [0.0], 'auc_2': [0.9107909202575684], 'precision_1':

In [152]:
# model.fit(x=x_train, y=y_train, epochs=1 validation_data=(x_test, y_test),
#                             verbose=1, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f84b00c6f50>