In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

dataset = ('/Users/sebastian/Desktop/EDM Hate Speech/output_utf8_small.csv')
data = pd.read_csv(dataset, sep=',', header=None)
data.columns = ["id", "content", "deleted"]
data.to_csv('/Users/sebastian/Desktop/EDM Hate Speech/' + "1000.csv", index=False)

X, y = data.iloc[:,:], data.iloc[:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34)

X_train.to_csv('/Users/sebastian/Desktop/EDM Hate Speech/' + "Train.csv", index=False)
X_test.to_csv('/Users/sebastian/Desktop/EDM Hate Speech/' + "Test.csv", index=False)

print(X_train.shape)
print(X_test.shape)

print(X_test)



(801, 3)
(201, 3)
      id                                            content  deleted
313  313  Lies Dir mal das Original-Papier der Hartz Kom...    False
138  138  ... sondern ein Nebenprodukt evolutionärer Kon...    False
221  221                                                 ni    False
924  924  Was willst du damit denn aussagen? Das die Neg...     True
479  479  Ich weiß auch nicht wie sie auf 32000 kommen, ...    False
563  563  Und was genau willst Du dann mit Keines der Ki...     True
860  860  Böller bestellen und hoffen, dass er neutral v...     True
814  814  Ein kluger Stratege wie Putin ist mit einem Ps...     True
611  611                                                Blu     True
737  737  Leider sind die Vergleiche vollkommen passend....     True
87    87  Das Gallup-Institut meint, Sie verbreiten Fake...    False
57    57  Warum?Die illegalen Einreisen würden drastisch...    False
477  477  Ich habe eine VT zu dieser Situation, die zwei...    False
551  551  Nette 

In [56]:
import numpy as np
import pandas as pd

from keras.layers import Dense, Input, Bidirectional, Conv1D, GRU, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.models import Model
from keras.optimizers import Adam

data_path = '/Users/sebastian/Desktop/EDM Hate Speech/'

EMBEDDING_FILE = data_path + 'multilingual_embeddings.txt'

train = pd.read_csv(data_path + 'Train.csv')
test = pd.read_csv(data_path + 'Test.csv')

x_train = train["content"].fillna("fillna")
x_test = test["content"].fillna("fillna")

x_train = x_train.str.lower()
x_test = x_test.str.lower()

y_train = train[["deleted"]].values

max_features = 14526
maxlen = 150
embed_size = 300

tok = text.Tokenizer(num_words=max_features, lower=True)

tok.fit_on_texts(list(x_train) + list(x_test))

x_train = tok.texts_to_sequences(x_train)
x_test = tok.texts_to_sequences(x_test)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tok.word_index

# prepare embedding matrix
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

sequence_input = Input(shape=(maxlen,))

x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(sequence_input)

x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

prediction = Dense(1, activation="sigmoid")(x)
model = Model(sequence_input, prediction)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

batch_size = 32
epochs = 3

bst_model_path = '/Users/sebastian/Desktop/EDM Hate Speech/model1.h5'
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1)
y_pred = model.predict(x_test, verbose=1, batch_size=512)

model.save_weights(bst_model_path)

print(y_pred.shape)

data = pd.DataFrame(data=y_pred)

data.to_csv(data_path + "prediction.csv", index=False)


Epoch 1/3
Epoch 2/3
Epoch 3/3
(201, 1)


In [58]:
import pandas as pd
from sklearn import metrics

dataset = ('/Users/sebastian/Desktop/EDM Hate Speech/prediction.csv')
data = pd.read_csv(dataset, header=None)
data.columns = ["deleted"]
data.drop(data.index[0], inplace=True)

data.to_csv('/Users/sebastian/Desktop/EDM Hate Speech/' + "prediction.csv", index=False)

In [75]:
import pandas as pd
from sklearn import metrics
from sklearn.metrics import classification_report

categories = ["deleted"]


def to_binary(predictions):
    for category in categories:
        predictions[category] = [1 if row > 0.5 else 0 for row in predictions[category]]
    return predictions


def get_f1_results(truth, predictions, name):
    target_names = ['not toxic','toxic']
    predictions = to_binary(predictions)

    print(name + ": " + str(metrics.classification_report(truth[categories], predictions[categories], target_names=target_names)))
    print("Micro: " + str(metrics.f1_score(truth[categories], predictions[categories], average='micro')))
    print("Macro: " + str(metrics.f1_score(truth[categories], predictions[categories], average='macro')))
    print("Average: " + str(metrics.f1_score(truth[categories], predictions[categories], average='weighted')))

    ## print(classification_report(y_true, y_pred, target_names=target_names))
     ##        precision    recall  f1-score   support

if __name__ == "__main__":
    truth = pd.read_csv("/Users/sebastian/Desktop/EDM Hate Speech/Test.csv")

    prediction1 = pd.read_csv("/Users/sebastian/Desktop/EDM Hate Speech/prediction.csv")

    get_f1_results(truth, prediction1, "Prediction 1")

Prediction 1:              precision    recall  f1-score   support

  not toxic       0.68      0.27      0.39       103
      toxic       0.53      0.87      0.66        98

avg / total       0.61      0.56      0.52       201

Micro: 0.562189054726
Macro: 0.523901808786
Average: 0.52054327844
