In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras import layers
from keras.callbacks import EarlyStopping

seed = 59
emb_dim = 256
lstm_dim = 256

np.random.seed(seed)
keras.utils.set_random_seed(seed)

In [2]:
df_tv = pd.read_csv('train.csv')
df_tt = pd.read_csv('test.csv')

In [3]:
df_tv.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [4]:
df_tt.isna().sum()

id              0
comment_text    0
dtype: int64

In [5]:
df_tv.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [6]:
df_tt.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [7]:
df_tv.iloc[5].comment_text

'"\n\nCongratulations from me as well, use the tools well. \xa0· talk "'

In [8]:
df_tv.comment_text = df_tv.comment_text.apply(str.lower)
df_tt.comment_text = df_tt.comment_text.apply(str.lower)

In [9]:
x_tv = df_tv.comment_text
y_tv = df_tv[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [10]:
df_tv.iloc[5].comment_text

'"\n\ncongratulations from me as well, use the tools well. \xa0· talk "'

In [11]:
x_tr, x_vl, y_tr, y_vl = train_test_split(x_tv, y_tv, random_state=seed, test_size=0.2)

In [12]:
x_tr.head()

133963    you are a chicken shit cock sucking pussy bast...
17915     "\nthat would be a ridiculous assumption, cons...
119405                    "\n gone divin' - back by xmas\n"
105068    north & south india \nsince the quirks used in...
62731     control ownership of commmerzbank\nwhy do you ...
Name: comment_text, dtype: object

In [13]:
y_tr.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
133963,1,1,1,0,1,0
17915,0,0,0,0,0,0
119405,0,0,0,0,0,0
105068,0,0,0,0,0,0
62731,0,0,0,0,0,0


In [14]:
output_dim = y_tv.shape[1]
output_dim

6

In [15]:
tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(x_tv.values)

In [16]:
vocab_size = len(tokenizer.word_index)
vocab_size

210338

In [17]:
x_tr_seq = tokenizer.texts_to_sequences(x_tr.values)
x_vl_seq = tokenizer.texts_to_sequences(x_vl.values)
x_tt_seq = tokenizer.texts_to_sequences(df_tt.comment_text.values)

In [18]:
max_len = max([len(e) for e in x_tr_seq])
max_len

1403

In [19]:
x_tr_seq = pad_sequences(x_tr_seq, maxlen=max_len, padding='post', truncating='post')
x_vl_seq = pad_sequences(x_vl_seq, maxlen=max_len, padding='post', truncating='post')
x_tt_seq = pad_sequences(x_tt_seq, maxlen=max_len, padding='post', truncating='post')

In [20]:
x_tr_seq.shape, y_tr.shape

((127656, 1403), (127656, 6))

In [35]:
model = Sequential([
    layers.Input(max_len,),
    layers.Embedding(input_dim=vocab_size + 1, output_dim=emb_dim),
    layers.Bidirectional(layers.LSTM(units=lstm_dim)),
    layers.Dense(output_dim, activation='sigmoid')
])

In [36]:
model.compile(optimizer='adam', metrics=['accuracy'], loss='binary_crossentropy')

In [37]:
model.fit(x_tr_seq, y_tr, validation_data=(x_vl_seq, y_vl), epochs=10, batch_size=256, shuffle=True, callbacks=[EarlyStopping(patience=3)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x7f6533056bb0>

In [38]:
classes = y_tr.columns

def predict(comment):
  comment = comment.lower()
  seq = tokenizer.texts_to_sequences([comment])
  seq = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
  pred = model.predict(seq)[0]
  pred = np.vectorize(round)(pred)
  pred_cls = [c for p, c in zip(pred, classes) if p]
  return pred, pred_cls

In [39]:
predict(':Fuck off, you anti-semitic cunt.  |')



(array([1, 1, 1, 0, 1, 0]), ['toxic', 'severe_toxic', 'obscene', 'insult'])

In [40]:
predict("How dare you vandalize that page about the HMS Beagle! Don't vandalize again, demon!")



(array([0, 0, 0, 0, 0, 0]), [])

In [41]:
predict("::You're funny.  Ugly?  We're dudes on computers, moron.  You are quite astonishingly stupid.")



(array([1, 0, 1, 0, 1, 0]), ['toxic', 'obscene', 'insult'])