In [None]:
import pandas as pd
import re
from keras import layers,optimizers
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
import tensorflow as tf

In [None]:
# Two training CSVs from the competition input directory.
train1 = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv"
)
train2 = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv"
)
# Round and cast so train2's `toxic` column holds integer values; the
# query('toxic==0') / query('toxic==1') selections further down rely on that.
train2["toxic"] = train2["toxic"].round().astype(int)

# Validation frame is read from the translated dataset; the original
# competition validation file is kept below as a disabled alternative.
#valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
valid = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_valid_translated.csv"
)

In [None]:
# Only the comment text and its label are needed from either source.
label_cols = ['comment_text', 'toxic']
first_source = train1[label_cols]
second_source = train2[label_cols]

# From each source: a fixed-size, seeded sample of the toxic==0 rows plus
# every toxic==1 row, stacked in the order listed.
train = pd.concat([
    first_source.query('toxic==0').sample(n=21384, random_state=0),
    first_source.query('toxic==1'),
    second_source.query('toxic==0').sample(n=112226, random_state=0),
    second_source.query('toxic==1'),
])

In [None]:
# Disabled cell: everything below is wrapped in a string literal, so none of it
# executes (the cell only evaluates to the string itself).
# The text sketches a row-by-row translation of the validation comments to
# English: language codes in valid["lang"] are replaced by language names, then
# each row is passed through a `Translator` object.
# NOTE(review): `Translator` is not imported anywhere in the visible notebook,
# and the iloc positions 0/1/2 presumably refer to the translated-text, text and
# lang columns -- confirm the column order before re-enabling.
# NOTE(review): the cleaning cell reads valid["translated"] from the
# pre-translated CSV, which presumably makes this loop unnecessary.
"""
%%time 

valid["lang"].replace("tr","turkish",inplace=True)
valid["lang"].replace("es","spanish",inplace=True)
valid["lang"].replace("it","italian",inplace=True)
for i in range(len(valid["lang"])):
    translator= Translator(from_lang=valid.iloc[i,2],to_lang="English")
    valid.iloc[i,0] = translator.translate(valid.iloc[i,1])
    if(i%1000==0):
        print(i)
"""

In [None]:
# Patterns are compiled once at definition time and written as raw strings so
# the regex escapes (\[, \d, \s, ...) are not treated as string escapes.
_NEWLINE_RE = re.compile(r'\n')
# Case-insensitive on purpose: clean() lowercases the text before this pattern
# runs, so a pattern spelled with a capital "User" would never match.
_USER_LINK_RE = re.compile(r"\[\[user.*", re.IGNORECASE)
_IPV4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
_HTTP_PAIR_RE = re.compile(r"\(http://.*?\s\(http://.*\)")


def clean(text):
    """Normalise a pandas Series of comment strings.

    Steps, in order:
      1. missing values are replaced by the literal string "fillna";
      2. the text is lowercased;
      3. newlines become single spaces;
      4. everything from the first "[[user" to the end of the comment is dropped;
      5. dotted-quad number groups (IPv4-looking tokens) are removed;
      6. spans of the form "(http://... (http://...)" are removed.

    Parameters
    ----------
    text : pandas.Series
        Series of comment strings; may contain missing values.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the cleaned strings.
    """
    def _scrub(comment):
        # str() keeps the original behaviour for any non-string leftovers.
        comment = _NEWLINE_RE.sub(' ', str(comment))
        comment = _USER_LINK_RE.sub('', comment)
        comment = _IPV4_RE.sub('', comment)
        return _HTTP_PAIR_RE.sub('', comment)

    return text.fillna("fillna").str.lower().map(_scrub)

# Run the shared cleaner over both frames and store the result under
# `comment_text`. The validation frame is cleaned from its `translated`
# column, the training frame from `comment_text` itself.
for frame, source_col in ((valid, "translated"), (train, "comment_text")):
    frame["comment_text"] = clean(frame[source_col])

# Label vectors taken from the `toxic` column of each frame.
y_valid = valid["toxic"].values
y_train = train["toxic"].values

In [None]:
# Two independent hyperparameters that previously shared one variable:
#   maxlen    -- number of token ids per padded comment sequence
#   num_words -- vocabulary cap handed to the Tokenizer
# Both keep the value 1024, so the produced sequences are unchanged.
maxlen = 1024
num_words = 1024

# The vocabulary is fitted on the training comments only; validation comments
# are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train["comment_text"])

xtrain = pad_sequences(
    tokenizer.texts_to_sequences(train["comment_text"]),
    padding='post',
    maxlen=maxlen,
)
xtest = pad_sequences(
    tokenizer.texts_to_sequences(valid["comment_text"]),
    padding='post',
    maxlen=maxlen,
)

# Size of the id space the Embedding layers must cover. word_index lists every
# word seen during fitting, but texts_to_sequences only emits ids below
# num_words, so sizing by the full word_index would allocate embedding rows
# that no input can ever select.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)

In [None]:
%%time

# Multi-branch 1-D CNN over learned token embeddings, trained as a binary
# classifier with a single sigmoid output.

# The input carries token ids, so it is declared as an integer tensor rather
# than float64; the Embedding layer looks ids up by integer index.
main_input = Input(shape=(maxlen,), dtype='int32')
embedder = Embedding(input_dim=vocab_size,output_dim=50,input_length=maxlen)
embed = embedder(main_input)

# Three parallel Conv1D -> MaxPooling1D branches over the same embeddings.
# Note the variable suffix is NOT the kernel width: cnn2 uses width 3,
# cnn3 uses width 4 and cnn4 uses width 2.
cnn2 = layers.Conv1D(128, 3, padding='same', strides = 1, activation='relu')(embed)
cnn2 = layers.MaxPooling1D(pool_size=3)(cnn2)
cnn3 = layers.Conv1D(128, 4, padding='same', strides = 1, activation='relu')(embed)
cnn3 = layers.MaxPooling1D(pool_size=3)(cnn3)
cnn4 = layers.Conv1D(128, 2, padding='same', strides = 1, activation='relu')(embed)
cnn4 = layers.MaxPooling1D(pool_size=3)(cnn4)

# Join the branches along the channel axis, flatten, and classify.
cnn = layers.concatenate([cnn2,cnn3,cnn4], axis=-1)
flat = layers.Flatten()(cnn)
main_output = Dense(1, activation='sigmoid')(flat)
model = Model(inputs = main_input, outputs = main_output)
model.compile(optimizer="Adam",loss='binary_crossentropy',metrics=[tf.keras.metrics.AUC()])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()
history = model.fit(xtrain, y_train,epochs=5,verbose=1,validation_data=(xtest, y_valid),batch_size=200)


In [None]:
%%time

# Only needed by the disabled SpatialDropout1D/LSTM experiment kept below.
from keras.layers import SpatialDropout1D

# Baseline: token embeddings flattened straight into a single sigmoid unit.
# The input carries token ids, so it is declared as an integer tensor rather
# than float64; the Embedding layer looks ids up by integer index.
main_input = Input(shape=(maxlen,), dtype='int32')
embedder = Embedding(input_dim=vocab_size,output_dim=100,input_length=maxlen)
embed = embedder(main_input)
# Disabled experiment (spatial dropout followed by an LSTM):
#embedding = SpatialDropout1D(0.2)(embed)
#lstm_1 = LSTM(32, return_sequences=True)(embedding)

flat = layers.Flatten()(embed)
#drop = Dropout(0.1)(flat)
main_output = Dense(1, activation='sigmoid')(flat)
# `model` and `history` from the previous cell are rebound here.
model = Model(inputs = main_input, outputs = main_output)
model.compile(optimizer="Adam",loss='binary_crossentropy',metrics=[tf.keras.metrics.AUC()])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()
history = model.fit(xtrain, y_train,epochs=5,verbose=1,validation_data=(xtest, y_valid),batch_size=200)
