In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import re
import nltk

# Download each NLTK resource listed below, one call per resource.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import stopwords

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    representable range contains the column's minimum and maximum (bounds are
    inclusive).  Float columns are converted to float16 or float32 when their
    minimum and maximum fit, otherwise to float64.  Columns whose dtype is not
    in the ``numerics`` list are left untouched.

    Note: the float decision looks only at the value range.  float16/float32
    carry fewer significant digits, so in-range values can still be rounded.

    Args:
        df: pandas DataFrame to shrink; its columns are reassigned in place.
        verbose: when True, print the resulting memory footprint in MB and the
            percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest integer type first; inclusive bounds so that a column
            # sitting exactly on a type's limits still gets that type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when neither narrower type covers the range
            # (this includes NaN/inf extremes, for which the comparisons fail).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Ask TensorFlow which GPU device it can see.
device_name = tf.test.gpu_device_name()

# Stop the notebook right away unless the first GPU is the reported device.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Read every competition CSV with the same parser settings and downcast its
# numeric columns straight away to keep memory usage low.
read_options = dict(header=0, sep=',', quotechar='"')
data_1 = reduce_mem_usage(pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv', **read_options))
data_2 = reduce_mem_usage(pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv', **read_options))
data_3 = reduce_mem_usage(pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/validation.csv', **read_options))
submission = reduce_mem_usage(pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv', **read_options))
test = reduce_mem_usage(pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/test.csv', **read_options))

In [None]:
# Stack the text/label columns of the three labelled sources into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source keeps its own row labels, so the same labels repeat in the result.
data = pd.concat(
    [frame[['comment_text', 'toxic']] for frame in (data_1, data_2, data_3)],
    axis=0,
    ignore_index=True,
)

In [None]:
# Drop the references to the three source frames; only `data` is needed from here on.
data_1 = data_2 = data_3 = None

In [None]:
# Split the combined frame into its text column and its label column.
training_sentences, training_labels = data['comment_text'], data['toxic']

In [None]:
# The test frame keeps its text in the 'content' column.
testing_sentences = test['content']

In [None]:
# Rebind both frame names to None so the frames themselves can be reclaimed.
data = test = None

In [None]:
# Remove the two names from the namespace altogether.
del data, test

In [None]:
import gc
# Force a garbage-collection pass now that the frame names have been deleted.
gc.collect()

In [None]:
def remove_between_square_brackets(text):
    """Return ``text`` with every ``[...]`` span removed, brackets included.

    A span runs from a ``[`` to the nearest following ``]``.  A ``[`` that is
    never closed is left as is.

    Args:
        text: the string to clean.

    Returns:
        A new string without the bracketed spans.
    """
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences, which current Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
    
# Strip bracketed spans from both the training and the test texts.
training_sentences, testing_sentences = (
    sentences.apply(remove_between_square_brackets)
    for sentences in (training_sentences, testing_sentences)
)

In [None]:
def remove_escape_sequence(text):
  """Replace every newline, carriage return and tab in ``text`` with a space.

  A space is used rather than the empty string so that words separated only
  by a line break or a tab do not get fused into a single token.

  Args:
    text: the string to clean.

  Returns:
    A new string in which each of those characters is a single space.
  """
  return re.sub(r'[\n\r\t]', ' ', text)

# Run the newline/tab cleanup over both text series.
cleaned_train = training_sentences.apply(remove_escape_sequence)
cleaned_test = testing_sentences.apply(remove_escape_sequence)
training_sentences, testing_sentences = cleaned_train, cleaned_test

In [None]:
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

# Drop non-ASCII characters from the training and the test texts alike.
training_sentences, testing_sentences = (
    sentences.apply(_removeNonAscii)
    for sentences in (training_sentences, testing_sentences)
)

In [None]:
def _lower(s): return "".join(i.lower() for i in s)

# Lower-case both text series.
lowered_train = training_sentences.apply(_lower)
lowered_test = testing_sentences.apply(_lower)
training_sentences, testing_sentences = lowered_train, lowered_test

In [None]:
from nltk.corpus import stopwords

# Stopword list read from the 'english', 'turkish' and 'spanish' word lists.
stop = stopwords.words(['english','turkish','spanish'])
# Set copy of the list: membership is tested once per token of every comment,
# and a set lookup does not scan the whole stopword list each time.
stop_lookup = frozenset(stop)


def _drop_stopwords(text):
    """Return ``text`` without its stopword tokens.

    The text is split on whitespace, tokens present in ``stop_lookup`` are
    dropped, and the remaining tokens are joined back with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_lookup)


# Filter stopwords out of every comment with pandas.Series.apply.
training_sentences = training_sentences.apply(_drop_stopwords)
testing_sentences = testing_sentences.apply(_drop_stopwords)

In [None]:
# Settings shared by the tokenizer, the padding step and the embedding layer.
vocab_size = 500000  # passed as Tokenizer num_words and as the Embedding layer's first argument
embediing_dim = 256  # passed as the Embedding layer's second argument; NOTE(review): name is misspelled but later cells reference it
max_length = 128  # maxlen for pad_sequences and input_length for the Embedding layer
trunc_type = 'pre'  # value of pad_sequences' `truncating` argument
padding_type = 'pre'  # value of pad_sequences' `padding` argument
oov_token = '<OOV>'  # value of the Tokenizer's oov_token argument

In [None]:
# Binarise the labels: every non-zero value becomes 1, zeros stay 0, and the
# result is stored as a NumPy integer array.
training_labels = np.array((training_labels != 0.0).astype(int))

In [None]:
# Word-level tokenizer; its vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_token,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Convert both text sets to integer sequences with the fitted tokenizer.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad/truncate both sets with identical settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [None]:
# Only the padded arrays are needed from here on; drop the text and sequence objects.
training_sentences = testing_sentences = training_sequences = testing_sequences = None

In [None]:
# Remove the four names from the namespace, then run a garbage-collection pass.
del training_sentences, testing_sentences, training_sequences, testing_sequences
gc.collect()

In [None]:
# Initiate model
model = tf.keras.Sequential()
# Add Embedding layer
model.add(tf.keras.layers.Embedding(vocab_size, embediing_dim, input_length = max_length, trainable=True))
# Add Convolutional layer
model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(3))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.BatchNormalization())
# Add fully connected layers
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# Show the summary of the compiled model.
model.summary()

In [None]:
# Train for a fixed number of epochs with the work placed on the first GPU.
num_epochs = 10
with tf.device('/device:GPU:0'):
  history = model.fit(
      training_padded,
      training_labels,
      batch_size=5120,
      epochs=num_epochs,
  )

In [None]:
# Run the model over the padded test sequences.
test_pred = model.predict(testing_padded)

In [None]:
# Store the predictions in the submission frame's 'toxic' column.
submission['toxic'] = test_pred

In [None]:
# Write the submission frame to submission.csv, leaving out the index column.
submission.to_csv('submission.csv', index=False)