In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import rankdata

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPool1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

In [None]:
# --- Preprocessing limits and training hyper-parameters used by the cells below ---
MAX_SEQUENCE_LENGTH = 100   # every comment is padded / truncated to this many token ids
MAX_VOCAB_SIZE = 20_000     # handed to the Keras Tokenizer as num_words
EMBEDDING_DIM = 100         # GloVe vector width; also selects the glove.6B.<dim>d.txt file
VALIDATION_SPLIT = 0.2      # intended hold-out fraction — TODO confirm this is what model.fit receives
BATCH_SIZE = 128            # mini-batch size passed to model.fit
EPOCHS = 10                 # number of epochs passed to model.fit

In [None]:
# Load the pre-trained GloVe word vectors into a {word: float32 vector} dict.
# Each non-empty line of the file is parsed as "<word> <v1> <v2> ... <vN>".
print("Loading word vectors...")
glove_path = '../input/glove6b/glove.6B.%sd.txt' % EMBEDDING_DIM
word2vec = {}
# Decode as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        word2vec[word] = np.asarray(values[1:], dtype='float32')
print('Found %s word vectors.' % len(word2vec))

In [None]:
# Load the Jigsaw toxic-comment classification train / test splits and the
# separately published test labels.
train_data = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
test_data = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
test_label = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")

# In test_labels.csv a value of -1 marks a comment that was not used for
# scoring, i.e. its true labels are unknown. Mapping -1 to 0 would feed those
# comments to the model as "clean" examples, so they are dropped instead —
# from the labels and from the matching test comments.
label_columns = [column for column in test_label.columns if column != 'id']
is_scored = (test_label[label_columns] != -1).all(axis=1)
test_label = test_label[is_scored]
test_data = test_data[test_data['id'].isin(test_label['id'])]

In [None]:
# Attach the labels to the test comments, then stack train + test into one frame.
# Only 'id' and 'comment_text' are fed into the merge, so re-running this cell
# yields the same columns instead of accumulating suffixed (_x / _y) label copies.
test_data = pd.merge(test_data[['id', 'comment_text']], test_label, how='left', on='id')
# ignore_index gives every row a unique label; without it the train rows and
# the test rows would carry overlapping index labels.
total_data = pd.concat([train_data, test_data], ignore_index=True)
total_data.sample(10)

In [None]:
# Collapse the six label columns into a single severity target `y`:
# every label counts once, 'severe_toxic' counts twice, and the per-row total
# is divided by the largest total present so the maximum value is 1.
label_weights = pd.Series({
    'toxic': 1,
    'severe_toxic': 2,
    'obscene': 1,
    'threat': 1,
    'insult': 1,
    'identity_hate': 1,
})
severity = (
    total_data[list(label_weights.index)]
    .mul(label_weights, axis='columns')
    .sum(axis=1)
    .astype(int)
)
# Keep only the comment text (renamed to 'text') and the scaled target.
total_data = (
    total_data
    .assign(y=severity / severity.max())
    .loc[:, ['comment_text', 'y']]
    .rename(columns={'comment_text': 'text'})
)

In [None]:
# Eyeball ten random rows of the (text, y) frame.
total_data.sample(n=10)

In [None]:
# How many comments fall on each distinct severity value.
total_data.loc[:, 'y'].value_counts()

In [None]:
# Balance the data: keep every comment with y > 0 and draw an equally sized
# random subset of the y == 0 comments.
toxic_rows = total_data[total_data['y'] > 0]
clean_rows = total_data[total_data['y'] == 0]
# Never ask for more clean rows than exist (DataFrame.sample would raise).
sample = min(len(toxic_rows), len(clean_rows))
total_data_undersample = clean_rows.sample(n=sample, random_state=101)
# Shuffle the combined rows. Keras takes its validation_split slice from the
# END of the arrays before shuffling, so leaving the frame ordered
# "toxic first, clean last" would produce a validation set made only of
# y == 0 comments.
comment_df = (
    pd.concat([toxic_rows, total_data_undersample])
    .sample(frac=1, random_state=101)
    .reset_index(drop=True)
)
comment_df

In [None]:
# Model input (raw comment text) and regression target (severity score).
sentences, target = comment_df.loc[:, 'text'], comment_df.loc[:, 'y']

In [None]:
# Fit a word-level tokenizer on the training comments, then turn every comment
# into a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts=sentences)
sequences = tokenizer.texts_to_sequences(texts=sentences)

In [None]:
# word -> integer id mapping learned by the tokenizer
word2idx = tokenizer.word_index
# pad / truncate every sequence to MAX_SEQUENCE_LENGTH, giving an N x T matrix
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. Rows stay all-zero for ids without a GloVe entry.
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) +1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, idx in word2idx.items():
    if idx >= MAX_VOCAB_SIZE:
        # id falls outside the vocabulary cap
        continue
    vector = word2vec.get(token)
    if vector is None:
        # no pre-trained vector for this word: leave its row as zeros
        continue
    embedding_matrix[idx] = vector

In [None]:
# Embedding layer initialised from the pre-trained matrix.
# trainable=False keeps the pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
# Network: embeddings -> 1D convolution -> global max pool -> dense -> dropout
# -> single linear output unit (the predicted severity score).
print('Building model...')
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedded = embedding_layer(input_)
conv_features = Conv1D(128, 3, activation='relu')(embedded)
pooled = GlobalMaxPool1D()(conv_features)
hidden = Dense(64, activation='relu')(pooled)
regularised = Dropout(0.1)(hidden)
output = Dense(1)(regularised)

In [None]:
# Wrap the graph in a Model and train it as a regression (mean squared error)
# with the Adam optimizer.
model = Model(inputs=input_, outputs=output)
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train the network and keep the Keras History object in `r`.
# Keras takes the validation slice from the END of the arrays before any
# shuffling, so the row order of `data` / `target` decides what is held out.
r = model.fit(
    data,
    target,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    # use the configured hold-out fraction rather than a separate hard-coded literal
    validation_split=VALIDATION_SPLIT,
    shuffle=True,
)

In [None]:
# Validation pairs from the severity-rating competition; the 'less_toxic' and
# 'more_toxic' text columns are scored in the following cells.
comment_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
comment_val.sample(n=10)

In [None]:
# Encode both sides of every validation pair with the tokenizer fitted on the
# training comments, then pad / truncate to the model's input length.
less_toxic_sequences, more_toxic_sequences = (
    tokenizer.texts_to_sequences(comment_val[column])
    for column in ('less_toxic', 'more_toxic')
)
less_toxic_data, more_toxic_data = (
    pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)
    for token_ids in (less_toxic_sequences, more_toxic_sequences)
)

In [None]:
# Predict a severity score for each side of every pair; predictions are
# flattened to 1-D before being stored as columns.
for score_column, padded in (
    ('less_toxic_score', less_toxic_data),
    ('more_toxic_score', more_toxic_data),
):
    comment_val[score_column] = model.predict(padded).flatten()
comment_val.sample(n=10)

In [None]:
# Comments that have to be ranked for the submission file.
submission_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
submission_df.sample(n=10)

In [None]:
# Same encoding as for training: tokenizer ids, then pad / truncate to the
# model's input length.
submission_sequences = tokenizer.texts_to_sequences(submission_df.loc[:, 'text'])
submission_data = pad_sequences(sequences=submission_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Predict severity for every comment and store its ordinal rank as the
# submission score.
score = model.predict(submission_data)
submission_df = submission_df.assign(score=rankdata(score, method='ordinal'))
submission_df.sample(n=10)

In [None]:
# Write the (comment_id, score) pairs to the submission file, without the index column.
submission_df.loc[:, ['comment_id', 'score']].to_csv("submission.csv", index=False)