In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the multi-label training set and report its (rows, columns) shape.
df = pd.read_csv('../input/classification-data/train.csv')
print(df.shape)

# Collapse the six label columns into one summed score per comment.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_sum = df[label_columns].sum(axis=1)
df['y'] = label_sum.astype(int)

# Keep only the comment body (renamed to `text`) and the summed score.
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(5)

In [None]:
# Binary target: 1 when the summed label score `y` is non-zero, otherwise 0.
# Computed column-wise rather than with a Python-level loop over every row,
# which also avoids leaking a loop variable into the notebook namespace.
toxic = (df['y'] != 0).astype(int)

print(len(df))
df['toxic'] = toxic

In [None]:
# How many comments fall under each summed label score.
label_counts = df['y'].value_counts()
label_counts

In [None]:
# Get rid of punctuation: lower-case the comments and turn every character
# that is not a-z into a space.
import re

# Missing comments are replaced with '' *before* the string conversion.
# Converting NaN to the literal 'nan' and then deleting every 'nan' substring
# would also mangle real words such as "finance" or "pregnant".
df['text'] = (
    df['text']
    .fillna('')
    .astype(str)
    .str.lower()
    .map(lambda text: re.sub("[^a-z]", " ", text))
)
df

In [None]:
from nltk import word_tokenize
# Tokenize: split every cleaned comment into a list of word tokens.
tokens = df['text'].map(word_tokenize).tolist()
df['tokens'] = tokens

In [None]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
# Train a word2vec model on all of the data.
def word2vec(sentences):
    """Fit a gensim Word2Vec model on tokenised sentences.

    Args:
        sentences: iterable of token lists, one per document.

    Returns:
        The trained Word2Vec model; vectors are reachable through ``.wv``.
    """
    w2v_model = Word2Vec(sentences=sentences, min_count=1, window=5)
    # replace=True: per the gensim docs this keeps only the normalised
    # vectors, so the model should not be trained further afterwards.
    w2v_model.init_sims(replace=True)
    return w2v_model
# Fit the embeddings on the token lists of every comment currently in df.
token_corpus = df['tokens']
model = word2vec(token_corpus)

In [None]:
# Similarity sanity check: nearest neighbours of a profanity in the embedding.
query_words = ["fuck"]
model.wv.most_similar(positive=query_words)

In [None]:
# Nearest neighbours of a hate-related term.
query_words = ["nazi"]
model.wv.most_similar(positive=query_words)

In [None]:
# Nearest neighbours of a neutral word, for comparison.
query_words = ["comment"]
model.wv.most_similar(positive=query_words)

In [None]:
# Number of comments with a non-zero summed label score.
int((df['y'] != 0).sum())

In [None]:
# Number of comments with a summed label score of zero.
int((df['y'] == 0).sum())

In [None]:
# Undersample the y == 0 rows so they do not swamp the y != 0 rows.
non_toxic = df[df['y'] == 0]
# Fixed seed keeps the sample reproducible across runs; min() guards
# against asking for more rows than exist.
df_zero = non_toxic.sample(n=min(16000, len(non_toxic)), random_state=42)

# drop=True discards the old row labels instead of adding an `index` column
# that df_zero lacks (it would become NaN for those rows once concatenated).
df = df[df['y'] != 0].reset_index(drop=True)

In [None]:
# Stack the y != 0 rows and the sampled y == 0 rows, then check the balance.
balanced_parts = [df, df_zero]
df = pd.concat(balanced_parts)
df['y'].value_counts()

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows so the two groups are interleaved (seeded for
# reproducibility). drop=True renumbers the rows without inserting the old
# labels as an extra column, which also keeps this cell safe to re-run.
df = shuffle(df, random_state=42).reset_index(drop=True)
df

In [None]:
import pickle

from keras.preprocessing.text import Tokenizer

# Corpus statistics: flat word list, per-comment lengths and sorted vocabulary.
all_training_words = [word for tokens in df["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in df["tokens"]]
vocab = sorted(set(all_training_words))

# Keras only keeps word indices strictly below `num_words`, and indices start
# at 1, so the limit must be len(vocab) + 1 or the rarest word is silently
# dropped from every sequence.
tokenizer = Tokenizer(num_words=len(vocab) + 1, lower=True, char_level=False)
token_lists = df["tokens"].tolist()
tokenizer.fit_on_texts(token_lists)
training_sequences = tokenizer.texts_to_sequences(token_lists)

# Persist the fitted tokenizer so the same word -> index mapping can be reused.
with open('tokenizer.pk', 'wb') as fout:
    pickle.dump(tokenizer, fout)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))
print("Max sentence length is %s" % max(training_sentence_lengths))

In [None]:
import matplotlib.pyplot as plt
# Distribution of comment lengths; log-scaled counts keep the long tail visible.
fig, ax = plt.subplots()
ax.hist(training_sentence_lengths, bins=50)
ax.set_yscale('log')
ax.set_xlabel('Length of comment in words')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Bring every index sequence to a fixed length of 50 (padded or truncated).
train_rnn_data = pad_sequences(sequences=training_sequences, maxlen=50)

In [None]:
# One row per tokenizer index; the extra row accounts for index 0, which the
# loop never touches and therefore stays all-zero.
embedding_dim = 100
train_embedding_weights = np.zeros((len(train_word_index) + 1, embedding_dim))
for word, index in train_word_index.items():
    try:
        train_embedding_weights[index, :] = model.wv[word]
    except KeyError:
        # Only a genuinely missing word gets a random vector. Any other
        # failure (e.g. `model` no longer being the Word2Vec model when the
        # cell is re-run out of order) must surface instead of silently
        # filling the whole matrix with noise.
        train_embedding_weights[index, :] = np.random.rand(embedding_dim)
print(train_embedding_weights.shape)

In [None]:
def recurrent_nn(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build and compile an LSTM classifier on top of fixed embeddings.

    Args:
        embeddings: weight matrix used to initialise the embedding layer.
        max_sequence_length: length of each input index sequence.
        num_words: number of rows of the embedding layer (its input_dim).
        embedding_dim: width of each embedding vector.
        labels_index: number of sigmoid output units.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    # The embedding weights are supplied by the caller and are not trained.
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    inputs = Input(shape=(max_sequence_length,), dtype='int32')
    hidden = LSTM(128)(frozen_embedding(inputs))

    # Two identical fully connected layers, then light dropout.
    for _ in range(2):
        hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.1)(hidden)

    outputs = Dense(labels_index, activation='sigmoid')(hidden)

    network = Model(inputs, outputs)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    network.summary()
    return network
print('done')

In [None]:
# LSTM is imported from the public `keras.layers` namespace; the private
# `keras.layers.recurrent` module path does not exist in newer Keras releases.
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Embedding, LSTM
from keras.models import Model

# Let's classify as toxic or not.
y_train = df['toxic'].values
x_train = train_rnn_data

# NOTE: this rebinds `model`, which until now held the Word2Vec model; cells
# that read `model.wv` must have run before this one.
model = recurrent_nn(train_embedding_weights, 50, len(train_word_index)+1, 100,
                    1)
print('training model....')

num_epochs = 8
batch_size = 64
# The 10% validation split is held out by Keras during fit.
hist = model.fit(x_train, y_train, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

In [None]:
# Validation pairs; later cells read its `less_toxic` / `more_toxic` columns.
validation_path = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
df_val = pd.read_csv(validation_path)
df_val

In [None]:
import re
def correct(col):
    """Clean one text column of the global ``df_val`` in place.

    The text is lower-cased and every character outside a-z becomes a space.
    Missing values become empty strings.

    Args:
        col: name of the ``df_val`` column to clean.

    Returns:
        The cleaned column as a list of strings.
    """
    # Fill missing values *before* converting to str. Turning NaN into the
    # literal 'nan' and then deleting every 'nan' substring would also mangle
    # real words such as "finance" or "nanny".
    cleaned = (
        df_val[col]
        .fillna('')
        .astype(str)
        .str.lower()
        .str.replace(r'[^a-z]', ' ', regex=True)
    )
    df_val[col] = cleaned
    return list(cleaned)

In [None]:
# Clean both sides of each validation pair, then tokenize them.
for column in ('less_toxic', 'more_toxic'):
    df_val[column] = correct(column)

df_val['less_tokens'] = df_val['less_toxic'].map(word_tokenize).tolist()
df_val['more_tokens'] = df_val['more_toxic'].map(word_tokenize).tolist()
df_val

In [None]:
# Score the `less_toxic` side: encode with the fitted tokenizer, pad to 50,
# then predict.
train_word_index = tokenizer.word_index
less_token_lists = df_val["less_tokens"].tolist()
training_sequences = tokenizer.texts_to_sequences(less_token_lists)
test_less = pad_sequences(sequences=training_sequences, maxlen=50)
less_pred = model.predict(test_less, verbose=1, batch_size=64)

In [None]:
# Score the `more_toxic` side: encode with the fitted tokenizer, pad to 50,
# then predict.
train_word_index = tokenizer.word_index
more_token_lists = df_val["more_tokens"].tolist()
training_sequences = tokenizer.texts_to_sequences(more_token_lists)
test_more = pad_sequences(sequences=training_sequences, maxlen=50)
more_pred = model.predict(test_more, verbose=1, batch_size=64)

In [None]:
# Store the model outputs next to the comments they were computed from.
# NOTE(review): the predictions presumably have shape (n_rows, 1) because the
# network is built with a single output unit; this relies on pandas accepting
# a one-column 2-D array for a single column. Confirm on the pandas version in use.
df_val['less_pred'] = less_pred
df_val['more_pred'] = more_pred
df_val

In [None]:
# Share of pairs where the `more_toxic` side scores higher than the
# `less_toxic` side; ties are left out of both numerator and denominator.
more_wins = int((df_val['more_pred'] > df_val['less_pred']).sum())
less_wins = int((df_val['more_pred'] < df_val['less_pred']).sum())
more_wins / (less_wins + more_wins)

In [None]:
# Comments to be scored for the submission file.
to_score_path = '../input/jigsaw-toxic-severity-rating/comments_to_score.csv'
to_score = pd.read_csv(to_score_path)
to_score

In [None]:
# Lower-case the comments and turn every character outside a-z into a space.
# Missing comments are filled with '' *before* the string conversion:
# turning NaN into the literal 'nan' and then deleting every 'nan' substring
# would also mangle real words such as "finance".
to_score['text'] = (
    to_score['text']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-z]', ' ', regex=True)
)


In [None]:
# Tokenize, encode with the fitted tokenizer, pad to 50 and predict.
to_score['tokens'] = to_score['text'].map(word_tokenize).tolist()
train_word_index = tokenizer.word_index
score_token_lists = to_score["tokens"].tolist()
training_sequences = tokenizer.texts_to_sequences(score_token_lists)
test_seq = pad_sequences(sequences=training_sequences, maxlen=50)
pred = model.predict(test_seq, verbose=1, batch_size=512)

In [None]:
# Attach the model output as the `score` column written to the submission.
# NOTE(review): `pred` presumably has shape (n_rows, 1) because the network
# has a single output unit; this relies on pandas accepting a one-column
# 2-D array for a single column. Confirm on the pandas version in use.
to_score['score'] = pred
to_score

In [None]:
# Write only the id and score columns, without the row index.
submission_columns = ['comment_id', 'score']
to_score[submission_columns].to_csv('submission.csv', index=False)