In [None]:
import pandas as pd
import numpy as np
import re
import csv
import tensorflow as tf
import nltk
import gc
from gensim.models import Word2Vec
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split

In [None]:
# Load the embedding matrix from W.csv. The first CSV column is moved into the
# index, so it is excluded from the numeric matrix; the remaining columns are
# cast to float32. Presumably each row is the vector for one vocabulary id,
# since W is later indexed by the ids in input_x -- confirm against the script
# that wrote W.csv.
W = np.array(
    pd.read_csv('W.csv').pipe(lambda frame: frame.set_index(frame.columns[0]))
).astype(np.float32, copy=False)

In [None]:
# Read the feature and label splits from CSV.
# x-frames are later fed to input_x, so they are assumed to hold integer token
# ids, one row per example -- TODO confirm against the preprocessing step.
train_x = pd.read_csv('train_x.csv')
val_x = pd.read_csv('val_x.csv')
test_x, ytrain, yval = (
    pd.read_csv(csv_path, sep = ',')
    for csv_path in ('./test_x.csv', './ytrain.csv', './yval.csv')
)

# Placeholders and CNN construction

In [None]:
# --- Input geometry ---------------------------------------------------------
sequence_length = 150          # number of token ids per example (second dim of input_x)
embedding_size = 300           # width of every convolution filter

# --- Convolution branches ---------------------------------------------------
filter_sizes = [1, 2, 3, 4, 5]  # filter heights, one conv/pool branch per entry
num_filters = 32                # output channels per branch
# Width of the concatenated pooled feature vector produced by all branches.
num_filters_total = len(filter_sizes) * num_filters

# --- Training ---------------------------------------------------------------
# This large batch size is specific to this dataset; 64-128 is more usual.
batch_size = 256
num_epochs = 3                 # depends on your choice
dropout_keep_prob = 0.9        # probability of keeping a unit in the dropout layer

In [None]:
# Display the width of the concatenated pooled feature vector
# (num_filters * len(filter_sizes)).
num_filters_total

In [None]:
# Graph inputs. The leading None dimension lets any batch size be fed.
# input_x: integer ids, sequence_length per example (looked up in W below).
# input_y: six float targets per example, one per label column.
input_x = tf.placeholder(
    dtype = tf.int32,
    shape = [None, sequence_length],
    name = "input_x",
)
input_y = tf.placeholder(
    dtype = tf.float32,
    shape = [None, 6],
    name = "input_y",
)

In [None]:
# Replace every id in input_x by its row of W. W is a plain numpy array here,
# not a tf.Variable, so the optimizer does not update the embeddings.
embedded_chars = tf.nn.embedding_lookup(params = W, ids = input_x)
# Append a trailing channel axis of size 1, as conv2d expects 4-D input.
embedded_chars_expanded = tf.expand_dims(embedded_chars, axis = -1)

In [None]:
def CNN(data):
    """Build one conv -> ReLU -> max-pool branch per filter size and join them.

    Reads the module-level settings ``filter_sizes``, ``embedding_size``,
    ``num_filters``, ``sequence_length`` and ``num_filters_total``.

    Args:
        data: 4-D tensor passed to ``tf.nn.conv2d``. Assumed to be shaped
            [batch, sequence_length, embedding_size, 1] so that each filter
            spans the full embedding width -- TODO confirm W has
            ``embedding_size`` columns.

    Returns:
        Tensor reshaped to [-1, num_filters_total]: the pooled outputs of all
        branches concatenated along the channel axis.
    """

    def conv_relu_pool(height):
        # Filter covers `height` consecutive positions and the whole embedding width.
        kernel = tf.Variable(
            tf.truncated_normal([height, embedding_size, 1, num_filters], stddev = 0.05),
            name = "w",
        )
        bias = tf.Variable(tf.truncated_normal([num_filters], stddev = 0.05), name = "b")

        feature_map = tf.nn.conv2d(
            data, kernel, strides = [1,1,1,1], padding = "VALID", name = "conv"
        )
        activated = tf.nn.relu(tf.nn.bias_add(feature_map, bias), name = "relu")

        # Pool window is as tall as the VALID convolution output along the
        # sequence axis, i.e. max-over-time pooling.
        return tf.nn.max_pool(
            activated,
            ksize = [1, sequence_length - height + 1, 1, 1],
            strides = [1,1,1,1],
            padding = "VALID",
            name = "pool",
        )

    branch_outputs = [conv_relu_pool(height) for height in filter_sizes]

    # Join the branches on the channel axis (3), then flatten per example.
    return tf.reshape(tf.concat(branch_outputs, 3), [-1, num_filters_total])

In [None]:
# Run the embedded input through the convolution/pooling branches; the result
# has num_filters_total features per example.
h_pool_flat = CNN(embedded_chars_expanded)

In [None]:
# Dropout on the pooled features.
# NOTE(review): dropout_keep_prob is a fixed Python constant (0.9), so dropout
# is applied every time h_drop is evaluated -- including test-time prediction --
# unless h_drop itself is overridden through feed_dict.
h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

In [None]:
# First dense layer: halves the number of features, followed by ReLU.
hidden_units = int(num_filters_total/2)
wd1 = tf.Variable(
    tf.truncated_normal([num_filters_total, hidden_units], stddev=0.05),
    name = "wd1",
)
bd1 = tf.Variable(tf.truncated_normal([hidden_units], stddev = 0.05), name = "bd1")
# xw_plus_b computes matmul(h_drop, wd1) + bd1.
layer1 = tf.nn.relu(tf.nn.xw_plus_b(h_drop, wd1, bd1, name = 'layer1'))

In [None]:
# Second dense layer: maps the hidden features to 6 logits, one per label.
layer1_width = int(num_filters_total/2)
wd2 = tf.Variable(
    tf.truncated_normal([layer1_width, 6], stddev = 0.05),
    name = 'wd2',
)
bd2 = tf.Variable(tf.truncated_normal([6], stddev = 0.05), name = "bd2")
layer2 = tf.nn.xw_plus_b(layer1, wd2, bd2, name = 'layer2')
# Sigmoid squashes each logit into (0, 1) independently, one score per label.
# No clipping is needed here because the loss is computed from the logits
# (layer2); a hand-written log(x) loss on `prediction` would need it.
prediction = tf.nn.sigmoid(layer2)

In [None]:
# Mean element-wise sigmoid cross-entropy, computed from the raw logits.
per_label_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels = input_y, logits = layer2)
loss = tf.reduce_mean(per_label_loss)

# Learning rates are usually small for CNNs compared with plain dense networks.
# Pick an appropriate learning rate before running on the whole dataset.
optimizer = tf.train.AdamOptimizer(learning_rate = 0.0005).minimize(loss)

# Accuracy is the fraction of individual (example, label) cells whose rounded
# prediction equals the target; it is not an exact-match-per-example score.
label_hits = tf.equal(tf.round(prediction), input_y)
accuracy = tf.reduce_mean(tf.cast(label_hits, tf.float32))

Blocks and Batches

In [None]:
# Batch generation for training.
def generate_batch(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive slices of ``data`` for ``num_epochs`` passes.

    Args:
        data: Sequence of examples; converted with ``np.array`` and sliced
            along its first axis.
        batch_size: Maximum number of examples per yielded batch. The last
            batch of an epoch may be smaller.
        num_epochs: Number of full passes over ``data``.
        shuffle: When true, draw a fresh random permutation of the examples at
            the start of every epoch (uses numpy's global random state).

    Yields:
        numpy arrays holding at most ``batch_size`` examples each. Nothing is
        yielded when ``data`` is empty.
    """
    data = np.array(data)
    data_size = len(data)
    for _ in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data
        # Stepping by batch_size produces ceil(data_size / batch_size) batches
        # and none at all for empty input; slicing clamps the final batch.
        for start_index in range(0, data_size, batch_size):
            yield epoch_data[start_index:start_index + batch_size]

In [None]:
# Ordered, single-pass chunking for the test data (generate_batch with
# shuffle=False and one epoch would give the same slices, without the prints).
def blocks(data, block_size):
    """Yield ``data`` in order, ``block_size`` rows at a time, printing progress.

    Prints "prediction start!" before the first block and, for every block,
    the index one past its last row.

    Args:
        data: Sequence of rows; converted with ``np.array`` and sliced along
            its first axis.
        block_size: Maximum number of rows per block. The last block may be
            smaller.

    Yields:
        numpy arrays of at most ``block_size`` rows. Nothing is yielded (and
        nothing is printed) when ``data`` is empty.
    """
    data = np.array(data)
    data_size = len(data)
    for start_index in range(0, data_size, block_size):
        if start_index == 0:
            print("prediction start!")
        end_index = min(start_index + block_size, data_size)
        print(end_index)
        yield data[start_index:end_index]

Training and evaluating the model

In [None]:
# For the final model that predicts the test data, train on the training and
# validation splits together. pd.concat replaces DataFrame.append, which is
# deprecated and was removed in pandas 2.0; the stacked rows and the
# (non-reset) index are the same as append produced.
x_train = pd.concat([pd.DataFrame(train_x), pd.DataFrame(val_x)])
y_train = pd.concat([pd.DataFrame(ytrain), pd.DataFrame(yval)])

In [None]:
# Rows here should equal the train rows plus the validation rows.
x_train.shape

In [None]:
# The row count must match x_train, since features and labels are zipped row by row.
y_train.shape

In [None]:
# One batch generator per training epoch. Each generator makes a single
# shuffled pass over the data, so every epoch sees a different ordering, which
# reduces the risk that a batch contains only all-zero labels.
#
# The (features, six labels) records are zipped once and shared by all
# generators instead of being rebuilt for each one; generate_batch copies the
# data into its own array and never mutates the list it is given.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_records = list(zip(np.array(x_train), *(y_train[column] for column in label_columns)))


def make_epoch_batches(size):
    """Return a generator for one shuffled pass over train_records in batches of `size`."""
    return generate_batch(train_records, size, 1)


# NOTE(review): the first epoch uses batches of 1000 rather than batch_size --
# kept as in the original; confirm this is intentional.
batch1 = make_epoch_batches(1000)
batch2 = make_epoch_batches(batch_size)
batch3 = make_epoch_batches(batch_size)
batch4 = make_epoch_batches(batch_size)
batch5 = make_epoch_batches(batch_size)
batch6 = make_epoch_batches(batch_size)
batch7 = make_epoch_batches(batch_size)

In [None]:
# Shape of the training split alone (x_train additionally contains the validation rows).
train_x.shape

In [None]:
# Lazily chunk the test rows, in their original order, 1000 rows per block.
# This is a generator, so it can be consumed only once.
test_rows = list(np.array(test_x))
test_blocks = blocks(data = test_rows, block_size = 1000)

In [None]:
# Generators consumed by the training loop, one per epoch; the length of this
# list is therefore the number of epochs actually trained. Append batch4..batch7
# to train longer. Each generator is exhausted after one pass, so the batch
# cell must be re-run before training again.
batch_bag = [batch1,batch2,batch3]

In [None]:
init_op = tf.global_variables_initializer()

with tf.Session() as sess:

    sess.run(init_op)

    # ---- Training: every generator in batch_bag is one epoch ----
    for epoch, batches in enumerate(batch_bag, start=1):
        print('Epoch: ' + str(epoch) + ' start!')
        total_loss = 0.0
        total_acc = 0.0
        num_batches = 0
        for batch in batches:
            # Column 'a' holds the feature row; the remaining six columns hold
            # the labels in the order they were zipped (the letters are only
            # placeholder names).
            batch = pd.DataFrame(batch, columns = ['a','b','c','d','e','g','f'])
            x_batch = pd.DataFrame(list(batch['a']))
            y_batch = batch.loc[:, batch.columns != 'a']
            _, batch_loss, batch_acc = sess.run(
                [optimizer, loss, accuracy],
                feed_dict = {input_x: x_batch, input_y: y_batch},
            )
            total_loss += batch_loss
            total_acc += batch_acc
            num_batches += 1
        # Average over the batches actually processed. A fixed divisor is wrong
        # as soon as the dataset size or the batch size changes (the epochs in
        # batch_bag do not all use the same batch size).
        avg_loss = total_loss / max(num_batches, 1)
        avg_acc = total_acc / max(num_batches, 1)
        print('Epoch:' + str(epoch) + ' loss is ' + str(avg_loss) + ', accuracy is ' + str(avg_acc))

    # ---- Prediction on the test blocks ----
    block_predictions = []
    for block in test_blocks:
        block = pd.DataFrame(block)
        # Dropout is built into the graph with a constant keep probability, so
        # evaluating `prediction` directly would randomly drop features at
        # inference time. Compute the pooled features first, then feed them in
        # place of the dropout output so prediction is deterministic.
        pooled_features = sess.run(h_pool_flat, feed_dict = {input_x: block})
        pred = sess.run(prediction, feed_dict = {h_drop: pooled_features})
        block_predictions.append(pd.DataFrame(pred))

    # Concatenate once instead of growing a DataFrame inside the loop
    # (quadratic copying; DataFrame.append was removed in pandas 2.0).
    if block_predictions:
        df = pd.concat(block_predictions, ignore_index=True)
    else:
        df = pd.DataFrame()

In [None]:
# Read the sample submission file; its label columns are overwritten with the
# model's predictions in the next cell.
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Overwrite the six label columns with the predicted probabilities. The
# assignment is positional, so df must have one row per submission row, in the
# same order as the test file.
target_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
predicted_scores = np.array(df)
submission[target_columns] = predicted_scores
submission.to_csv('submission.csv', index=False)

In [None]:
# Number of test rows predicted positive for each label (scores rounded to 0 or 1).
df.round().sum()

In [None]:
# Fraction of test rows predicted positive for each label, taken from the
# submission frame (every column except 'id'). The frame is named `submission`;
# the previously used name `sub` is never defined in this notebook.
submission.loc[:, submission.columns != 'id'].round().mean()
#Results from keras epoch = 3. Accuracy =  98.20%