In [1]:
import datetime
import os
import pandas as pd
import numpy as np
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats
import config
import final_metric

from sklearn import metrics
from sklearn import model_selection
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.models import Model
from keras.models import load_model

In [2]:
# --- Notebook-wide configuration -------------------------------------------
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
MAX_NUM_WORDS = 10000
# DataFrame column holding the comment text that is tokenized and fed to the model.
TEXT_COLUMN = 'comment_preprocessed'
# Pretrained word-vector file; train_model reads it line by line as
# "<word> <coef> <coef> ...".
EMBEDDINGS_PATH = 'data/glove.6B.100d.txt'
# Width of each embedding row; must equal the number of coefficients per word
# in EMBEDDINGS_PATH, since those vectors are copied into the embedding matrix.
EMBEDDINGS_DIMENSION = 100
# Rate passed to the Dropout layer placed before the dense head.
DROPOUT_RATE = 0.3
# Learning rate passed to the RMSprop optimizer.
LEARNING_RATE = 0.00005
# `epochs` and `batch_size` arguments of model.fit.
NUM_EPOCHS = 10
BATCH_SIZE = 128

In [3]:
# Load the preprocessed comments, shuffle them with a fixed seed so the
# subsample is reproducible, and keep the first config.NUM_SAMPLES rows.
# The cell always starts from the CSV, so re-running it is idempotent.
train = (
    pd.read_csv('data/train_comment_preprocessed.csv')
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
    .head(config.NUM_SAMPLES)
    .assign(**{
        # Force the text column to str so missing comments (NaN floats) do
        # not reach the tokenizer as non-string values.
        TEXT_COLUMN: lambda df: df[TEXT_COLUMN].astype(str),
        # Binarise the score with a vectorised comparison instead of a
        # per-row Python lambda: strictly greater than 0.5 -> 1, else 0
        # (NaN compares False and therefore maps to 0, as before).
        # NOTE(review): a score of exactly 0.5 maps to 0 here; confirm this
        # matches the threshold the evaluation in final_metric expects.
        'target': lambda df: (df['target'] > 0.5).astype('int64'),
    })
)

In [4]:
# Hold out 20% of the rows for validation. The split is seeded (same seed as
# the shuffle above) so that Restart & Run All reproduces the same partition;
# without it every run trains and evaluates on different rows.
train_df, validate_df = model_selection.train_test_split(
    train, test_size=0.2, random_state=7)

In [5]:
# Fixed sequence length: every comment is brought to exactly this many token
# ids so the model input is rectangular.
MAX_SEQUENCE_LENGTH = 250


def pad_text(texts, tokenizer):
    """Turn raw texts into integer sequences of length MAX_SEQUENCE_LENGTH.

    Args:
        texts: iterable of strings to encode.
        tokenizer: a fitted tokenizer exposing `texts_to_sequences`.

    Returns:
        The result of `pad_sequences` applied to the tokenized texts with
        `maxlen=MAX_SEQUENCE_LENGTH`.
    """
    token_id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_id_sequences, maxlen=MAX_SEQUENCE_LENGTH)


# Build the vocabulary from the training split only, so the validation text
# does not influence the word index.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df[TEXT_COLUMN])

In [6]:
def _load_embeddings_index(path):
    """Parse a whitespace-separated word-vector file into {word: float32 array}."""
    embeddings_index = {}
    # Read as UTF-8 explicitly; relying on the platform default encoding can
    # fail or garble non-ASCII tokens on systems whose default is not UTF-8.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index


def _build_embedding_matrix(word_index, embeddings_index, dimension):
    """Build a (len(word_index) + 1, dimension) matrix of pretrained vectors.

    Row i holds the vector of the word whose tokenizer index is i. Rows for
    words missing from `embeddings_index` (and row 0, which no word maps to)
    stay all-zeros.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, dimension))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def _build_cnn_layers(embedding_matrix):
    """Wire up the CNN and return (input_layer, output_layer)."""
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # Frozen embedding initialised from the pretrained vectors.
    # NOTE(review): the tokenizer is created with num_words=MAX_NUM_WORDS, so
    # rows past that index are presumably never looked up; the matrix could
    # likely be shrunk - confirm before changing, it alters the param count.
    embedding_layer = Embedding(embedding_matrix.shape[0],
                                EMBEDDINGS_DIMENSION,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    x = embedding_layer(sequence_input)
    # Three conv/pool stages with kernel widths 2, 3 and 4.
    x = Conv1D(128, 2, activation='relu', padding='same')(x)
    x = MaxPooling1D(5, padding='same')(x)
    x = Conv1D(128, 3, activation='relu', padding='same')(x)
    x = MaxPooling1D(5, padding='same')(x)
    x = Conv1D(128, 4, activation='relu', padding='same')(x)
    # The pool window (40) exceeds the remaining sequence length, so with
    # padding='same' this collapses the time axis to a single step.
    x = MaxPooling1D(40, padding='same')(x)
    x = Flatten()(x)
    x = Dropout(DROPOUT_RATE)(x)
    x = Dense(128, activation='relu')(x)
    # Two-way softmax: column 0 / column 1 correspond to label 0 / label 1.
    preds = Dense(2, activation='softmax')(x)
    return sequence_input, preds


def train_model(train_df, validate_df, tokenizer):
    """Train the CNN text classifier and return the fitted Keras model.

    Args:
        train_df: DataFrame with TEXT_COLUMN and config.TOXICITY_COLUMN,
            used for fitting.
        validate_df: DataFrame with the same columns, passed to `fit` as
            validation data.
        tokenizer: fitted tokenizer; provides `word_index` and is used by
            `pad_text` to encode the text.

    Returns:
        The compiled and fitted `keras.models.Model`.
    """
    # Prepare data. num_classes is pinned to 2 so the one-hot width always
    # matches the 2-unit softmax, even if a split contains a single class.
    train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)
    train_labels = to_categorical(train_df[config.TOXICITY_COLUMN],
                                  num_classes=2)
    validate_text = pad_text(validate_df[TEXT_COLUMN], tokenizer)
    validate_labels = to_categorical(validate_df[config.TOXICITY_COLUMN],
                                     num_classes=2)

    # Pretrained embeddings aligned with the tokenizer's word index.
    embeddings_index = _load_embeddings_index(EMBEDDINGS_PATH)
    embedding_matrix = _build_embedding_matrix(tokenizer.word_index,
                                               embeddings_index,
                                               EMBEDDINGS_DIMENSION)

    # Compile model.
    input_layer, output_layer = _build_cnn_layers(embedding_matrix)
    model = Model(input_layer, output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(learning_rate=LEARNING_RATE),
                  metrics=['acc'])

    # Train model.
    model.fit(train_text,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              validation_data=(validate_text, validate_labels),
              verbose=2)

    return model

# Fit the CNN on the training split; the validation split is only used for
# the per-epoch val_loss / val_acc reported below.
model = train_model(train_df=train_df,
                    validate_df=validate_df,
                    tokenizer=tokenizer)

Epoch 1/10
313/313 - 8s - loss: 0.2350 - acc: 0.9419 - val_loss: 0.2078 - val_acc: 0.9445 - 8s/epoch - 24ms/step
Epoch 2/10
313/313 - 4s - loss: 0.2103 - acc: 0.9422 - val_loss: 0.1966 - val_acc: 0.9445 - 4s/epoch - 12ms/step
Epoch 3/10
313/313 - 4s - loss: 0.1949 - acc: 0.9422 - val_loss: 0.1810 - val_acc: 0.9444 - 4s/epoch - 14ms/step
Epoch 4/10
313/313 - 4s - loss: 0.1802 - acc: 0.9427 - val_loss: 0.1761 - val_acc: 0.9466 - 4s/epoch - 14ms/step
Epoch 5/10
313/313 - 4s - loss: 0.1708 - acc: 0.9436 - val_loss: 0.1666 - val_acc: 0.9462 - 4s/epoch - 14ms/step
Epoch 6/10
313/313 - 4s - loss: 0.1629 - acc: 0.9452 - val_loss: 0.1616 - val_acc: 0.9482 - 4s/epoch - 12ms/step
Epoch 7/10
313/313 - 4s - loss: 0.1554 - acc: 0.9470 - val_loss: 0.1578 - val_acc: 0.9490 - 4s/epoch - 13ms/step
Epoch 8/10
313/313 - 4s - loss: 0.1490 - acc: 0.9490 - val_loss: 0.1605 - val_acc: 0.9474 - 4s/epoch - 14ms/step
Epoch 9/10
313/313 - 3s - loss: 0.1441 - acc: 0.9502 - val_loss: 0.1528 - val_acc: 0.9504 - 3s/e

In [7]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
# of the trained model as a sanity check.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 250)]             0         
                                                                 
 embedding (Embedding)       (None, 250, 100)          4333800   
                                                                 
 conv1d (Conv1D)             (None, 250, 128)          25728     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 128)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 50, 128)           49280     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 10, 128)          0         
 1D)                                                         

In [8]:
# Name of the column that will hold this model's scores.
MODEL_NAME = 'CNN'
# The model emits a 2-way softmax; column 1 is the probability assigned to
# label 1 (the one-hot position of label value 1 under to_categorical).
y_pred = model.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]
# Rebind via .assign instead of mutating in place: validate_df comes out of
# train_test_split, and writing a column into it directly can raise
# SettingWithCopyWarning. .assign also overwrites an existing column, so the
# cell stays safe to re-run.
validate_df = validate_df.assign(**{MODEL_NAME: y_pred})



In [9]:
# Evaluate the validation predictions with the project's final_metric helper.
# Judging by the cell output, it prints a per-subgroup table (subgroup_auc,
# bpsn_auc, bnsp_auc) and returns the overall "Final Metric" value.
# NOTE(review): presumably it reads the MODEL_NAME column of validate_df -
# confirm against final_metric.get_value.
final_metric.get_value(validate_df, y_pred, MODEL_NAME)


---------- Model Performance: CNN ----------

                        subgroup  subgroup_size  subgroup_auc  bpsn_auc  bnsp_auc
2      homosexual_gay_or_lesbian             52      0.653409  0.772581  0.765403
5                         muslim            124      0.688150  0.778919  0.792759
3                      christian            211      0.731935  0.889428  0.669406
4                         jewish             39      0.735714  0.679964  0.915166
6                          black             93      0.775776  0.693163  0.893339
1                         female            282      0.798221  0.796801  0.845759
7                          white            140      0.803355  0.683475  0.909801
8  psychiatric_or_mental_illness             22      0.835294  0.868578  0.804440
0                           male            209      0.861640  0.799534  0.887842
Final Metric: 0.7945709845269946


0.7945709845269946