In [1]:
import tensorflow as tf
print(tf.__version__)

2.2.0


In [45]:
import numpy as np
import pandas as pd
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate, Dropout
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.preprocessing import text, sequence

import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [7]:
# Pin the NumPy and TensorFlow random generators so repeated runs start from
# the same state. The two libraries are seeded independently.
from numpy.random import seed

NUMPY_SEED, TENSORFLOW_SEED = 1, 2

seed(NUMPY_SEED)
tf.random.set_seed(TENSORFLOW_SEED)

In [8]:
def perf_measure(y_actual, y_hat):
    """Count the confusion-matrix cells of a binary prediction.

    Labels and predictions are paired by position, not by index label, so a
    pandas Series whose index does not start at 0 (for example a slice of a
    larger frame) is handled the same way as a list or an array.

    Args:
        y_actual: iterable of true labels (0 or 1).
        y_hat: iterable of predicted labels (0 or 1). Every prediction is
            scored; extra trailing entries of ``y_actual`` are ignored.

    Returns:
        Tuple ``(TP, FP, TN, FN)`` of integer counts. Predictions that are
        neither 0 nor 1 are not counted in any cell.

    Raises:
        IndexError: if ``y_actual`` has fewer entries than ``y_hat``.
    """
    actual = list(y_actual)
    predicted = list(y_hat)
    if len(actual) < len(predicted):
        raise IndexError(
            "y_actual has %d entries but y_hat has %d" % (len(actual), len(predicted))
        )

    TP = FP = TN = FN = 0
    # zip stops at the end of `predicted`, the shorter (or equal) sequence.
    for truth, guess in zip(actual, predicted):
        if guess == 1:
            if truth == 1:
                TP += 1
            else:
                FP += 1
        elif guess == 0:
            if truth == 0:
                TN += 1
            else:
                FN += 1

    return (TP, FP, TN, FN)

In [58]:

# --- Pre-trained embeddings --------------------------------------------------
# Saved gensim vector files; each one is loaded by build_matrix().
EMBEDDING_FILES = ['crawl-300d-2M.gensim', 'glove.840B.300d.gensim']

# --- Vocabulary and network size ---------------------------------------------
MAX_FEATURES = 50000  # max number of unique words to keep based on frequency
LSTM_UNITS = 128  # output vector dimension of each lstm cell
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4
DROPOUT_RATE = 0.5
#NUM_MODELS = 2

# --- Training ----------------------------------------------------------------
BATCH_SIZE = 512
EPOCHS = 20
TRAIN_PERCENT = 0.8
VALID_PERCENT = 0.25  # 25% from the training set is equiv to 20% from the whole dataset

# --- Dataset columns ---------------------------------------------------------
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'

# --- Disabled settings (kept for reference) ----------------------------------
# IDENTITY_COLUMNS = [
#     'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
#     'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
# ]
# AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
# CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

In [10]:
def build_matrix(word_index, path):
    """Build an embedding matrix for a vocabulary from saved gensim vectors.

    Row ``i`` of the result holds the vector of the word whose index is ``i``.
    Each word is looked up as written, then lower-cased, then title-cased; the
    first form present in the vectors is used. Rows for words found in none of
    these forms are left as zeros.

    Args:
        word_index: mapping of word -> integer index. Entries whose index is
            greater than ``MAX_FEATURES`` are skipped entirely.
        path: file handed to ``KeyedVectors.load`` (opened with ``mmap='r'``).

    Returns:
        Tuple ``(embedding_matrix, unknown_words)``: a float array with
        ``MAX_FEATURES + 1`` rows and one column per embedding dimension, and
        the list of in-range words for which no vector was found.
    """
    embedding_index = KeyedVectors.load(path, mmap='r')
    # Size the matrix from the loaded vectors instead of assuming 300 columns;
    # 300 (the previously hard-coded width) is the fallback if the loaded
    # object does not expose `vector_size`.
    vector_size = getattr(embedding_index, 'vector_size', 300)
    embedding_matrix = np.zeros((MAX_FEATURES + 1, vector_size))
    unknown_words = []

    for word, i in word_index.items():
        if i > MAX_FEATURES:
            continue
        # Try progressively normalised spellings; stop at the first hit.
        for candidate in (word, word.lower(), word.title()):
            try:
                embedding_matrix[i] = embedding_index[candidate]
            except KeyError:
                continue
            break
        else:
            # No spelling matched: keep the zero row and report the word.
            unknown_words.append(word)

    return embedding_matrix, unknown_words

In [11]:
def build_model(embedding_matrix):
    """Assemble and compile the two-layer bidirectional LSTM classifier.

    The input is a batch of integer sequences of length ``MAX_LEN``. They pass
    through a frozen embedding initialised from ``embedding_matrix``, spatial
    dropout, and two stacked bidirectional LSTMs that return full sequences.
    Max- and average-pooling over the sequence axis are concatenated, refined
    by two residual dense blocks, and mapped to one sigmoid output.

    ``MAX_LEN``, ``LSTM_UNITS`` and ``DENSE_HIDDEN_UNITS`` are read from module
    scope when the function is called.

    Args:
        embedding_matrix: 2-D array whose shape supplies the embedding layer's
            input and output dimensions and whose values are its weights.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss and the
        ``'adam'`` optimizer.
    """
    vocab_rows, embedding_dim = embedding_matrix.shape

    words = Input(shape=(MAX_LEN,))
    embedded = Embedding(
        vocab_rows,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(words)

    # NOTE(review): the rate is fixed at 0.2 here; the module-level
    # DROPOUT_RATE constant is not read anywhere in this function.
    seq_features = SpatialDropout1D(0.2)(embedded)
    for _ in range(2):
        seq_features = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(seq_features)

    # Summarise the sequence axis two ways and join the summaries.
    max_pooled = GlobalMaxPooling1D()(seq_features)
    avg_pooled = GlobalAveragePooling1D()(seq_features)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two residual blocks: each adds a ReLU dense projection back onto its input.
    for _ in range(2):
        projected = Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
        hidden = add([hidden, projected])

    result = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=words, outputs=[result])
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [49]:
# df = pd.read_csv('sample.csv', index_col = 'id')
# print(df.memory_usage().sum() / 1024**2) # MB

# Load the preprocessed comments, using the `id` column as the index.
df = pd.read_csv('comments_preprocessed_1.csv', index_col='id')

# Report the frame's memory footprint: total bytes divided by 1024^2.
bytes_per_mb = 1024 ** 2
print(df.memory_usage().sum() / bytes_per_mb) # MB

  mask |= (ar1 == a)


40.395973205566406


In [50]:
# Show the (rows, columns) of the frame that was just loaded.
df.shape

(1764927, 2)

In [51]:
# Longest comment, in whitespace-separated words, that is kept. The same value
# is the padded sequence length and the model's input length.
# NOTE(review): "outer fence" presumably refers to an outlier threshold of the
# comment-length distribution; confirm where 242 was derived.
MAX_LEN = 242 # outer fence
comments = df[TEXT_COLUMN].astype('str')
y = df[TARGET_COLUMN].astype(np.int8)

# True for comments with at most MAX_LEN words. The threshold is read from
# MAX_LEN so the filter and the padding length cannot drift apart.
non_long_len = comments.apply(lambda x: len(x.split()) <= MAX_LEN)
text_removelong = comments[non_long_len].copy()
non_long_len_indices = text_removelong.index
# Select the labels with the same boolean mask as the texts. This keeps the two
# series row-aligned even if the `id` index contains repeated values, which a
# label-based .loc lookup would not.
y_removelong = y[non_long_len].copy()

In [52]:
# Number of leading rows that go to the training split.
n_kept_rows = len(y_removelong)
train_size = round(n_kept_rows * TRAIN_PERCENT)
print(train_size)


1411933


In [53]:
# Positional split: the first `train_size` rows train, the remainder tests.
# .iloc makes the positional intent explicit; the previous `series[:n,]` form
# indexed the Series with a one-element tuple.
x_train = text_removelong.iloc[:train_size]
y_train = y_removelong.iloc[:train_size]
x_test = text_removelong.iloc[train_size:]
y_test = y_removelong.iloc[train_size:]

In [54]:

# Case-sensitive tokenizer limited to MAX_FEATURES words.
# NOTE(review): the vocabulary is fitted on train and test texts together, so
# test-set words influence the word ranking; confirm this is intended.
tokenizer = text.Tokenizer(lower=False, num_words=MAX_FEATURES)
#tokenizer = text.Tokenizer(lower = False)
tokenizer.fit_on_texts([*x_train, *x_test])

In [55]:
# Count every distinct word the tokenizer recorded while fitting.
len(tokenizer.word_index)

242697

In [59]:
# Replace each text split with its integer sequences, padded to MAX_LEN.
# The tuple of inputs is captured before either name is rebound.
x_train, x_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(split_texts), maxlen=MAX_LEN)
    for split_texts in (x_train, x_test)
)

In [60]:
# Build one matrix per embedding file, print how many vocabulary words each
# file lacks, then join the matrices column-wise.
embedding_sources = (
    ('n unknown words(crawl): ', EMBEDDING_FILES[0]),
    ('n unknown words(glove): ', EMBEDDING_FILES[1]),
)

per_file_matrices, per_file_unknown = [], []
for report_label, vector_path in embedding_sources:
    file_matrix, file_unknown = build_matrix(tokenizer.word_index, vector_path)
    print(report_label, len(file_unknown))
    per_file_matrices.append(file_matrix)
    per_file_unknown.append(file_unknown)

unknown_words_cl, unknown_words_gl = per_file_unknown
embedding_matrix = np.concatenate(per_file_matrices, axis=-1)

# Release every remaining reference to the per-file matrices.
del file_matrix
del per_file_matrices

embedding_matrix.shape

n unknown words(crawl):  2860
n unknown words(glove):  2940


(50001, 600)

In [None]:
# dropoutrate 0.5
# NOTE(review): build_model() uses a fixed SpatialDropout1D(0.2) and does not
# read DROPOUT_RATE, so the rate named above does not reach the network.

model = build_model(embedding_matrix)
# simple early stopping: stop once val_loss has not improved for 2 epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
# keep only the model with the lowest val_loss seen so far on disk
mc = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
hist = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=VALID_PERCENT,
    callbacks=[es, mc]
)

# When training ends, `model` holds the weights of the last epoch that ran,
# which after early stopping is later than the best epoch. Score the test set
# with the checkpoint that ModelCheckpoint saved for the lowest val_loss.
best_model = load_model('best_model.h5')
predictions = best_model.predict(x_test, batch_size=BATCH_SIZE).flatten()

In [47]:
# dropoutrate 0.5
# NOTE(review): build_model() uses a fixed SpatialDropout1D(0.2) and does not
# read DROPOUT_RATE, so changing it here does not alter the network.
DROPOUT_RATE = 0.5
EPOCHS=20

model = build_model(embedding_matrix)
# simple early stopping: stop once val_loss has not improved for 4 epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
# keep only the model with the lowest val_loss seen so far on disk
mc = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
hist = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=VALID_PERCENT,
    callbacks=[es, mc]
)

# When training ends, `model` holds the weights of the last epoch that ran,
# which after early stopping is later than the best epoch. Score the test set
# with the checkpoint that ModelCheckpoint saved for the lowest val_loss.
best_model = load_model('best_model.h5')
predictions = best_model.predict(x_test, batch_size=BATCH_SIZE).flatten()

Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.09964, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.09964
Epoch 00002: early stopping


In [26]:
# Baseline run: train for the full EPOCHS with no early stopping or
# checkpointing, then score the test set with the final weights.
model = build_model(embedding_matrix)
hist = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=VALID_PERCENT
)

# Use the shared BATCH_SIZE constant rather than repeating its value.
predictions = model.predict(x_test, batch_size=BATCH_SIZE).flatten()

Epoch 1/2
Epoch 2/2


In [36]:
# dropoutrate 0.8
# NOTE(review): build_model() uses a fixed SpatialDropout1D(0.2) and does not
# read DROPOUT_RATE, so this assignment does not change the network that is
# trained below. Wire the constant into build_model before comparing rates.
DROPOUT_RATE = 0.8
model = build_model(embedding_matrix)
hist = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=3,
    verbose=1,
    validation_split=VALID_PERCENT
)

# Use the shared BATCH_SIZE constant rather than repeating its value.
predictions = model.predict(x_test, batch_size=BATCH_SIZE).flatten()

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [42]:
# Reload the model from 'best_model.h5', the file the ModelCheckpoint callbacks write to.
saved_model = load_model('best_model.h5')

In [43]:
# Display the reloaded model object to confirm that loading succeeded.
saved_model

<tensorflow.python.keras.engine.training.Model at 0x7f6f7a2f5fd0>

In [41]:
# Per-epoch training and validation loss recorded by the fit() call that last assigned `hist`.
hist.history

{'loss': [0.4225098490715027, 0.2886624038219452, 0.2296716272830963],
 'val_loss': [0.10113757848739624, 0.11286519467830658, 0.13625843822956085]}