In [None]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate, Dropout
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.preprocessing import text, sequence
from gensim.models import KeyedVectors
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
def perf_measure(y_actual, y_hat):
    """Count the confusion-matrix cells for binary (0/1) labels.

    The two inputs are compared element by element *by position*, so pandas
    Series with an arbitrary index (for example one keyed by comment id) are
    handled the same way as plain lists or NumPy arrays.

    Parameters
    ----------
    y_actual : sequence
        Ground-truth labels.
    y_hat : sequence
        Predicted labels. Only the first ``len(y_hat)`` entries of
        ``y_actual`` are inspected.

    Returns
    -------
    tuple
        ``(TP, FP, TN, FN)`` as integers. A prediction that is neither 0 nor 1
        is not counted in any cell.

    Raises
    ------
    ValueError
        If ``y_actual`` has fewer elements than ``y_hat``.
    """
    if len(y_actual) < len(y_hat):
        raise ValueError(
            'y_actual has {} elements but y_hat has {}'.format(len(y_actual), len(y_hat))
        )

    TP = FP = TN = FN = 0

    # zip() walks both inputs positionally; indexing with [i] would be a
    # label lookup on a pandas Series and fail for a non-default index.
    for actual, predicted in zip(y_actual, y_hat):
        if predicted == 1:
            if actual == 1:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            if actual == 0:
                TN += 1
            else:
                FN += 1

    return (TP, FP, TN, FN)

In [None]:
# def reduce_mem_usage(df):
#     start_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
#     for col in df.columns:
#         col_type = df[col].dtype
        
#         if col_type != object:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)  
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)
#         else:
#             df[col] = df[col].astype('category')

#     end_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
#     print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
#     return df

In [None]:
# Seed TensorFlow's global random generator so repeated runs start from the same state.
tf.random.set_seed(0)

# Pre-trained word-vector files (gensim KeyedVectors format) read by build_matrix;
# index 0 is the crawl vectors, index 1 the glove vectors.
EMBEDDING_FILES = [
    'crawl-300d-2M.gensim',
    'glove.840B.300d.gensim'
]
MAX_FEATURES = 50000  # max number of unique words to keep based on frequency
#NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128  # output vector dimension of each lstm cell
# Must equal the width of the pooled vector in build_model:
# (max pool + average pool) x (forward + backward LSTM) = 4 * LSTM_UNITS.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 6
DROPOUT_RATE = 0.2
TRAIN_PERCENT = 0.8
VALID_PERCENT = 0.25  # 25% from the training set is equiv to 20% from the whole dataset

# Identity columns of train_df. Later cells (the binarisation loop and the
# sample-weight computation) reference this name, so it has to be defined here.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
# AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
# CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

In [None]:
# def build_matrix(word_index, path):
#     embedding_index = KeyedVectors.load(path, mmap='r')
#     embedding_matrix = np.zeros((len(word_index) + 1, 300))
#     for word, i in word_index.items():
#         for candidate in [word, word.lower(), word.title()]:
#             if candidate in embedding_index:
#                 embedding_matrix[i] = embedding_index[candidate]
#                 break
#     return embedding_matrix

def _lookup_vector(vectors, token):
    """Return the vector for ``token``, or ``None`` if no spelling is known.

    The spellings are tried in order: as given, lower-cased, title-cased.
    Each alternative is only computed when the previous lookup missed.
    """
    spellings = (
        lambda: token,
        lambda: token.lower(),
        lambda: token.title(),
    )
    for spell in spellings:
        try:
            return vectors[spell()]
        except KeyError:
            continue
    return None


def build_matrix(word_index, path):
    """Build an embedding matrix for the tokenizer vocabulary.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer row index.
    path : str
        Location of a saved gensim ``KeyedVectors`` file (opened memory-mapped,
        read-only).

    Returns
    -------
    tuple
        ``(embedding_matrix, unknown_words)`` where ``embedding_matrix`` has
        shape ``(MAX_FEATURES + 1, 300)`` and ``unknown_words`` lists, in
        iteration order, the words with index ``<= MAX_FEATURES`` for which no
        spelling was found. Their rows stay all-zero.
    """
    missing = []
    vectors = KeyedVectors.load(path, mmap='r')
    matrix = np.zeros((MAX_FEATURES + 1, 300))

    for token, row in word_index.items():
        # Words ranked beyond the vocabulary cap get no row at all.
        if row > MAX_FEATURES:
            continue
        vector = _lookup_vector(vectors, token)
        if vector is None:
            missing.append(token)
        else:
            matrix[row] = vector

    return matrix, missing




In [None]:
def build_model(embedding_matrix):
    """Assemble and compile the bidirectional-LSTM binary classifier.

    Parameters
    ----------
    embedding_matrix : array of shape (vocabulary rows, embedding width)
        Pre-trained vectors used as frozen (non-trainable) embedding weights.

    Returns
    -------
    keras Model
        Takes padded token-id sequences of length ``MAX_LEN`` and outputs one
        sigmoid score per sequence. Compiled with binary cross-entropy and the
        Adam optimizer at its default settings.
    """
    token_ids = Input(shape=(MAX_LEN,))

    # Frozen lookup table, followed by element-wise dropout on the embedded sequence.
    features = Embedding(
        *embedding_matrix.shape,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    features = Dropout(DROPOUT_RATE)(features)

    # Two stacked bidirectional LSTMs, each returning the full sequence.
    # (CuDNNLSTM can stand in for LSTM on a GPU runtime.)
    for _ in range(2):
        features = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(features)

    # Summarise the sequence with max- and average-pooling over time.
    pooled = concatenate([
        GlobalMaxPooling1D()(features),
        GlobalAveragePooling1D()(features),
    ])

    # Two residual dense blocks: the input is added back to the dense output.
    for _ in range(2):
        pooled = add([pooled, Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)])

    toxicity = Dense(1, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=[toxicity])
    classifier.compile(loss='binary_crossentropy', optimizer='adam')
    return classifier

# Load the competition train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv',
        '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv',
    )
)

In [None]:
# Load the preprocessed comments table; its 'id' column becomes the row index.
df = pd.read_csv('comments_preprocessed_1.csv', index_col = 'id')

In [None]:
# First line: total memory used by df in MB; second line: its (rows, columns) shape.
print(
    df.memory_usage().sum() / 1024**2,
    df.shape,
    sep='\n',
)

In [None]:
# Maximum number of whitespace-separated tokens kept per comment; also the
# padded sequence length fed to the model.
# NOTE(review): presumably the upper outer fence of the comment-length
# distribution; confirm how 242 was derived.
MAX_LEN = 242
comments = df[TEXT_COLUMN].astype('str')
y = df[TARGET_COLUMN].astype(np.int8)

# True for comments with at most MAX_LEN tokens. The threshold is read from
# MAX_LEN so the filter and the padding length cannot drift apart.
non_long_len = comments.apply(lambda x: len(x.split()) <= MAX_LEN)
text_removelong = comments[non_long_len].copy()
non_long_len_indices = text_removelong.index
# Select labels with the same boolean mask as the texts: a label lookup via
# .loc[index] would return extra rows if an id ever appeared more than once.
y_removelong = y[non_long_len].copy()

In [None]:
# Number of rows assigned to the training split (TRAIN_PERCENT of the kept rows, rounded).
train_size = round(len(y_removelong) * TRAIN_PERCENT)
print(train_size)

In [None]:
# Sequential, unshuffled split: the first train_size rows form the training
# set and the remainder the test set. .iloc makes the positional intent
# explicit; the previous `[:train_size,]` form passed a 1-tuple key, which only
# works because pandas unpacks single-slice tuples internally.
x_train = text_removelong.iloc[:train_size]
y_train = y_removelong.iloc[:train_size]
x_test = text_removelong.iloc[train_size:]
y_test = y_removelong.iloc[train_size:]

In [None]:
# train_df = df.iloc[:1500,]
# test_df = df.iloc[1500:,]

# x_train = train_df[TEXT_COLUMN].astype('str')
# y_train = train_df[TARGET_COLUMN].values.astype(np.int8)
# #y_aux_train = train_df[AUX_COLUMNS].values
# x_test = test_df[TEXT_COLUMN].astype('str')
# y_test = test_df[TARGET_COLUMN].values.astype(np.int8)

In [None]:
# MAX_LEN = max(x_train.apply(lambda x: len(x.split()))) # outter_fence
# MAX_LEN

# Turn the identity and target columns of train_df into booleans:
# True where the value is at least 0.5, False otherwise (including missing values).
# Requires IDENTITY_COLUMNS, TARGET_COLUMN and train_df from earlier cells.
for column in [*IDENTITY_COLUMNS, TARGET_COLUMN]:
    train_df[column] = train_df[column] >= 0.5

In [None]:
# Case-sensitive tokenizer capped at MAX_FEATURES words, fitted on the
# training and test texts together so both share one vocabulary.
tokenizer = text.Tokenizer(lower=False, num_words=MAX_FEATURES)
tokenizer.fit_on_texts([*x_train, *x_test])

In [None]:
# Size of the tokenizer's word_index after fitting.
len(tokenizer.word_index)

In [None]:
# Replace each text split by its token-id sequences, padded/truncated to MAX_LEN.
# Note: this overwrites x_train / x_test, so the cell is meant to run once per kernel.
x_train, x_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
    for texts in (x_train, x_test)
)

# Per-row weights, built up in place: every row starts at 1.
# NOTE(review): the vector has len(x_train) entries, but every term added
# below is computed over all rows of train_df, which is read from a different
# CSV than the data x_train comes from; confirm the two actually line up.
# NOTE(review): sample_weights is not passed to model.fit in the visible code.
sample_weights = np.ones(len(x_train), dtype=np.float32)
# + number of identity columns that are True for the row
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
# + (target) * number of identity columns that are False for the row
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
# + 5 * (not target) * number of identity columns that are True for the row
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
# Rescale so the weights average to 1.
sample_weights /= sample_weights.mean()

In [None]:
# unknown_words = []
# embedding_matrix, unknown_words = np.concatenate(
#     [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)
# print('n unknown words: ', len(unknown_words))

In [None]:
# Build one matrix per embedding file, reporting how many vocabulary words each file lacks.
embedding_matrix_cl, unknown_words_cl = build_matrix(tokenizer.word_index, EMBEDDING_FILES[0])
print('n unknown words(crawl): ', len(unknown_words_cl))

embedding_matrix_gl, unknown_words_gl = build_matrix(tokenizer.word_index, EMBEDDING_FILES[1])
print('n unknown words(glove): ', len(unknown_words_gl))

# Place the two vector sets side by side so each word row carries both embeddings.
embedding_matrix = np.concatenate(
    (embedding_matrix_cl, embedding_matrix_gl),
    axis=-1,
)

# The per-file matrices are no longer needed once combined; release them.
del embedding_matrix_cl, embedding_matrix_gl

embedding_matrix.shape

In [None]:
# checkpoint_predictions = []
# weights = []
# accuracies = []
# losses = []
# for model_idx in range(NUM_MODELS):
#     accuracy = []
#     loss = []
#     print('\n\nmodel:', model_idx,'\n')
#     model = build_model(embedding_matrix, 0)
#     for global_epoch in range(EPOCHS):
#         hist = model.fit(
#             x_train,
#             #[y_train, y_aux_train],
#             y_train,
#             batch_size=BATCH_SIZE,
#             epochs=1,
#             verbose=2
#             #,sample_weight=[sample_weights.values, np.ones_like(sample_weights)]
#         )
#         checkpoint_predictions.append(model.predict(x_test, batch_size=128).flatten())
#         weights.append(2 ** global_epoch)
        
#         # accuracy
#         predictions_encoded = pd.Series(checkpoint_predictions[-1]).apply(lambda x: 0 if x < 0.5 else 1)
#         acc = accuracy_score(y_test, predictions_encoded)
#         accuracy.append(acc)
#         print('accuracy:', acc)
        
#         # loss
#         loss.append(hist.history['loss'][0])
        
#     accuracies.append(accuracy)
#     losses.append(loss)

# Build the network on the combined embedding matrix and train it.
# validation_split makes Keras hold out a VALID_PERCENT fraction of the training
# arrays for validation (Keras takes it from the end of the arrays, before
# shuffling); the per-epoch losses end up in hist.history.
model = build_model(embedding_matrix)
hist = model.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALID_PERCENT,
    verbose=2,
)

# Score the held-out rows; flatten() turns the (n, 1) sigmoid output into a 1-D array.
predictions = model.predict(x_test, batch_size=512).flatten()

In [None]:
# Convert each predicted score to a hard label: 0 when the score is below 0.5, otherwise 1.
predictions_encoded = pd.Series(predictions).apply(
    lambda score: int(not score < 0.5)
)

In [None]:
# Print accuracy, precision, recall and F1 on the test split, one value per line.
for metric_fn in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric_fn(y_test, predictions_encoded))

In [None]:
import matplotlib.pyplot as plt
def plot_metric(history, EPOCHS):
    """Plot training and validation loss against the epoch number.

    Parameters
    ----------
    history : mapping
        Must provide ``'loss'`` and ``'val_loss'`` sequences, each with one
        value per epoch.
    EPOCHS : int
        Number of epochs; the x axis runs over ``0 .. EPOCHS - 1``.
    """
    epoch_axis = [*range(EPOCHS)]
    curves = (
        ('loss', {'label': 'Training loss'}),
        ('val_loss', {'label': 'Validation loss', 'linestyle': "--"}),
    )
    for history_key, line_options in curves:
        plt.plot(epoch_axis, history[history_key], **line_options)

    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Draw the training vs. validation loss curves recorded by model.fit.
plot_metric(hist.history, EPOCHS)

In [None]:
# Number of rows in the test split (length of y_test's index).
len(list(y_test.index))

In [None]:
# Pair every hard prediction with the id of the test row it belongs to
# (predictions are in the same order as y_test).
y_test_pred = pd.DataFrame(
    {
        'lstm_predict': predictions_encoded,
        'id': [*y_test.index],
    }
)

In [None]:
# Attach the true labels to the predictions: match the 'id' column of
# y_test_pred against the index values of y_test, then make 'id' the row index.
# NOTE(review): assumes ids are unique; duplicates would multiply rows in the merge.
y_result = y_test_pred.merge(y_test, left_on='id', right_on=y_test.index).set_index('id')

In [None]:
# Save the predictions-plus-labels table to disk.
y_result.to_csv('result_lstm_batch512_epoch6_imbal.csv')

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/pubs/glove.pdf

T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. Advances in Pre-Training Distributed Word Representations. https://arxiv.org/abs/1712.09405