In [16]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

#=================Keras==============
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Conv2D, Embedding, Dropout, Activation, LSTM
from keras.layers import Bidirectional, MaxPooling1D, MaxPooling2D, Reshape, Flatten, concatenate, GlobalMaxPool1D, Permute, multiply
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, backend
#=================nltk===============
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# ---- Locations of the input files -------------------------------------
# `path` is the data root; `comp` is an optional sub-directory prefix
# (empty here, so the CSVs are read straight from `path`).
path = './'
comp = ''
EMBEDDING_FILE = f'{path}glove6b/glove.6B.50d.txt'
TRAIN_DATA_FILE = f'{path}{comp}train.csv'
TEST_DATA_FILE = f'{path}{comp}test.csv'

# ---- Model / preprocessing hyper-parameters ---------------------------
# Width of each word vector.
embed_size = 50
# Number of unique words to use (i.e. number of rows in the embedding matrix).
max_features = 20000
# Maximum number of words of a comment that are used.
maxlen = 100
# Number of CNN filters.
number_filters = 20

# Read the training and test CSV files into DataFrames.
train, test = pd.read_csv(TRAIN_DATA_FILE), pd.read_csv(TEST_DATA_FILE)

# Label columns, in the order used for the target matrix `y`.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment text; missing entries are replaced by the "_na_" placeholder.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values

# Matches every character that is not an ASCII letter, a digit or a space.
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
# Matches a run of digits; each run is replaced by the placeholder token 'n'.
replace_numbers=re.compile(r'\d+',re.IGNORECASE)
# Matches any run of whitespace (spaces, tabs, newlines, ...).
_whitespace_run = re.compile(r'\s+')

# NLTK resources are built on first use and then reused, instead of being
# re-created for every single comment.
_english_stopwords = None
_english_stemmer = None


def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    """Normalise one comment and return it as a space-separated string.

    Steps, in order:
      1. every whitespace run (including newlines and tabs) becomes a single
         space, so that words on different lines are not glued together when
         non-alphanumeric characters are stripped;
      2. characters other than ASCII letters, digits and spaces are removed;
      3. each run of digits is replaced by the token 'n';
      4. the text is lower-cased and split on whitespace;
      5. optionally, English stop words are dropped;
      6. optionally, each word is reduced to its Snowball stem.

    Args:
        text: the raw comment string.
        remove_stopwords: drop NLTK English stop words when True.
        stem_words: apply the English Snowball stemmer when True.

    Returns:
        A single string of the remaining words joined by one space. The
        result is a string for every combination of the two flags.
    """
    global _english_stopwords, _english_stemmer

    text = _whitespace_run.sub(' ', text)
    text = special_character_removal.sub('', text)
    text = replace_numbers.sub('n', text)
    words = text.lower().split()

    if remove_stopwords:
        if _english_stopwords is None:
            _english_stopwords = frozenset(stopwords.words("english"))
        words = [w for w in words if w not in _english_stopwords]

    if stem_words:
        if _english_stemmer is None:
            _english_stemmer = SnowballStemmer('english')
        words = [_english_stemmer.stem(w) for w in words]

    return " ".join(words)

In [3]:
# Run the cleaning function over every training and test comment.
comments = [text_to_wordlist(raw_comment) for raw_comment in list_sentences_train]
test_comments = [text_to_wordlist(raw_comment) for raw_comment in list_sentences_test]

# The tokenizer is fitted on the cleaned training comments only and is then
# used to encode both the training and the test comments.
# tokenizer = Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(comments))

comments_sequence = tokenizer.texts_to_sequences(comments)
test_comments_sequence = tokenizer.texts_to_sequences(test_comments)

# Pad the encoded sequences with `maxlen` as the target length.
X_t = pad_sequences(comments_sequence, maxlen=maxlen)
X_te = pad_sequences(test_comments_sequence, maxlen=maxlen)

In [4]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: the token (first field of an embedding line).
        *arr: the remaining fields, one per vector component.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 numpy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build {word: vector} from the embedding file. The file is opened in a
# `with` block so the handle is always closed, and read as UTF-8 so the result
# does not depend on the platform's default encoding.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of crashing in get_coefs().
            continue
        token, vector = get_coefs(*fields)
        embeddings_index[token] = vector

# np.stack needs a real sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# Every id in `word_index` is used directly as a row number below, and the
# Keras Tokenizer hands out ids starting at 1 (0 is reserved), so the matrix
# needs len(word_index) + 1 rows to hold the highest id, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Rows without a pre-trained vector keep this random initialisation, drawn
# with the same mean/std as the pre-trained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [23]:
# Sizes for the attention block. INPUT_DIM is not referenced by the code
# visible in this notebook.
INPUT_DIM, TIME_STEPS = 100, 100

# If True, the attention vector is shared across the input dimensions where
# the attention is applied.
SINGLE_ATTENTION_VECTOR = False

# Not referenced by the code visible in this notebook.
APPLY_ATTENTION_BEFORE_LSTM = False


def attention_3d_block(inputs):
    """Weight a 3-D sequence tensor element-wise by learned attention scores.

    The input is permuted to (batch, input_dim, time_steps), passed through a
    softmax-activated Dense layer whose width is the number of time steps,
    permuted back, and multiplied element-wise with the original input.

    When SINGLE_ATTENTION_VECTOR is True the scores are averaged over the
    input_dim axis and repeated, so every input dimension shares one vector.

    Args:
        inputs: tensor of shape (batch_size, time_steps, input_dim).

    Returns:
        A tensor with the same shape as ``inputs``.
    """
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    # Take the number of time steps from the tensor itself, so the block stays
    # correct when the sequence length differs from TIME_STEPS. Fall back to
    # the constant only if the dimension is not statically known.
    try:
        time_steps = int(inputs.shape[1])
    except TypeError:
        time_steps = TIME_STEPS
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, time_steps))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(time_steps, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        # Lambda / RepeatVector / backend are reached through the `layers` and
        # `backend` modules that the notebook already imports from keras.
        a = layers.Lambda(lambda x: backend.mean(x, axis=1), name='dim_reduction')(a)
        a = layers.RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    # output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    output_attention_mul = multiply([inputs, a_probs])
    return output_attention_mul

In [24]:
# Input: one row of `maxlen` integer word ids per comment.
inp = Input(shape=(maxlen,))
print(inp.shape)
# The vocabulary size is read from the weight matrix itself, so the layer and
# its initial weights can never disagree on the number of rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
print(x.shape)
# return_sequences=True keeps one output per time step, as required by the
# attention block that follows.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
attention_mul = attention_3d_block(x)
attention_mul = Flatten()(attention_mul)
# One sigmoid unit per label column.
output = Dense(len(list_classes), activation='sigmoid')(attention_mul)
model = Model(inputs=inp, outputs=output)
# Earlier max-pooling head, kept disabled for reference:
# print(x.shape)
# x = GlobalMaxPool1D()(x)
# print(x.shape)
# x = Dense(50, activation="relu")(x)
# print(x.shape)
# x = Dropout(0.1)(x)
# print(x.shape)
# x = Dense(6, activation="sigmoid")(x)
# print(x.shape)
# model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

(?, 100)
(?, 100, 50)


In [25]:
# Train for three epochs on the full training set (no validation split).
model.fit(
    X_t,
    y,
    batch_size=64,
    epochs=3,
)  # validation_split=0.1);

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa992d22630>

In [28]:
# Continue training the same model for one additional epoch.
model.fit(
    X_t,
    y,
    batch_size=64,
    epochs=1,
)  # validation_split=0.1);

Epoch 1/1


<keras.callbacks.History at 0x7fa8bb59cf98>

In [26]:
# Predict on the padded test sequences.
y_test = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)

# Load the sample submission, overwrite its label columns with the
# predictions, and write the result without the index column.
sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission_lstm.csv', index=False)



In [None]:
# model = Sequential()
# model.add(LSTM(input_shape=()))
# model.compile()