In [1]:
import sys, os, re, csv, codecs, gc, numpy as np, \
pandas as pd, pickle as pkl, tensorflow as tf

#=================Keras==============
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Conv1D, Conv2D, \
Embedding, Dropout, Activation, Permute
from keras.layers import Bidirectional, MaxPooling1D, MaxPooling2D, \
Reshape, Flatten, concatenate, BatchNormalization, GlobalMaxPool1D, \
GlobalMaxPool2D
from keras import backend
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, backend
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')
#=================nltk===============
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
#=================gensim=============
import gensim
#=================save_list==========
import pickle
#=================sklearn============
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
# model = gensim.models.KeyedVectors.load_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', binary=True)
# model.save_word2vec_format('./word2vec/GoogleNews-vectors-negative300.txt', binary=False)

In [3]:
# Base directory and optional competition sub-folder prefix for the input CSVs.
path = './'
comp = ''

# Full locations of the train/test CSVs, built as <path><comp><file name>.
TRAIN_DATA_FILE = path + comp + 'train.csv'
TEST_DATA_FILE = path + comp + 'test.csv'

In [4]:
# Dimensionality of each word vector (matches the 50d GloVe file used below).
embed_size = 50
# Vocabulary cap: number of unique words kept, i.e. rows of the embedding matrix.
max_features = 20_000
# Number of tokens each comment is padded / truncated to.
maxlen = 100
# Number of convolution filters per window size in the CNN.
number_filters = 100

In [5]:
# Read the raw train and test CSVs.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Shuffle the training rows by reindexing on a random permutation of the index.
# The permutation uses NumPy's global RNG, which is not seeded here.
shuffled_index = np.random.permutation(train.index)
train = train.reindex(shuffled_index)

In [6]:
# The six binary label columns predicted by the model.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment text; missing comments are replaced by the "_na_" placeholder.
train_text = train["comment_text"].fillna("_na_")
test_text = test["comment_text"].fillna("_na_")
list_sentences_train = train_text.values
list_sentences_test = test_text.values

# Label matrix with one column per entry of list_classes.
y = train[list_classes].values

In [7]:
# Matches every character that is not an ASCII letter, a digit or a space.
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
# Matches each run of digits.
replace_numbers = re.compile(r'\d+', re.IGNORECASE)

# NLTK resources are built on first use and then shared by every call,
# instead of being rebuilt for each comment.
_english_stops = None
_english_stemmer = None


def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    """Clean one comment and return it as a single space-separated string.

    Steps, in order: delete every character that is not a letter, digit or
    space; replace each run of digits with the letter 'n'; lower-case and
    split on whitespace; optionally drop English stop words; optionally
    reduce each word to its Snowball stem.

    Parameters
    ----------
    text : str
        Raw comment text.
    remove_stopwords : bool, default True
        Drop words found in NLTK's English stop-word list.
    stem_words : bool, default True
        Stem each remaining word with NLTK's English Snowball stemmer.

    Returns
    -------
    str
        The cleaned words joined by single spaces. A string is returned for
        every flag combination. Previously ``remove_stopwords=False`` with
        ``stem_words=True`` raised AttributeError, and both flags off
        returned a list instead of a string.
    """
    global _english_stops, _english_stemmer

    # Note: special characters are deleted, not replaced by a space, so
    # "a,b" becomes "ab".
    text = special_character_removal.sub('', text)
    text = replace_numbers.sub('n', text)

    # Keep the tokens as a list until the final join so each optional step
    # works regardless of which other steps ran.
    words = text.lower().split()

    if remove_stopwords:
        if _english_stops is None:
            _english_stops = set(stopwords.words("english"))
        words = [w for w in words if w not in _english_stops]

    if stem_words:
        if _english_stemmer is None:
            _english_stemmer = SnowballStemmer('english')
        words = [_english_stemmer.stem(w) for w in words]

    return " ".join(words)

In [8]:
# Switch: run text_to_wordlist over every comment (True) or feed the raw
# text straight to the tokenizer (False).
preprocess = False

if not preprocess:
    # Use the comments exactly as read from the CSVs.
    comments = list_sentences_train
    test_comments = list_sentences_test
else:
    # Clean every train and test comment with the default cleaning options.
    comments = [text_to_wordlist(sentence) for sentence in list_sentences_train]
    test_comments = [text_to_wordlist(sentence) for sentence in list_sentences_test]

In [9]:
# Tokenize. When the text was cleaned beforehand, the filter set is passed
# explicitly (it additionally strips apostrophes); otherwise the Tokenizer
# defaults are used.
tokenizer_options = {'num_words': max_features}
if preprocess:
    tokenizer_options['filters'] = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\''
    tokenizer_options['lower'] = True
tokenizer = Tokenizer(**tokenizer_options)

# The vocabulary is fitted on the train and test comments together.
all_comments = list(comments) + list(test_comments)
tokenizer.fit_on_texts(all_comments)

# Turn each comment into a sequence of word indices, then bring every
# sequence to exactly maxlen entries.
comments_sequence = tokenizer.texts_to_sequences(comments)
test_comments_sequence = tokenizer.texts_to_sequences(test_comments)
X_t = pad_sequences(comments_sequence, maxlen=maxlen)
X_te = pad_sequences(test_comments_sequence, maxlen=maxlen)

# Location of the pre-trained GloVe vectors read in the next cell.
EMBEDDING_FILE = path + 'glove6b/glove.6B.50d.txt'

In [10]:
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    ``word`` is returned unchanged; the remaining positional arguments are
    converted to a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {word: float32 vector}. The file is opened in
# a `with` block so the handle is closed once parsing is done, and the
# encoding is set explicitly so the result does not depend on the platform's
# default locale.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in embedding_file)

# np.stack needs a real sequence; a dict_values view is rejected by newer
# NumPy releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))

# Global mean / std over every embedding value; used later to randomly
# initialise rows for words that have no pre-trained vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Persist both statistics with pickle and read them straight back.
# (The files carry a .txt suffix but contain binary pickle data.)
with open("emb_mean.txt", "wb") as fp:   # Pickling
    pickle.dump(emb_mean, fp)

with open("emb_mean.txt", "rb") as fp:   # Unpickling
    emb_mean = pickle.load(fp)

with open("emb_std.txt", "wb") as fp:   # Pickling
    pickle.dump(emb_std, fp)

with open("emb_std.txt", "rb") as fp:   # Unpickling
    emb_std = pickle.load(fp)

In [11]:
word_index = tokenizer.word_index

# Keras' Tokenizer numbers words from 1 (index 0 is never assigned to a word),
# so holding every word needs len(word_index) + 1 rows. Without the +1 a
# vocabulary smaller than max_features made the highest index fall one row
# past the end of the matrix (IndexError). When the vocabulary is larger than
# max_features the row count is max_features, exactly as before.
nb_words = min(max_features, len(word_index) + 1)

# Start from random rows drawn with the same mean / std as the pre-trained
# vectors, then overwrite the rows of words that have a pre-trained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        # Word falls outside the rows kept in the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [12]:
# Write the padded sequences and the embedding matrix to disk with pickle.
# (The files carry a .txt suffix but contain binary pickle data.)
# NOTE(review): "embed_word2vec.txt" receives the same matrix as
# "embed_glove.txt"; no word2vec matrix is built in the cells shown here,
# so confirm this is intended before anything reads that file.
pickle_targets = [
    ("X_t.txt", X_t),
    ("X_te.txt", X_te),
    ("embed_glove.txt", embedding_matrix),
    ("embed_word2vec.txt", embedding_matrix),
]
for file_name, payload in pickle_targets:
    with open(file_name, "wb") as fp:
        pickle.dump(payload, fp)

# Read the artefacts back in the same order they were written.
with open("X_t.txt", "rb") as fp:
    X_t = pickle.load(fp)

with open("X_te.txt", "rb") as fp:
    X_te = pickle.load(fp)

with open("embed_glove.txt", "rb") as fp:
    embedding_matrix_glove = pickle.load(fp)

# Loading of the word2vec matrix is currently disabled:
# with open("embed_word2vec.txt", "rb") as fp:
#     embedding_matrix_word2vec = pickle.load(fp)

In [13]:
# Hold out 25% of the training rows for evaluation; the split is shuffled
# with a fixed random_state.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_t,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=5,
)

In [14]:
# Convolution window heights (in tokens); one Conv2D branch is built per size.
filter_size = [3, 4, 5]

# Take the Embedding input dimension from the weight matrix itself so the
# layer and its initial weights can never disagree on the number of rows.
vocab_rows = embedding_matrix_glove.shape[0]

inp = Input(shape=(maxlen, ))

# Two embedding layers initialised from the same GloVe matrix: one is
# fine-tuned during training, the other stays frozen.
x1 = Embedding(vocab_rows, embed_size, weights=[embedding_matrix_glove], trainable=True)(inp)
x2 = Embedding(vocab_rows, embed_size, weights=[embedding_matrix_glove], trainable=False)(inp)

# Add a trailing axis of size 1 to each embedding output so the two can be
# concatenated along it. The target shape is built from maxlen / embed_size
# rather than the literals 100 / 50, so changing either setting no longer
# breaks this cell.
x1 = Reshape((maxlen, embed_size, 1))(x1)
x2 = Reshape((maxlen, embed_size, 1))(x2)
print(x1.shape)
x = concatenate([x1, x2])
print(x.shape)

# One branch per window size: a convolution whose kernel spans `sz` tokens
# and the full embedding width, then batch norm, ELU and global max pooling.
conv_blocks = []
for sz in filter_size:
    conv = Conv2D(number_filters, (sz, embed_size), data_format='channels_last')(x)
    batch_norm = BatchNormalization()(conv)
    activation = Activation('elu')(batch_norm)
    pooling = GlobalMaxPool2D()(activation)
    conv_blocks.append(pooling)

x = concatenate(conv_blocks)
print(x.shape)

# Six sigmoid outputs, trained with binary cross-entropy.
x = Dense(6, activation="sigmoid")(x)
print(x.shape)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs, and keep only the
# checkpoint with the best val_loss on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
save_best = ModelCheckpoint(
    'toxic.hdf',
    save_best_only=True,
    monitor='val_loss',
    mode='auto'
)

(?, 100, 50, 1)
(?, 100, 50, 2)
(?, 300)
(?, 6)


In [15]:
# Train for at most 30 epochs, validating on the held-out split after each
# epoch; the callbacks handle early stopping and checkpointing.
model.fit(
    x=X_train,
    y=y_train,
    epochs=30,
    verbose=1,
    validation_data=(X_eval, y_eval),
    callbacks=[early_stopping, save_best],
)

Train on 119678 samples, validate on 39893 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


<keras.callbacks.History at 0x7f106e1c2ef0>

In [16]:
 # Load the weights stored in the 'toxic.hdf' checkpoint file into the model.
 model.load_weights('toxic.hdf')

In [19]:
# Start from the sample submission file and overwrite its label columns.
sample_submission = pd.read_csv(path + comp + 'sample_submission.csv')

# Predict all six label columns for the padded test comments.
y_test = model.predict([X_te], batch_size=256, verbose=1)

sample_submission[list_classes] = y_test
sample_submission.to_csv('submission_textcnn.csv', index=False)

