In [1]:
import os
import re 
import math
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from keras.utils import to_categorical, Sequence, plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping, Callback, ModelCheckpoint
from keras.layers import Embedding, Dense, Dropout, LSTM, Input, BatchNormalization, concatenate

from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(40)

Using TensorFlow backend.


In [2]:
# Load the Facebook posts, discard rows with no text, and keep only the
# text and its anti-vax label.
data = (
    pd.read_csv('../input/facebook-antivaccination-dataset/posts_full.csv',
                index_col=0)
    .dropna(subset=['text'])
    .loc[:, ['text', 'anti_vax']]
)
data.head()

Unnamed: 0,text,anti_vax
0,The latest conspiracy theory is that MMR vacci...,False
1,The New Vaccine Surveillance Network Report on...,False
2,"Someone with in Santa Clara County, #Californ...",False
3,"There are 33 new measles cases in Brooklyn, br...",False
4,It took less a few minutes to debunk the lates...,False


In [3]:
# Load the Discord chat export and reshape it to the same (text, anti_vax)
# layout as the posts; every chat message is labelled anti_vax=False.
discord = (
    pd.read_csv('../input/game-of-cones/Game of Cones - battle-for-the-cone 231969664027066368.csv',
                sep=';')
    .drop(columns='Unnamed: 4')
    .dropna(subset=['Content'])
    .assign(anti_vax=False)
    .loc[:, ['Content', 'anti_vax']]
    .rename(columns={'Content': 'text'})
)
discord.head()

Unnamed: 0,text,anti_vax
0,NO DND,False
1,NO MAPLESTORY,False
2,https://www.pathofexile.com/ascendancy/classes,False
3,No gurlzz,False
4,http://boards.4chan.org/pol/thread/103070474,False


## Prepare Data

In [4]:
# Stack posts and chat messages into one frame with a fresh 0..n-1 index.
# NOTE: `data` is rebound here, so re-running this cell alone appends
# `discord` a second time; re-run from the loading cells instead.
data = pd.concat([data, discord], ignore_index=True)
data.shape

(131303, 2)

## Build Tokenizer

In [5]:
# Punctuation and whitespace control characters that are blanked out of the
# text (the apostrophe is deliberately not in this list).
FILTER_STRING = '"$%&()*+,.!?-/:;<=>[\\]@#^_`{|}~\t\n'
UNWANTED = set(FILTER_STRING)
# Translation table mapping every unwanted character to a single space.
_TO_SPACE = str.maketrans(FILTER_STRING, " " * len(FILTER_STRING))
def filter_unwanted(x):
    """Normalise one text string for tokenisation.

    Each character of FILTER_STRING is replaced by a space, the result is
    lower-cased, and finally every non-ASCII character is dropped.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        Lower-case, ASCII-only text with unwanted characters blanked.
    """
    blanked = x.translate(_TO_SPACE)
    lowered = blanked.lower()
    # Round-trip through bytes: non-ASCII code points become bytes >= 0x80
    # in UTF-8, which the ASCII decoder silently discards.
    return lowered.encode("utf8").decode("ascii", 'ignore')

In [6]:
# Clean every message in place, then peek at the last few rows.
data['text'] = data['text'].map(filter_unwanted)
data.text.tail()

131298                                      w open gspektjq
131299    nice  michael  you opened the j list box and g...
131300                                         w dailygacha
131301                            w claim sachiko shinozaki
131302    nice  itsthtguy  you claimed    sachiko shinoz...
Name: text, dtype: object

In [7]:
# Fit a word-level tokenizer on the cleaned text and persist it to disk
NUM_WORDS = 50_000          # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 200   # sequence length used when padding later on

tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters=FILTER_STRING,
    lower=True,
)
tokenizer.fit_on_texts(data['text'])

# Save the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pickle', 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

## Build Training Sets

In [8]:
# Encode each text as a list of word indices and pad to a fixed length;
# then show the last 40 positions of the first row next to its source text.
X = pad_sequences(tokenizer.texts_to_sequences(data['text']),
                  maxlen=MAX_SEQUENCE_LENGTH)
X[0, -40:], data['text'].head(1)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     1,  1047,  3092,  1787,     7,    10,   262,
           16,     7, 18223,    44,    21,    13,   503,     2,    66,
         3627,     4,    17,  1308,     1, 19517,   885,    15,     1,
          262,    21,    64,   259], dtype=int32),
 0    the latest conspiracy theory is that mmr vacci...
 Name: text, dtype: object)

In [9]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    data['anti_vax'].values,
    test_size=0.2,
    random_state=3000,
)

## Build Model

In [11]:
# Embedding -> two stacked LSTMs (dropout in between) -> sigmoid unit that
# outputs the anti-vax probability.
model = Sequential([
    Embedding(NUM_WORDS, 10, input_length=(X.shape[1])),
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    Dropout(rate=0.3),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Render the architecture to model.png (shown in the markdown cell below).
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=False)

<h1 style="text-align:center">Final Model</h1>
<img src="model.png" width="200">

## Train

In [13]:
# Write a checkpoint whenever validation loss improves, and stop training
# once it has failed to improve for two consecutive epochs.
checkpoint = ModelCheckpoint(
    "model-{epoch:02d}-{val_loss:.2f}.hdf5",
    monitor='val_loss',
    save_best_only=True,
    period=1,
    verbose=0,
)
stopping = EarlyStopping(monitor='val_loss', patience=2)
history = model.fit(
    X_train, y_train,
    validation_data=(X_eval, y_eval),
    batch_size=32,
    epochs=20,
    verbose=2,
    callbacks=[checkpoint, stopping],
)

Train on 105042 samples, validate on 26261 samples
Epoch 1/1

KeyboardInterrupt: 

## Comparing Loss per Epoch

In [None]:
def plot_epochs(results, col, ylim=(0, 1), **kwargs):
    """Plot one training-history column against epoch for one or more runs.

    Parameters
    ----------
    results : iterable of (name, history) pairs
        ``name`` becomes the legend label of the run. ``history`` is indexed
        with ``col`` and the result is handed to ``Axes.plot`` (the notebook
        passes ``pd.DataFrame(history.history)``).
    col : str
        Column/key to plot, e.g. ``'val_loss'``.
    ylim : (bottom, top) or None, optional
        Y-axis limits. Defaults to ``(0, 1)``; pass ``None`` to let
        matplotlib autoscale (useful when the metric can exceed 1).
    **kwargs
        Forwarded to ``Axes.plot`` for every run. A ``label`` given here is
        overridden by the run name.

    Raises
    ------
    KeyError
        If ``col`` is missing from any history; the message lists the
        columns that are available.
    """
    # Materialise once so generators and other one-shot iterables work.
    results = list(results)

    # Validate before creating the figure so a bad column name does not
    # leave an empty plot behind.
    for name, hist in results:
        if col not in hist:
            raise KeyError(
                "{!r} not found in history {!r}; available: {}".format(
                    col, name, list(hist)))

    fig, ax = plt.subplots(figsize=(21, 10))
    for name, hist in results:
        # Label each line with its own run name so the legend stays in sync
        # with what is actually drawn.
        ax.plot(hist[col], **dict(kwargs, label=name))

    ax.set_title(col + ' per epoch')
    ax.set_ylabel(col)
    ax.set_xlabel('epoch')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.yaxis.grid(True, alpha=0.3)
    if results:
        ax.legend()
    if ylim is not None:
        ax.set_ylim(*ylim)
# The model has a single unnamed output, so Keras records its validation
# loss under the key 'val_loss' (there is no 'val_Main_Output_loss').
plot_epochs([('Model', pd.DataFrame(history.history))], 'val_loss')

In [None]:
# There is no auxiliary output on this model, so 'val_Aux_Output_loss' is
# never recorded; plot the training loss for comparison with val_loss.
plot_epochs([('Model', pd.DataFrame(history.history))], 'loss')