In [1]:
import os
import re 
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from keras.utils import to_categorical, Sequence, plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping, Callback, ModelCheckpoint
from keras.layers import Embedding, Dense, Dropout, LSTM, Input, BatchNormalization, concatenate

from tensorflow import set_random_seed
from numpy.random import seed
# Seed the TensorFlow and NumPy global random number generators.
TF_SEED = 2
NUMPY_SEED = 40
set_random_seed(TF_SEED)
seed(NUMPY_SEED)

Using TensorFlow backend.


In [2]:
# Load the scaled numeric features and the raw post text. Both CSVs use
# their first column as the row index.
FEATURES_CSV = '../input/facebook-antivaccine-post-data-scaled-features/features_scaled.csv'
POSTS_CSV = '../input/facebook-antivaccination-dataset/posts_full.csv'

data = pd.read_csv(FEATURES_CSV, index_col=0)
text = pd.read_csv(POSTS_CSV, index_col=0).text

assert text.shape[0] == data.shape[0], "feature and post tables have different row counts"
# The column assignment below aligns on index labels, not on position, so
# equal length alone does not guarantee that every feature row gets its text.
assert data.index.isin(text.index).all(), "some feature rows have no matching post text"
data['text'] = text
del text
data.head()

Unnamed: 0,has_article,has_text,anti_vax,has_img,has_words,percent_periods,percent_exclamations,percent_questionms,percent_equals,percent_dollars,ttr,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound,readability_smog_index,readability_gunning_fog,readability_flesch_kincaid_grade,percent_all_caps,percent_hashtags,percent_linked_profiles,percent_links,percent_pos_basic_a,percent_pos_basic_n,percent_pos_basic_r,percent_pos_basic_v,percent_pos_CC,percent_pos_RBR,percent_pos_NNS,percent_pos_$,percent_pos_DT,percent_pos_VBG,percent_pos_PRP$,percent_pos_JJ,percent_pos_PRP,percent_pos_TO,percent_pos_POS,percent_pos_VBP,percent_pos_WRB,percent_pos_JJR,...,article_liwc_anx,article_liwc_anger,article_liwc_sad,article_liwc_cogmech,article_liwc_insight,article_liwc_cause,article_liwc_discrep,article_liwc_tentat,article_liwc_certain,article_liwc_inhib,article_liwc_incl,article_liwc_excl,article_liwc_percept,article_liwc_see,article_liwc_hear,article_liwc_feel,article_liwc_bio,article_liwc_body,article_liwc_health,article_liwc_sexual,article_liwc_ingest,article_liwc_relativ,article_liwc_motion,article_liwc_space,article_liwc_time,article_liwc_work,article_liwc_achiev,article_liwc_leisure,article_liwc_home,article_liwc_money,article_liwc_relig,article_liwc_death,article_liwc_assent,article_liwc_nonflu,article_liwc_filler,article_host_other,article_host_nan,article_domain_other,article_domain_nan,text
0,True,True,False,True,True,-0.089595,-0.368165,0.516354,-0.027634,-0.054159,-0.425214,0.536715,0.707525,-0.742552,-1.342403,-0.599327,0.240727,0.077457,0.421801,-0.209067,-0.23411,-0.298633,-0.373173,0.961364,-0.635626,-0.596778,-0.166546,-0.112293,-0.357248,-0.057156,0.778561,-0.153107,-0.189452,-0.339758,-0.321931,-0.119083,-0.009031,-0.81486,-0.183444,-0.140413,...,-0.156375,-0.252448,-0.183084,-0.279036,-0.367002,0.96948,-0.264614,-0.257025,-0.232941,-0.263383,-0.484553,-0.262612,-0.256238,-0.131283,-0.167426,-0.130015,-0.522068,-0.186768,-0.45706,-0.198116,-0.14393,-0.181714,1.690294,-0.646275,-0.518818,-0.427161,-0.344858,1.514424,-0.15154,-0.207483,-0.099765,-0.200243,4.496053,-0.049434,-0.082355,1,0,1,0,The latest conspiracy theory is that MMR vacci...
1,True,True,False,True,True,-0.714421,-0.368165,-0.306288,-0.027634,-0.054159,0.738363,-0.529333,1.023221,-0.742552,-0.316669,-0.599327,0.319617,0.314652,0.921814,-0.209067,-0.038374,-0.298633,-0.570571,0.825182,-0.635626,-0.185341,-0.166546,-0.112293,-0.357248,-0.057156,1.590646,-0.153107,-0.189452,-0.538453,-0.321931,-0.119083,-0.009031,-0.81486,-0.183444,-0.140413,...,-0.156375,-0.252448,-0.183084,-0.954794,-0.367002,-0.440704,-0.264614,-0.257025,-0.232941,-0.263383,-0.484553,-0.262612,-0.256238,-0.131283,-0.167426,-0.130015,0.850653,-0.186768,1.146593,-0.198116,-0.14393,1.076027,-0.299688,0.789269,1.025288,-0.427161,-0.344858,-0.258747,-0.15154,-0.207483,-0.099765,-0.200243,-0.099234,-0.049434,-0.082355,1,0,1,0,The New Vaccine Surveillance Network Report on...
2,True,True,False,False,True,-0.714421,-0.368165,-0.306288,-0.027634,-0.054159,-0.92389,-0.025922,0.874142,-0.742552,-0.467014,-0.599327,0.830079,0.773229,-0.411554,0.457415,-0.317997,-0.298633,-0.711569,0.72791,-0.635626,0.108543,-0.166546,-0.112293,0.378591,-0.057156,-0.439566,-0.153107,-0.189452,-0.680378,-0.321931,-0.119083,-0.009031,1.302772,-0.183444,-0.140413,...,-0.156375,-0.252448,-0.183084,-0.324086,-0.367002,-0.440704,-0.264614,2.042871,-0.232941,-0.263383,-0.484553,-0.262612,-0.256238,-0.131283,-0.167426,-0.130015,-0.522068,-0.186768,-0.45706,-0.198116,-0.14393,-0.880458,-0.299688,-0.646275,-0.518818,-0.427161,-0.344858,-0.258747,-0.15154,-0.207483,-0.099765,-0.200243,-0.099234,-0.049434,-0.082355,1,0,1,0,"Someone with in Santa Clara County, #Californ..."
3,True,True,False,True,True,-0.126349,0.144675,-0.306288,-0.027634,-0.054159,-1.339453,-0.529333,1.023221,-0.742552,-0.316669,-0.599327,-0.399671,0.172335,-0.411554,-0.209067,-0.317997,-0.298633,0.557414,-0.147544,-0.036662,-0.332283,-0.166546,-0.112293,0.378591,-0.057156,-0.439566,-0.153107,-0.189452,0.596946,-0.321931,-0.119083,-0.009031,-0.81486,-0.183444,-0.140413,...,-0.156375,-0.252448,-0.183084,0.396722,-0.367002,-0.440704,-0.264614,-0.257025,-0.232941,-0.263383,-0.484553,-0.262612,-0.256238,-0.131283,-0.167426,-0.130015,-0.522068,-0.186768,-0.45706,-0.198116,-0.14393,1.91452,-0.299688,1.404502,1.687048,-0.427161,-0.344858,-0.258747,-0.15154,-0.207483,-0.099765,-0.200243,-0.099234,-0.049434,-0.082355,1,0,1,0,"There are 33 new measles cases in Brooklyn, br..."
4,True,True,False,True,True,0.194417,-0.368165,0.890281,-0.027634,-0.054159,0.738363,1.040127,0.558447,-0.742552,-1.342403,-0.599327,-0.14444,0.030018,0.194522,-0.209067,-0.213139,-0.298633,0.345917,-0.439362,-0.635626,0.769782,-0.166546,-0.112293,-0.357248,-0.057156,-0.439566,-0.153107,-0.189452,-0.254603,1.346112,-0.119083,-0.009031,0.243956,-0.183444,4.877717,...,-0.156375,-0.252448,-0.183084,-0.954794,-0.367002,-0.440704,-0.264614,-0.257025,-0.232941,-0.263383,-0.484553,-0.262612,-0.256238,-0.131283,-0.167426,-0.130015,-0.522068,-0.186768,-0.45706,-0.198116,-0.14393,-0.880458,-0.299688,-0.646275,-0.518818,-0.427161,-0.344858,3.878652,-0.15154,-0.207483,-0.099765,-0.200243,-0.099234,-0.049434,-0.082355,1,0,1,0,It took less a few minutes to debunk the lates...


## Prepare Data

In [3]:
# Punctuation and whitespace characters that are replaced by a space.
# '#', '@' and the apostrophe are not in this set, so they are kept.
unwanted = set('"$%&()*+,.!?-/:;<=>[\\]^_`{|}~\t\n')
# Translation table mapping every unwanted character to a single space.
_unwanted_to_space = str.maketrans({ch: " " for ch in unwanted})

def filter_unwanted(x):
    """Return ``x`` with unwanted characters blanked, lower-cased, ASCII only.

    Each character found in ``unwanted`` is replaced by one space (so the
    string length is preserved up to this point), the result is lower-cased,
    and finally every non-ASCII character is dropped.
    """
    spaced = x.translate(_unwanted_to_space)
    lowered = spaced.lower()
    # Encoding to UTF-8 and decoding as ASCII with errors ignored discards
    # every non-ASCII character.
    return lowered.encode("utf8").decode("ascii", 'ignore')
# Clean every post; missing text is turned into an empty string first.
cleaned_text = data['text'].fillna('').apply(filter_unwanted)
data['text'] = cleaned_text.tolist()
data['text'].head()

0    the latest conspiracy theory is that mmr vacci...
1    the new vaccine surveillance network report on...
2    someone with  in santa clara county  #californ...
3    there are 33 new measles cases in brooklyn  br...
4    it took less a few minutes to debunk the lates...
Name: text, dtype: object

In [4]:
#Add n-gram input sequences
NUM_WORDS = 50_000
MAX_SEQUENCE_LENGTH = 200

tokenizer = Tokenizer(num_words=NUM_WORDS, filters='!"$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n', 
                      lower=True)
tokenizer.fit_on_texts(data.text)

X1 = tokenizer.texts_to_sequences(data.text)
X1 = pad_sequences(X1, maxlen=MAX_SEQUENCE_LENGTH)
X1[-40:], data.text[0]

(array([[   0,    0,    0, ...,    0,    0,    0],
        [   0,    0,    0, ...,    2,  700,  456],
        [   0,    0,    0, ...,   23,   73, 6345],
        ...,
        [   0,    0,    0, ...,   34,  192,   24],
        [   0,    0,    0, ...,    0,    0,    0],
        [   0,    0,    0, ...,   25,   17,  290]], dtype=int32),
 'the latest conspiracy theory is that mmr vaccine is expiring  so we are trying to get rid of it  guess the expiration date on the mmr we just got ')

In [22]:
# Numeric side-features: every column except the label and the raw text.
# The frame mixes bool, int and float columns, so a bare `.values` would
# give an object-dtype array; cast to float32 for a homogeneous matrix.
X2 = data.drop(['anti_vax', 'text'], axis=1).astype('float32').values

In [23]:
# Target labels: the anti_vax column as a NumPy array.
y = data['anti_vax'].values

In [24]:
# Hold out 15% of the rows for evaluation; the three arrays are split with
# the same shuffled indices so sequences, features and labels stay aligned.
(X1_train, X1_eval,
 X2_train, X2_eval,
 y_train, y_eval) = train_test_split(X1, X2, y, test_size=0.15, random_state=3000)

## Build Model

In [29]:
# Two-input, two-output network: an LSTM branch over the padded token
# sequences is concatenated with the numeric post features for the main
# prediction, and an auxiliary sigmoid head is attached to the LSTM branch
# alone.
input1 = Input(shape=(X1.shape[1],), name="Text")
input2 = Input(shape=(X2.shape[1],), name="Text_Features")

#RNN of Text data
# Token ids -> 10-dimensional embeddings -> two stacked 64-unit LSTMs.
text_branch = Embedding(NUM_WORDS, 10)(input1)
# return_sequences=True keeps the per-timestep outputs that the second LSTM consumes.
text_branch = LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(text_branch)
text_branch = Dropout(rate=0.2)(text_branch)
text_branch = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(text_branch)
# Auxiliary head: a single sigmoid unit fed by the text branch only.
aux_output = Dense(1, activation='sigmoid', name='Aux_Output')(text_branch)

#Text Features
# The numeric features pass through batch normalization only.
text_feat = BatchNormalization()(input2)

#Join branches
# Main head: text representation and normalized features, one hidden layer.
x = concatenate([text_branch, text_feat])
main_branch = Dense(80, activation='relu')(x)
main_output = Dense(1, activation='sigmoid', name="Main_Output")(main_branch)

#Model
# Output order is [main, aux]; target lists passed to fit() must follow it.
model = Model(inputs=[input1, input2], outputs=[main_output, aux_output])
# Both heads use binary cross-entropy; the auxiliary loss is weighted 0.2
# against 1.0 for the main loss.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], 
              loss_weights={'Main_Output': 1.0, 'Aux_Output':0.2})
# Writes the architecture diagram to model.png.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=False)

<h1 style="text-align:center">Final Model</h1>
<img src="model.png" width="700">

## Train

In [27]:
# Both callbacks watch the validation loss of the main output head.
MONITORED_LOSS = 'val_Main_Output_loss'

checkpoint = ModelCheckpoint(
    "model-{epoch:02d}-{val_Main_Output_loss:.2f}.hdf5",
    monitor=MONITORED_LOSS,
    verbose=0,
    save_best_only=True,
    period=1,
)
stopping = EarlyStopping(monitor=MONITORED_LOSS, patience=5)

train_inputs = {'Text': X1_train, 'Text_Features': X2_train}
train_targets = [y_train, y_train]  # same labels for the main and auxiliary heads
eval_inputs = [X1_eval, X2_eval]
eval_targets = [y_eval, y_eval]

history = model.fit(
    train_inputs,
    train_targets,
    epochs=20,
    verbose=2,
    batch_size=32,
    validation_data=(eval_inputs, eval_targets),
    callbacks=[checkpoint, stopping],
)

Instructions for updating:
Use tf.cast instead.
Train on 67400 samples, validate on 22467 samples
Epoch 1/20


KeyboardInterrupt: 

## Comparing Loss per Epoch

In [None]:
def plot_epochs(results, col, ylim=(0, 1), **kwargs):
    """Plot one training-history column against epoch for one or more runs.

    Parameters
    ----------
    results : iterable of (name, history) pairs
        ``name`` becomes the legend entry; ``history`` must support
        ``history[col]`` (the notebook passes a DataFrame built from
        ``History.history``).
    col : str
        Column to plot; also used for the title and the y-axis label.
    ylim : tuple or None, optional
        ``(bottom, top)`` limits for the y-axis. Defaults to ``(0, 1)``;
        pass ``None`` to keep matplotlib's automatic limits.
    **kwargs
        Forwarded unchanged to ``ax.plot`` for every series.
    """
    # Materialise once: the pairs are needed both for plotting and for the
    # legend, and a generator would otherwise be exhausted after one pass.
    results = list(results)
    names = [name for name, _ in results]

    fig, ax = plt.subplots(figsize=(21, 10))
    for _, hist in results:
        ax.plot(hist[col], **kwargs)

    # Axis decoration does not depend on the series, so apply it once.
    ax.set_title(col + ' per epoch')
    ax.set_ylabel(col)
    ax.set_xlabel('epoch')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.yaxis.grid(True, alpha=0.3)
    if names:
        ax.legend(labels=names)
    if ylim is not None:
        ax.set_ylim(*ylim)
# Validation loss of the main output head, per epoch.
main_history_df = pd.DataFrame(history.history)
plot_epochs([('Model', main_history_df)], 'val_Main_Output_loss')

In [None]:
# Validation loss of the auxiliary (text-only) output head, per epoch.
aux_history_df = pd.DataFrame(history.history)
plot_epochs([('Model', aux_history_df)], 'val_Aux_Output_loss')