In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Conv2D, MaxPool2D, Conv1D, MaxPool1D, GlobalAveragePooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

Using TensorFlow backend.


Read data known to be fake

In [2]:
# Load the CSV that the notebook treats as the known-fake records.
data1 = pd.read_csv(
    filepath_or_buffer="Political_Cleaned - Political_Cleaned.csv",
)

Read data known to be genuine

In [4]:
# Load the CSV that the notebook treats as the known-genuine records.
data2 = pd.read_csv(
    filepath_or_buffer="genuine - election_clean_withouturlandemoticons.csv",
)

Resample if necessary: set `frac` to 1 to use the whole dataset, or to a smaller fraction to work on a subset.

In [5]:
# frac is the share of each dataset to keep (1 keeps every row).
# A fixed random_state makes the selected rows and their order reproducible
# after a kernel restart; without it every run draws a different sample.
data1Sample = data1.sample(frac = 1, random_state = 42)
data2Sample = data2.sample(frac = 1, random_state = 42)

Combine data

In [8]:
# Stack the fake and genuine samples into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and no longer exists
# in pandas 2.x; the ignore_index / sort arguments carry over unchanged.
dataFull = pd.concat([data1Sample, data2Sample], ignore_index=True, sort=False)

In [9]:
# Summarise the combined frame: entry count, per-column non-null counts and dtypes.
dataFull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148497 entries, 0 to 148496
Data columns (total 3 columns):
content    148493 non-null object
label      148497 non-null int64
count      148497 non-null int64
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


Extract the text and labels, encode the labels as integers, and reshape them into a column vector.

In [10]:
# dataFull.info() reports 4 rows whose `content` is null. Casting those straight
# to str would turn them into the literal text "nan", which the tokenizer would
# then treat as a real word; map missing content to an empty string instead.
X = dataFull.content.fillna("").astype(str)

# Encode the labels as consecutive integers and shape them as an (n, 1) column.
le = LabelEncoder()
Y = le.fit_transform(dataFull.label).reshape(-1, 1)

Creating the test train split.

In [121]:
# Hold out 15% of the rows for testing. random_state pins the shuffle so the
# split, and therefore every metric computed from it, is reproducible on re-run.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15,random_state=42)

# Plain Python lists of the raw texts (same element order as the split outputs).
x_train = list(X_train)
x_test = list(X_test)

#### Tokenising
The vocabulary size is fixed at 10000 words.<br>
The most frequent words are used to build a dictionary which is used to encode each sentence.<br>

In [127]:
# Vocabulary cap and fixed sequence length shared by the tokenizer and the models.
max_words = 10000
max_len = 150

# Build the word index from the training texts only.
tok = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)
tok.fit_on_texts(x_train)

# Encode every training text as a list of word ids, then bring all rows to
# exactly max_len entries.
sequences = tok.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(
    sequences,
    maxlen=max_len,
)

View dictionary and indices

In [130]:
# Preview only the 80 lowest-indexed entries of the index -> word mapping;
# displaying the complete dictionary floods the cell output.
dict(sorted(tok.index_word.items())[:80])

{1: 'the',
 2: 'to',
 3: 'in',
 4: 'a',
 5: 'of',
 6: 'for',
 7: 'you',
 8: 'and',
 9: 'is',
 10: 'electionday',
 11: 'rt',
 12: 'on',
 13: 'trump',
 14: 'i',
 15: 'debate',
 16: 'this',
 17: 'be',
 18: 'vote',
 19: 'are',
 20: 'at',
 21: 'your',
 22: 'it',
 23: 'will',
 24: 'with',
 25: 'that',
 26: 'debatenight',
 27: 'not',
 28: 'we',
 29: 'hillary',
 30: 'have',
 31: 'all',
 32: 'about',
 33: 'out',
 34: 'if',
 35: 'what',
 36: 'from',
 37: 'as',
 38: 'my',
 39: 'up',
 40: 'get',
 41: 'who',
 42: 'do',
 43: 'by',
 44: 'tonight',
 45: 'debates',
 46: 'clinton',
 47: 'so',
 48: 'us',
 49: 'today',
 50: 'people',
 51: 'just',
 52: "it's",
 53: 'her',
 54: 'news',
 55: 'debates2016',
 56: 'no',
 57: "don't",
 58: 'our',
 59: 'america',
 60: 'he',
 61: 'how',
 62: 'has',
 63: 'but',
 64: 'like',
 65: 'can',
 66: 'go',
 67: 'they',
 68: 'day',
 69: 'u',
 70: 'or',
 71: 'one',
 72: 'was',
 73: 'make',
 74: 'she',
 75: 'his',
 76: 'more',
 77: 'after',
 78: 'now',
 79: 'new',
 80: 'time',


Simple LSTM

Simple LSTM model

In [154]:
def SimpleLSTM():
    """Build the uncompiled LSTM classifier.

    Reads the notebook-level ``max_len`` (input length) and ``max_words``
    (embedding vocabulary size).

    Layer stack: Embedding(125) -> LSTM(64) -> Dense(256) + ReLU ->
    Dropout(0.5) -> Dense(1) + sigmoid.

    Returns:
        A Keras ``Model`` taking inputs of shape ``[max_len]`` and producing a
        single sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])

    embedded = Embedding(max_words, 125, input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)

    # Fully connected head on top of the final LSTM state.
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single-unit sigmoid output.
    score = Dense(1, name='out_layer')(hidden)
    score = Activation('sigmoid')(score)

    return Model(inputs=token_ids, outputs=score)

In [155]:
# Instantiate the LSTM classifier, print its layer table, then compile it with
# binary cross-entropy loss, an RMSprop optimizer and accuracy as the metric.
model = SimpleLSTM()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 150, 125)          1250000   
_________________________________________________________________
lstm_10 (LSTM)               (None, 64)                48640     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_17 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [156]:
# Train for at most 10 epochs in batches of 128, with 20% of the training
# matrix used for validation. EarlyStopping monitors val_loss with
# min_delta=0.0001.
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)],
)

Train on 100977 samples, validate on 25245 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x7f9a9c6d2908>

In [157]:
# Encode the held-out texts with the tokenizer fitted on the training texts,
# then bring every row to max_len entries, as was done for the training matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [158]:
# Score the trained model on the held-out matrix; the result is indexed later
# as accr[0] (loss) and accr[1] (accuracy).
accr = model.evaluate(
    x=test_sequences_matrix,
    y=Y_test,
)



In [159]:
# Report the first two evaluation values (loss, then accuracy) to 3 decimals.
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))

Test set
  Loss: 0.217
  Accuracy: 0.908


Simple CNN model

In [None]:
def CNN():
    """Build the uncompiled 1-D convolutional classifier.

    Reads the notebook-level ``max_len`` (input length) and ``max_words``
    (embedding vocabulary size).

    Layer stack: Embedding(125) -> 2 x Conv1D(64, 3, relu) -> MaxPool1D(3) ->
    2 x Conv1D(128, 3, relu) -> GlobalAveragePooling1D -> Dropout(0.5) ->
    Dense(1) + sigmoid.

    Returns:
        A Keras ``Model`` taking inputs of shape ``[max_len]`` and producing a
        single sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    embedded = Embedding(max_words, 125, input_length=max_len)(token_ids)

    # First convolutional stage: two 64-filter layers followed by max pooling.
    features = Conv1D(64, 3, activation='relu')(embedded)
    features = Conv1D(64, 3, activation='relu')(features)
    features = MaxPool1D(pool_size=3)(features)

    # Second convolutional stage: two 128-filter layers, averaged over time.
    features = Conv1D(128, 3, activation='relu')(features)
    features = Conv1D(128, 3, activation='relu')(features)
    pooled = GlobalAveragePooling1D()(features)
    pooled = Dropout(0.5)(pooled)

    # Single-unit sigmoid output.
    score = Dense(1, name='out_layer')(pooled)
    score = Activation('sigmoid')(score)

    return Model(inputs=token_ids, outputs=score)

In [None]:
# Instantiate the CNN classifier, print its layer table, then compile it with
# binary cross-entropy loss, the 'rmsprop' optimizer and accuracy as the metric.
model = CNN()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 150, 125)          1250000   
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 148, 64)           24064     
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 146, 64)           12352     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 48, 64)            0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 46, 128)           24704     
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 44, 128)           49280     
__________

In [None]:
# Train for at most 10 epochs in batches of 128, with 20% of the training
# matrix used for validation. EarlyStopping monitors val_loss with
# min_delta=0.0001.
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)],
)

Train on 100977 samples, validate on 25245 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x7f9a9c814d68>

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training texts,
# then bring every row to max_len entries, as was done for the training matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [None]:
# Score the trained model on the held-out matrix; the result is indexed later
# as accr[0] (loss) and accr[1] (accuracy).
accr = model.evaluate(
    x=test_sequences_matrix,
    y=Y_test,
)



In [None]:
# Report the first two evaluation values (loss, then accuracy) to 3 decimals.
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))

Test set
  Loss: 0.303
  Accuracy: 0.867


Hybrid CNN + LSTM model

In [None]:
def CLSTM():
    """Build the uncompiled hybrid convolution + LSTM classifier.

    Reads the notebook-level ``max_len`` (input length) and ``max_words``
    (embedding vocabulary size).

    Layer stack: Embedding(125) -> 2 x Conv1D(64, 3, relu) -> MaxPool1D(3) ->
    LSTM(64) -> Dense(256) + ReLU -> Dropout(0.5) -> Dense(1) + sigmoid.

    Returns:
        A Keras ``Model`` taking inputs of shape ``[max_len]`` and producing a
        single sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    embedded = Embedding(max_words, 125, input_length=max_len)(token_ids)

    # Convolutional front end: two 64-filter layers followed by max pooling.
    features = Conv1D(64, 3, activation='relu')(embedded)
    features = Conv1D(64, 3, activation='relu')(features)
    features = MaxPool1D(pool_size=3)(features)

    # Recurrent layer over the pooled feature sequence.
    encoded = LSTM(64)(features)

    # Fully connected head.
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single-unit sigmoid output.
    score = Dense(1, name='out_layer')(hidden)
    score = Activation('sigmoid')(score)

    return Model(inputs=token_ids, outputs=score)

In [None]:
# Instantiate the hybrid classifier, print its layer table, then compile it with
# binary cross-entropy loss, the 'rmsprop' optimizer and accuracy as the metric.
model = CLSTM()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 150, 125)          1250000   
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 148, 64)           24064     
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 146, 64)           12352     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 48, 64)            0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 64)                33024     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
__________

In [None]:
# Train for at most 10 epochs in batches of 128, with 20% of the training
# matrix used for validation. EarlyStopping monitors val_loss with
# min_delta=0.0001.
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)],
)

Train on 100977 samples, validate on 25245 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training texts,
# then bring every row to max_len entries, as was done for the training matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [None]:
# Score the trained model on the held-out matrix; the result is indexed later
# as accr[0] (loss) and accr[1] (accuracy).
accr = model.evaluate(
    x=test_sequences_matrix,
    y=Y_test,
)

In [None]:
# Report the first two evaluation values (loss, then accuracy) to 3 decimals.
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))