In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Conv2D, MaxPool2D, Conv1D, MaxPool1D, GlobalAveragePooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
%matplotlib inline

Read data known to be fake

In [2]:
data1 = pd.read_csv("political.csv")

Read data known to be genuine

In [3]:
data2 = pd.read_csv("genuine.csv")

Resampling if necessary, set to 1 to use whole data

In [4]:
data1Sample = data1.sample(frac = 1)
data2Sample = data2.sample(frac = 1)

Combine data

In [5]:
dataFull = data1Sample.append(data2Sample, ignore_index=True, sort=False)

In [6]:
dataFull.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148497 entries, 0 to 148496
Data columns (total 3 columns):
content    148493 non-null object
label      148497 non-null int64
count      148497 non-null int64
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


reshaping

In [7]:
X = dataFull.content.astype(str)
Y = dataFull.label
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

Creating the test train split.

In [8]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)
x_train = []
for each in X_train:
    x_train.append(each)
x_test = []
for each in X_test:
    x_test.append(each)

#### Tokenising
The vocabulary size is fixed at 10000 words.<br>
The most frequent words are used to build a dictionary which is used to encode each sentence.<br>

In [11]:
max_words = 15000
max_len = 150
tok = Tokenizer(max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
tok.fit_on_texts(x_train)
sequences = tok.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [12]:
for each in tok.word_index:
    if tok.word_index[each]==29:
        print(each)

hillary


View dictionary and indices

Simple LSTM

Simple LSTM model

In [13]:
def SimpleLSTM():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words, 125,input_length=max_len)(inputs)
    layer = LSTM(64) (layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [14]:
model = SimpleLSTM()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 125)          1875000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                48640     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [15]:
plot_model(model, to_file='lstm.png', show_shapes=True)

In [16]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 100977 samples, validate on 25245 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x7fc13dba6198>

In [17]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [44]:
y_dash = model.predict(test_sequences_matrix)
y_dash = np.where(y_dash > 0.5, 1, 0)
print(classification_report(Y_test, y_dash))

[ 7660   948   990 12677]


Simple CNN model

In [45]:
def CNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words, 125,input_length=max_len)(inputs)
    layer = Conv1D(64, 3, activation='relu') (layer)
    layer = Conv1D(64, 3, activation='relu')(layer)
    layer = MaxPool1D(pool_size=3) (layer)
    layer = Conv1D(128, 3, activation='relu') (layer)
    layer = Conv1D(128, 3, activation='relu') (layer)
    layer = GlobalAveragePooling1D() (layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [46]:
model = CNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 125)          1875000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 148, 64)           24064     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 146, 64)           12352     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 48, 64)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 46, 128)           24704     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 44, 128)           49280     
__________

In [47]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 100977 samples, validate on 25245 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x7fc1109dba58>

In [48]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [50]:
y_dash = model.predict(test_sequences_matrix)
y_dash = np.where(y_dash > 0.5, 1, 0)
print(classification_report(Y_test, y_dash))

             precision    recall  f1-score   support

          0       0.89      0.78      0.83      8608
          1       0.87      0.94      0.90     13667

avg / total       0.88      0.88      0.88     22275



Hybrid structure

In [51]:
def CLSTM():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words, 125,input_length=max_len)(inputs)
    layer = Conv1D(64, 3, activation='relu') (layer)
    layer = Conv1D(64, 3, activation='relu')(layer)
    layer = MaxPool1D(pool_size=3) (layer)
    layer = LSTM(64) (layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [52]:
model = CLSTM()
model.summary()
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 150, 125)          1875000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 148, 64)           24064     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 146, 64)           12352     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 48, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
__________

In [53]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 100977 samples, validate on 25245 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x7fc11011d748>

In [54]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [56]:
y_dash = model.predict(test_sequences_matrix)
y_dash = np.where(y_dash > 0.5, 1, 0)
print(classification_report(Y_test, y_dash))

             precision    recall  f1-score   support

          0       0.86      0.81      0.84      8608
          1       0.89      0.92      0.90     13667

avg / total       0.88      0.88      0.88     22275

