# Fake news detection

**Authors:** Peter Mačinec, Simona Miková

## Model construction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import sys
sys.path.append('..')

### Reading data

In [2]:
df = pd.read_csv('../data/preprocessed/dataset.csv', index_col=0)

In [3]:
df = df.sample(300)

In [4]:
df.head()

Unnamed: 0,body,label
349438,if you ve ever stepped outside for a walk or s...,unreliable
331776,by paul fassa no other food additive has had m...,unreliable
243778,natural news there are lots of reasons you mi...,unreliable
280462,summary genetic mutations which lead to abnorm...,reliable
321552,there s a new technology in the works that may...,reliable


In [5]:
df['label_encoded'] = df['label'].apply(lambda label: 1 if label == 'unreliable' else 0)
labels = np.asarray(df['label_encoded'])

Shape of labels

In [6]:
labels.shape

(300,)

### Embeddings preprocessing

In [7]:
from src.model.embeddings import get_sequences_and_word_index_table
from sklearn.model_selection import train_test_split

In [8]:
max_words = 20000

In [9]:
%%time
sequences, word_index = get_sequences_and_word_index_table(df['body'], max_words)

CPU times: user 489 ms, sys: 8.05 ms, total: 497 ms
Wall time: 495 ms


Unique tokens

In [10]:
len(word_index)

19401

Shape of input sequences

In [11]:
sequences.shape

(300, 5872)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.10, random_state=1)

### Loading fastText model

In [13]:
from src.model.fasttext import read_fasttext_model

In [14]:
%%time
fasttext = read_fasttext_model('../models/fasttext/wiki-news-300d-1M.vec')

CPU times: user 2min 3s, sys: 2.38 s, total: 2min 5s
Wall time: 2min 5s


In [15]:
len(fasttext), len(fasttext['work'])

(999995, 300)

In [16]:
embeddings_dim = 300

In [17]:
fasttext['work']

array([-0.072 ,  0.013 , -0.0777, -0.0491, -0.085 , -0.0337, -0.0243,
        0.0472, -0.0575, -0.0356,  0.0931, -0.0579, -0.0667, -0.0612,
       -0.0634, -0.052 ,  0.1209, -0.0841,  0.0455,  0.0819, -0.0535,
        0.075 ,  0.0889,  0.1769,  0.0675, -0.0347, -0.0773, -0.0488,
       -0.0569,  0.0227,  0.0035,  0.0395,  0.0183,  0.0542, -0.0188,
        0.0203, -0.0252, -0.0498,  0.0278,  0.0521,  0.0046, -0.0992,
        0.0415, -0.034 ,  0.0507, -0.0419, -0.0607,  0.0656,  0.0076,
        0.0696, -0.0558,  0.0456, -0.6356, -0.0703, -0.1587, -0.1104,
        0.055 ,  0.016 ,  0.0074,  0.0279, -0.0508,  0.0394, -0.0134,
       -0.0315, -0.0289, -0.0604,  0.0277,  0.017 , -0.0156,  0.0355,
        0.0201, -0.0229,  0.1151, -0.0961,  0.0129,  0.0488,  0.0038,
       -0.0607,  0.0469,  0.1148,  0.0382,  0.0008, -0.0133, -0.1429,
        0.0423, -0.0841, -0.0146, -0.0523,  0.0353, -0.0013,  0.1078,
        0.0076,  0.0498, -0.0432, -0.0121, -0.054 ,  0.0992,  0.0559,
       -0.0699,  0.0

### Embeddings matrix

In [18]:
from src.model.embeddings import get_embeddings_matrix

In [14]:
word_index

{'the': 1,
 'of': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'in': 6,
 'that': 7,
 'is': 8,
 'for': 9,
 'it': 10,
 's': 11,
 'as': 12,
 'with': 13,
 'you': 14,
 'are': 15,
 'this': 16,
 'on': 17,
 'by': 18,
 'be': 19,
 'have': 20,
 'we': 21,
 'not': 22,
 'from': 23,
 'an': 24,
 'your': 25,
 'or': 26,
 'was': 27,
 'at': 28,
 'they': 29,
 'has': 30,
 'but': 31,
 'can': 32,
 'about': 33,
 'their': 34,
 'which': 35,
 'said': 36,
 'acupuncture': 37,
 'more': 38,
 'study': 39,
 'were': 40,
 'i': 41,
 'if': 42,
 'who': 43,
 'pain': 44,
 'these': 45,
 'what': 46,
 't': 47,
 'been': 48,
 'he': 49,
 'research': 50,
 'brain': 51,
 'people': 52,
 'when': 53,
 'all': 54,
 'there': 55,
 'than': 56,
 'disease': 57,
 'patients': 58,
 'may': 59,
 'how': 60,
 'out': 61,
 'treatment': 62,
 'one': 63,
 'so': 64,
 'other': 65,
 'health': 66,
 'do': 67,
 'some': 68,
 'also': 69,
 'our': 70,
 'time': 71,
 'com': 72,
 'cancer': 73,
 'university': 74,
 'medical': 75,
 'up': 76,
 'new': 77,
 'will': 78,
 'like': 79,
 'm

In [19]:
%%time
embeddings_matrix = get_embeddings_matrix(word_index, fasttext, 300)

Number of words not found in pre-trained embeddings: 2000
CPU times: user 58.5 ms, sys: 4.06 ms, total: 62.5 ms
Wall time: 61.8 ms


In [20]:
embeddings_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0897    ,  0.016     , -0.0571    , ...,  0.1559    ,
        -0.0254    , -0.0259    ],
       [-0.0063    , -0.0253    , -0.0338    , ...,  0.1155    ,
         0.0073    ,  0.0168    ],
       ...,
       [ 0.0436    ,  0.0642    ,  0.1001    , ...,  0.0889    ,
         0.0627    ,  0.1559    ],
       [-0.20810001,  0.0026    , -0.1556    , ...,  0.20020001,
         0.0265    ,  0.2158    ],
       [-0.0077    ,  0.0923    , -0.0875    , ..., -0.0091    ,
         0.0296    , -0.0597    ]])

In [21]:
embeddings_matrix.shape

(19401, 300)

In [22]:
%load_ext tensorboard
%tensorboard --logdir logs --bind_all

Reusing TensorBoard on port 6006 (pid 372), started 3:44:13 ago. (Use '!kill 372' to kill it.)

In [23]:
import tensorflow.keras as keras


class FakeNewsDetectionNet(keras.Model):

    def __init__(self, dim_input, dim_embeddings, dim_output, embeddings):
        super(FakeNewsDetectionNet, self).__init__()
        self.embedding_layer = keras.layers.Embedding(
                input_dim=dim_input,
                output_dim=dim_embeddings,
                mask_zero=True,
                embeddings_initializer=keras.initializers.Constant(embeddings),
                trainable=False
        )
        self.lstm_layer = keras.layers.LSTM(32)
        self.dense_layer = keras.layers.Dense(
                units=32,
                activation='relu'
        )
        self.final_dense = keras.layers.Dense(
                units=1,
                activation='sigmoid'
        )

    def call(self, input):
        x = self.embedding_layer(input)
        mask = self.embedding_layer.compute_mask(input)
        x = self.lstm_layer(x, mask=mask)
        x = self.dense_layer(x)
        x = self.final_dense(x)

        return x


In [24]:
import tensorflow.keras as keras
import os
import datetime

model = FakeNewsDetectionNet(
    dim_input=len(word_index),
    dim_embeddings=300,
    dim_output=2,
    embeddings=embeddings_matrix
)

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

callbacks = [
    keras.callbacks.TensorBoard(
        log_dir=os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
        histogram_freq=1,
        profile_batch=0
    )
]

model.fit(
    x=X_train,
    y=y_train.reshape((-1,1)),
    batch_size=16,
    validation_data=(X_test, y_test.reshape((-1,1))),
    callbacks=callbacks,
    epochs=5
)

model.summary()

Train on 270 samples, validate on 30 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "fake_news_detection_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  5820300   
_________________________________________________________________
lstm (LSTM)                  multiple                  42624     
_________________________________________________________________
dense (Dense)                multiple                  1056      
_________________________________________________________________
dense_1 (Dense)              multiple                  33        
Total params: 5,864,013
Trainable params: 43,713
Non-trainable params: 5,820,300
_________________________________________________________________
