# Fake news detection

**Authors:** Peter Mačinec, Simona Miková

## Model construction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import sys
sys.path.append('..')

### Reading data

In [2]:
# Load the preprocessed articles; the first CSV column is used as the index.
df = pd.read_csv(
    '../data/preprocessed/dataset.csv',
    index_col=0,
)

In [3]:
# Preview the first rows to confirm the `body` and `label` columns loaded as expected.
df.head()

Unnamed: 0,body,label
259882,healthline media partners with wellness advoca...,reliable
259883,online patient communities are thriving and ha...,reliable
259884,email marketing is more critical than ever. in...,reliable
259885,industry veterans from netflix and ea join fas...,reliable
259886,the healthline property has risen to 1 in the ...,reliable


In [4]:
# Binary target: 1 for 'unreliable' articles, 0 for every other label value.
# The comparison is vectorized (no per-row Python lambda) and the dtype is pinned
# to int64 so the resulting array is the same on every platform.
df['label_encoded'] = df['label'].eq('unreliable').astype('int64')

# Plain NumPy view of the encoded target, used later for the train/test split.
labels = np.asarray(df['label_encoded'])

Shape of labels

In [5]:
# Sanity check: `labels` is built from one DataFrame column, so it holds one entry per article row.
labels.shape

(65712,)

### Embeddings preprocessing

In [6]:
from src.model.embeddings import get_sequences_and_word_index_table
from sklearn.model_selection import train_test_split

In [7]:
# Word limit passed to get_sequences_and_word_index_table below.
max_words = 100_000

In [8]:
%%time
# Build the model inputs (`sequences`) and the `word_index` lookup from the article
# bodies, limited by `max_words`. Took roughly 1.5 minutes in the saved run.
# NOTE(review): tokenisation/padding details live in src.model.embeddings — confirm there.
sequences, word_index = get_sequences_and_word_index_table(df['body'], max_words)

CPU times: user 1min 31s, sys: 416 ms, total: 1min 31s
Wall time: 1min 31s


Number of tokens in the word index (equals `max_words`)

In [9]:
# Size of the word index; in the saved run it equals `max_words`.
len(word_index)

100000

Shape of input sequences

In [10]:
# 2-D array: one row per article, every row the same length (6292 in the saved run).
# NOTE(review): presumably padded to the longest document — confirm in src.model.embeddings.
sequences.shape

(65712, 6292)

In [11]:
# Hold out 10 % of the articles for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    labels,
    test_size=0.10,
    random_state=1,
)

### Loading fastText model

In [12]:
from src.model.fasttext import read_fasttext_model

In [13]:
%%time
# Load the pre-trained fastText vectors from the .vec file (about 1 min 43 s in the saved run).
# The result is indexable by word, e.g. fasttext['work'].
fasttext = read_fasttext_model('../models/fasttext/wiki-news-300d-1M.vec')

CPU times: user 1min 41s, sys: 1.63 s, total: 1min 43s
Wall time: 1min 43s


In [14]:
# (number of entries in the loaded model, length of a single word vector)
len(fasttext), len(fasttext['work'])

(999995, 300)

In [15]:
# Length of one fastText vector. The value is passed to get_embeddings_matrix and to
# FakeNewsDetectionNet (as dim_embeddings), so it has to agree with the loaded model.
embeddings_dim = 300

# Fail fast if a different .vec file (with another vector size) is loaded.
assert len(fasttext['work']) == embeddings_dim, \
    "fastText vector size does not match embeddings_dim"

In [16]:
# Inspect one example vector (the entry for the word 'work').
fasttext['work']

array([-0.072 ,  0.013 , -0.0777, -0.0491, -0.085 , -0.0337, -0.0243,
        0.0472, -0.0575, -0.0356,  0.0931, -0.0579, -0.0667, -0.0612,
       -0.0634, -0.052 ,  0.1209, -0.0841,  0.0455,  0.0819, -0.0535,
        0.075 ,  0.0889,  0.1769,  0.0675, -0.0347, -0.0773, -0.0488,
       -0.0569,  0.0227,  0.0035,  0.0395,  0.0183,  0.0542, -0.0188,
        0.0203, -0.0252, -0.0498,  0.0278,  0.0521,  0.0046, -0.0992,
        0.0415, -0.034 ,  0.0507, -0.0419, -0.0607,  0.0656,  0.0076,
        0.0696, -0.0558,  0.0456, -0.6356, -0.0703, -0.1587, -0.1104,
        0.055 ,  0.016 ,  0.0074,  0.0279, -0.0508,  0.0394, -0.0134,
       -0.0315, -0.0289, -0.0604,  0.0277,  0.017 , -0.0156,  0.0355,
        0.0201, -0.0229,  0.1151, -0.0961,  0.0129,  0.0488,  0.0038,
       -0.0607,  0.0469,  0.1148,  0.0382,  0.0008, -0.0133, -0.1429,
        0.0423, -0.0841, -0.0146, -0.0523,  0.0353, -0.0013,  0.1078,
        0.0076,  0.0498, -0.0432, -0.0121, -0.054 ,  0.0992,  0.0559,
       -0.0699,  0.0

### Embeddings matrix

In [17]:
from src.model.embeddings import get_embeddings_matrix

In [18]:
%%time
# Assemble the embeddings matrix for the words in `word_index` from the fastText vectors.
# In the saved run the result has shape (len(word_index) + 1, embeddings_dim) and some
# rows (including row 0) are all zeros.
# NOTE(review): zero rows are presumably padding / words missing from fastText — confirm.
embeddings_matrix = get_embeddings_matrix(word_index, fasttext, embeddings_dim)

CPU times: user 153 ms, sys: 60 ms, total: 213 ms
Wall time: 213 ms


In [19]:
# Quick visual check of the matrix contents (NumPy abbreviates the large array).
embeddings_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0897    ,  0.016     , -0.0571    , ...,  0.1559    ,
        -0.0254    , -0.0259    ],
       [-0.0063    , -0.0253    , -0.0338    , ...,  0.1155    ,
         0.0073    ,  0.0168    ],
       ...,
       [ 0.36790001,  0.0007    , -0.1013    , ...,  0.28470001,
         0.0406    ,  0.1751    ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0437    ,  0.0427    , -0.0377    , ...,  0.1001    ,
         0.1626    ,  0.0293    ]])

In [20]:
# Expect (len(word_index) + 1, embeddings_dim); the model below uses the same two sizes.
embeddings_matrix.shape

(100001, 300)

In [21]:
# Load the TensorBoard notebook extension and start it on the `logs` directory,
# which is where the training callback below writes its run folders.
# NOTE(review): --bind_all makes TensorBoard listen on all network interfaces;
# drop the flag if only local access is needed.
%load_ext tensorboard
%tensorboard --logdir logs --bind_all

In [None]:
import tensorflow.keras as keras
from src.model.model import FakeNewsDetectionNet
import os
import datetime

# Build the classifier on top of the fastText-based embeddings matrix.
# dim_input is len(word_index) + 1 to match the number of rows in embeddings_matrix.
model = FakeNewsDetectionNet(
    dim_input=len(word_index) + 1,
    dim_embeddings=embeddings_dim,
    dim_output=2,
    embeddings=embeddings_matrix
)

# The network has two outputs (dim_output=2) while y_train / y_test hold a single
# integer class id (0 or 1) per sample. With 'binary_crossentropy' that one label is
# broadcast over both outputs, so the two outputs are pushed towards the same target
# and the reported accuracy sits at 0.5. The sparse categorical loss is the one that
# matches "integer class id vs. one score per class"; with it, Keras resolves
# 'accuracy' to sparse categorical accuracy.
# NOTE(review): assumes the final layer of FakeNewsDetectionNet yields per-class
# probabilities (e.g. softmax). If it returns raw logits, use
# keras.losses.SparseCategoricalCrossentropy(from_logits=True) instead — confirm.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Each run logs to its own timestamped sub-directory of `logs`, so TensorBoard can
# compare runs side by side. histogram_freq=1 records weight histograms every epoch;
# profile_batch=0 turns the profiler off.
callbacks = [
    keras.callbacks.TensorBoard(
        log_dir=os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
        histogram_freq=1,
        profile_batch=0
    )
]

# NOTE(review): the held-out test split doubles as validation data here; keep that in
# mind if these validation metrics are later used for model selection.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    epochs=5
)

# Printed after training.
# NOTE(review): presumably because the model is only built once it has seen data — confirm.
model.summary()

Train on 59140 samples, validate on 6572 samples
Epoch 1/5
   48/59140 [..............................] - ETA: 12:25:18 - loss: 0.6945 - accuracy: 0.5000