![alt text](https://raw.githubusercontent.com/stephanefschwarz/GDG-MG/master/imgs/FN_arch.png)

**Install important packages**

In [1]:
!pip install tensorflow-hub
!pip install tf-sentencepiece
!pip install seaborn
!pip install keras
!pip install matplotlib
!pip install scikit-learn

Collecting tf-sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e7/0e/74a98e395470d94191517e7dc08921e033db8ad8ce013d62588d4bd1ad52/tf_sentencepiece-0.1.85-py2.py3-none-manylinux1_x86_64.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.1MB/s 
[?25hInstalling collected packages: tf-sentencepiece
Successfully installed tf-sentencepiece-0.1.85


**Import packages**

In [2]:
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn as skl

Using TensorFlow backend.


**Connect to Google Drive**

In [3]:
# Mount Google Drive at /content/drive/ so the dataset files can be read.
# The `google.colab` import means this cell is meant to run in Colab; the
# recorded output shows it prompting for an authorization code.
from google.colab import drive
drive.mount("/content/drive/")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


**Load dataset**

In [4]:
# Load the train and test splits from the mounted Drive folder.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects; only load
# pickle files that come from a trusted source.
training_set, test_set = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'drive/My Drive/GDG/fake-news-detection/dataset/train_news_GDG.pkl',
        'drive/My Drive/GDG/fake-news-detection/dataset/test_news_GDG.pkl',
    )
)

# Print the shape and first rows of each split, with a divider in between.
for split_position, split_frame in enumerate((training_set, test_set)):
    if split_position > 0:
        print('==============')
    print(split_frame.shape)
    print(split_frame.head())

(34691, 14)
                                       embedded_news  ...                                  embedded_comments
0  [[0.049736008, 0.056551598, -0.020507792, -0.0...  ...  [[0.021964531, 0.018814467, 0.002004556, -0.00...
1  [[0.049735997, 0.056551613, -0.020507792, -0.0...  ...  [[0.029968878, -0.04019299, -0.04854328, 0.018...
2  [[0.049735997, 0.056551594, -0.020507783, -0.0...  ...  [[0.029968878, -0.04019299, -0.04854328, 0.018...
3  [[0.049736015, 0.05655162, -0.020507794, -0.05...  ...  [[0.029968878, -0.04019299, -0.04854328, 0.018...
4  [[0.049735997, 0.056551598, -0.020507783, -0.0...  ...  [[0.029968878, -0.04019299, -0.04854328, 0.018...

[5 rows x 14 columns]
(36678, 11)
                                            bbc_news  ...                                             url
0   SANTIAGO (Reuters) - Hundreds of thousands of...  ...  https://www.reuters.com//article/idUSKCN1TX2V6
1   By Natalia A. Ramos Miranda  CACHIYUYO, Chile...  ...  https://www.reuters.com//art

**Feature vector generation**

---

Not considering text semantics

---



In [7]:
 from sklearn.feature_extraction.text import CountVectorizer

 # Bag-of-words vectorizer shared by all three text fields: word-level
 # bigrams only (ngram_range=(2,2)), lowercased, English stop words removed,
 # vocabulary limited to 300 features.
 # NOTE(review): this statement (like the import above it) starts with one
 # stray space; presumably it only runs because IPython strips a cell's
 # leading indentation — confirm and dedent.
 BoW = CountVectorizer(analyzer='word', ngram_range=(2,2), 
                       stop_words='english', lowercase=True,
                       max_features=300)

def bow_features(vectorizer, train_texts, test_texts):
    """Fit ``vectorizer`` on the training texts and encode both splits.

    The vectorizer is re-fitted on every call, so each text field gets its
    own vocabulary, learned from the training split only; the test split is
    encoded with that same vocabulary.

    Args:
        vectorizer: object exposing ``fit_transform`` and ``transform``
            (here the shared ``BoW`` CountVectorizer).
        train_texts: iterable of training documents.
        test_texts: iterable of test documents.

    Returns:
        ``(train_matrix, test_matrix)``, both densified with ``.todense()``.
    """
    train_matrix = vectorizer.fit_transform(train_texts).todense()
    test_matrix = vectorizer.transform(test_texts).todense()
    return train_matrix, test_matrix


# News article text.
train_news_BoW, test_news_BoW = bow_features(
    BoW, training_set.bbc_news, test_set.bbc_news)

# Tweet text.
train_tweet_BoW, test_tweet_BoW = bow_features(
    BoW, training_set.tweetText, test_set.tweetText)

# Comments: each test row holds a sequence of comment strings, joined here
# into a single space-separated document. The training split is read from
# its `conc_comments` column instead (presumably already concatenated —
# confirm against the dataset).
tweets_comments = [' '.join(comments) for comments in test_set.comments]

train_comments_BoW, test_comments_BoW = bow_features(
    BoW, training_set.conc_comments, tweets_comments)


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# Sanity check: print the shape of the first row of every BoW matrix,
# train then test for each of news, tweets and comments.
for bow_matrix in (train_news_BoW, test_news_BoW,
                   train_tweet_BoW, test_tweet_BoW,
                   train_comments_BoW, test_comments_BoW):
    print(bow_matrix[0].shape)

(1, 300)
(1, 300)
(1, 300)
(1, 300)
(1, 300)
(1, 300)


**Set up the EMET CNN**

In [0]:
from keras.layers import (concatenate, Flatten, AveragePooling1D,
                          Reshape, Conv1D, Dense, MaxPool1D,
                          Dropout, GlobalAveragePooling1D)

from keras import (Input, Model)

class Emet:
  """Three-input 1D-convolutional Keras model (news, tweet, comments).

  The compiled model is built in the constructor and exposed as
  ``self.model``.
  """

  def __init__(self, news_input_shape=(1, 1),
               tweet_input_shape=(1, 1),
               comments_input_shape=None):
    """Build and compile the network.

    Each ``*_input_shape`` is forwarded unchanged to ``Input(shape=...)``
    for the corresponding branch.
    """
    self.model = self.__build_model(news_input_shape,
                                    tweet_input_shape,
                                    comments_input_shape)

  def __build_model(self, news_input_shape, tweet_input_shape, comments_input_shape):
    """Assemble the three convolutional branches and the shared trunk.

    Returns the compiled ``Model`` (adadelta optimizer, mean squared error
    loss, accuracy metric).
    NOTE(review): the 3-unit output layer uses ReLU together with an MSE
    loss — confirm this is intended for the encoded label target.
    """
    print('build model')

    # One input tensor per information source.
    news_in = Input(shape=news_input_shape, name='news_input')
    tweet_in = Input(shape=tweet_input_shape, name='tweets_input')
    comments_in = Input(shape=comments_input_shape, name='comments_input')

    # Per-source convolution (news uses a wider kernel than the other two).
    news_branch = Conv1D(filters=5, kernel_size=5, name='first_conv_news')(news_in)
    tweet_branch = Conv1D(filters=5, kernel_size=3, name='first_conv_tw')(tweet_in)
    comments_branch = Conv1D(filters=5, kernel_size=3, name='first_conv_com')(comments_in)

    # Flatten each branch so they can be concatenated into one vector.
    news_branch = Flatten()(news_branch)
    tweet_branch = Flatten()(tweet_branch)
    comments_branch = Flatten()(comments_branch)

    x = concatenate([news_branch, tweet_branch, comments_branch])

    # Shared trunk: dense -> pooling -> two convolutions.
    x = Dense(units=200, activation='relu', name='dense_layer_200')(x)
    x = Reshape(target_shape=(200, 1))(x)
    x = AveragePooling1D(pool_size=3, strides=1, name='first_avgPool')(x)
    x = Conv1D(filters=5, kernel_size=3, name='1_conv_conc')(x)
    x = Conv1D(filters=3, kernel_size=2, name='2_conv_conc')(x)
    x = Flatten()(x)
    x = Dropout(rate=0.5)(x)

    # Second dense/convolution stage.
    x = Dense(units=150, activation='relu', name="1_dense_layer")(x)
    x = Reshape(target_shape=(150, 1))(x)
    x = Conv1D(filters=5, kernel_size=2)(x)
    x = Flatten()(x)

    # Classification head with three output units.
    x = Dense(units=70, activation='relu')(x)
    x = Dense(units=3, activation='relu', name="2_dense_layer")(x)

    network = Model(inputs=[news_in, tweet_in, comments_in], outputs=x)
    network.compile(optimizer='adadelta', loss='mean_squared_error', metrics=['accuracy'])

    return network

**Callback for validation set**

In [0]:
from keras.callbacks import Callback

class TestCallback(Callback):
  """Keras callback that evaluates the model on held-out data after each epoch.

  The second value returned by ``model.evaluate`` for every epoch is
  appended to ``acc_history``.
  """

  def __init__(self, test_data):
    """Store the evaluation data.

    Args:
      test_data: ``(x_test, y_test)`` pair. ``x_test`` must be indexable and
        hold the news, tweet and comments inputs at positions 0, 1 and 2.
    """
    # Let the Keras base class initialise its own attributes.
    super().__init__()
    self.test_data = test_data
    # Per-instance list: a class-level list would be shared by every
    # callback, mixing the histories of separate training runs.
    self.acc_history = []

  def on_epoch_end(self, epoch, logs=None):
    """Evaluate on the stored test data and record the result."""
    # Use the data handed to this callback, not a notebook-level global.
    x_test, y_test = self.test_data

    model_inputs = [x_test[0], x_test[1], x_test[2]]
    scores = self.model.evaluate(model_inputs, y_test)

    # Index 1 is the first compiled metric (accuracy for the model compiled
    # in this notebook); index 0 is the loss.
    self.acc_history.append(scores[1])

**Restructure features input size**

In [0]:
# Append a trailing axis (axis=2) to every BoW matrix so each matches the
# (length, 1) input shape used when the model is built.
X_news_train, X_tweet_train, X_comments_train = (
    np.expand_dims(bow_matrix, axis=2)
    for bow_matrix in (train_news_BoW, train_tweet_BoW, train_comments_BoW)
)

X_news_test, X_tweet_test, X_comments_test = (
    np.expand_dims(bow_matrix, axis=2)
    for bow_matrix in (test_news_BoW, test_tweet_BoW, test_comments_BoW)
)

**Encode labels**

In [0]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelBinarizer()

# Fit on the training labels only and keep the encoded result directly,
# instead of discarding it and transforming the same labels a second time.
y_train = label_encoder.fit_transform(training_set.label)
# The test labels are encoded with the classes learned from the training set.
y_test = label_encoder.transform(test_set.label)

**Test EMET model**

In [18]:
# Build the model for the BoW features: each input is (feature_length, 1).
news_len, tweet_len, comments_len = (
    features.shape[1]
    for features in (X_news_train, X_tweet_train, X_comments_train)
)

emet = Emet(
    news_input_shape=(news_len, 1),
    tweet_input_shape=(tweet_len, 1),
    comments_input_shape=(comments_len, 1),
).model

build model





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



**Predict**

In [19]:
# Run a forward pass on the test inputs in batches of 10.
# NOTE(review): this cell runs before the fit cell below, so the model has
# not been trained yet at this point — confirm this is only a smoke test.
bow_test_inputs = [X_news_test, X_tweet_test, X_comments_test]
predictions = emet.predict(bow_test_inputs, batch_size=10)









In [20]:
# Training inputs, in the same order as the model's inputs.
train_inputs = [X_news_train, X_tweet_train, X_comments_train]

# Held-out data evaluated by the callback at the end of every epoch.
x_test = [X_news_test, X_tweet_test, X_comments_test]
new_label = np.array(y_test)
call = TestCallback((x_test, new_label))

history = emet.fit(
    x=train_inputs,
    y=y_train,
    batch_size=40,
    epochs=10,
    callbacks=[call],
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# Final evaluation on the test split; prints the list returned by evaluate
# (loss first, then the compiled metric).
bow_test_scores = emet.evaluate(
    [X_news_test, X_tweet_test, X_comments_test], y_test)
print(bow_test_scores)

[0.05213233943137279, 0.9092098805823654]


**Embeddings**

In [0]:
def stack_with_channel(embedding_column):
    """Stack a column of embeddings with ``np.vstack`` and add a last axis.

    The stacked array is reshaped to ``(shape[0], shape[1], 1)``.
    """
    stacked = np.vstack(embedding_column)
    n_rows, n_features = stacked.shape[0], stacked.shape[1]
    return stacked.reshape((n_rows, n_features, 1))


# Training split.
X_news_train = stack_with_channel(training_set.embedded_news)
X_tweet_train = stack_with_channel(training_set.embedded_tweets)
X_comments_train = stack_with_channel(training_set.embedded_comments)

# Test split.
X_news_test = stack_with_channel(test_set.embedded_news)
X_tweet_test = stack_with_channel(test_set.embedded_tweets)
X_comments_test = stack_with_channel(test_set.embedded_comments)

**Set up model**

In [0]:
# Rebuild the model for the embedding features: each input is
# (embedding_length, 1).
news_len, tweet_len, comments_len = (
    features.shape[1]
    for features in (X_news_train, X_tweet_train, X_comments_train)
)

emet = Emet(
    news_input_shape=(news_len, 1),
    tweet_input_shape=(tweet_len, 1),
    comments_input_shape=(comments_len, 1),
).model

init
build model


**Predict**

In [0]:
# Run a forward pass on the embedded test inputs in batches of 10.
# NOTE(review): this cell comes before the fit cell below, so the rebuilt
# model has not been trained yet — confirm this is only a smoke test.
embedded_test_inputs = [X_news_test, X_tweet_test, X_comments_test]
predictions = emet.predict(embedded_test_inputs, batch_size=10)

In [0]:
# Training inputs, in the same order as the model's inputs.
train_inputs = [X_news_train, X_tweet_train, X_comments_train]

# Held-out data evaluated by the callback at the end of every epoch.
x_test = [X_news_test, X_tweet_test, X_comments_test]
new_label = np.array(y_test)
call = TestCallback((x_test, new_label))

history = emet.fit(
    x=train_inputs,
    y=y_train,
    batch_size=40,
    epochs=10,
    callbacks=[call],
)

# Report the input shapes: the three test arrays first, then the three
# training arrays.
for feature_array in (X_news_test, X_tweet_test, X_comments_test,
                      X_news_train, X_tweet_train, X_comments_train):
    print(feature_array.shape)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(36678, 512, 1)
(36678, 512, 1)
(36678, 512, 1)
(34691, 512, 1)
(34691, 512, 1)
(34691, 512, 1)


In [0]:
# Final evaluation of the embedding-based model on the test split; prints the
# list returned by evaluate (loss first, then the compiled metric).
embedding_test_scores = emet.evaluate(
    [X_news_test, X_tweet_test, X_comments_test], y_test)
print(embedding_test_scores)

[0.035447027983784546, 0.9349473799007579]
