In [3]:
import numpy as np
import math
import re
import nltk
from scipy import stats
from random import shuffle
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Dropout, Activation, Embedding

In [4]:
def transform_keywords(file_name):
    inf_file = open(file_name)
    data = list()
    for one_news in inf_file.readlines():
        single = one_news.strip().split(',')
        mapping = list()
        for one_keyword in single:
            mapping.append(one_hot(one_keyword, 7000)[0])
        data.append(mapping)
    #print(data)
    return data

In [6]:
def transform_titles(file_name):
    inf_file = open(file_name)
    data = list()
    for one_news in inf_file.readlines():
        single = nltk.word_tokenize(clean_sentence(one_news))
        print(single)
        mapping = list()
        for one_keyword in single:
            mapping.append(one_hot(one_keyword, 7000)[0])
        data.append(mapping)
    # print(data)
    return data

def clean_sentence(s):
    c = s.lower().strip()
    return re.sub('[^a-z ]', '', c)

'''
:param
    type: 0 indicates using the keywords from the content
          1 indicates using the titles
'''

'\n:param\n    type: 0 indicates using the keywords from the content\n          1 indicates using the titles\n'

In [7]:
def make_prediction(fake_file, real_file, type, unit_size = 10):
    if type == 0:
        fake_data = transform_keywords(fake_file)
        real_data = transform_keywords(real_file)
    else:
        fake_data = transform_titles(fake_file)
        real_data = transform_titles(real_file)
    labels = list()
    max_len = 0
    for i in fake_data:
        labels.append(0)
    for i in real_data:
        labels.append(1)
    data=fake_data
    for r in fake_data:
        if max_len < len(r):
            max_len = len(r)
    for r in real_data:
        if max_len < len(r):
            max_len = len(r)
        data.append(r)
    print(max_len)
    for d in data:
        cur_len = len(d)
        while cur_len < max_len:
            d.append(0)
            cur_len = cur_len+1
    print(data)

    #shuffle the given data
    index_shuf = list(range(len(data)))
    shuffle(index_shuf)
    data_shuffled = list()
    label_shuffled = list()
    for i in index_shuf:
        data_shuffled.append(data[i])
        label_shuffled.append(labels[i])
    print(len(label_shuffled))
    print(label_shuffled)

    # generate cross validation datasets
    k = 0
    testing_size = len(data_shuffled)/unit_size
    training_set_X = list()
    training_set_Y = list()
    testing_set_X = list()
    testing_set_Y = list()

    while k < testing_size:
        test_X= data_shuffled[k*unit_size:(k+1)*unit_size]
        test_Y = label_shuffled[k*unit_size:(k+1)*unit_size]

        train_X = data_shuffled[:k * unit_size] + data_shuffled[(k + 1) * unit_size:]
        train_Y = label_shuffled[:k * unit_size] + label_shuffled[(k + 1) * unit_size:]

        training_set_X.append(train_X)
        training_set_Y.append(train_Y)
        testing_set_X.append(test_X)
        testing_set_Y.append(test_Y)
        k = k+1

    print(len(training_set_X))
    print(training_set_Y)

    # testing with the baseline
    test_index = 0
    while test_index < testing_size:
        print('Build model...')
        baselineTest = np.float(np.sum(testing_set_Y[test_index])) / unit_size
        model = Sequential()
        model.add(Embedding(7000, 256, dropout=0.2))
        model.add(LSTM(16, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
        model.add(Dense(1))
        model.add(Activation('sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        print('Train...')
        model.fit(training_set_X[test_index], training_set_Y[test_index], batch_size=len(testing_set_X[test_index]),
                  nb_epoch=10,
                  validation_data=(testing_set_X[test_index], testing_set_Y[test_index]), shuffle=False)
        score, acc = model.evaluate(testing_set_X[test_index], testing_set_Y[test_index],
                                    batch_size=len(testing_set_X[test_index]))
        print('Test accuracy:', acc)
        print('Baseline: ', str(max(baselineTest,1-baselineTest)))
        test_index = test_index +1

In [8]:
if __name__ == "__main__":
    #Task on the content keywords
    make_prediction("./fakenews_keywords.csv","./realnews_keywords.csv",0)
    #Task on the titles
    #make_prediction("./data/titles/fake_news_training.txt", "./data/titles/real_news_training.txt",1 )

19
[[3918, 100, 3622, 6084, 4793, 5330, 4610, 3582, 4887, 3415, 5993, 2479, 1403, 5425, 1236, 2138, 0, 0, 0], [394, 1443, 100, 4745, 960, 6739, 3161, 5993, 6546, 5082, 2479, 5425, 4749, 3432, 4962, 0, 0, 0, 0], [5056, 4215, 100, 4238, 249, 4365, 2712, 3097, 5743, 5993, 121, 4934, 2479, 1809, 1618, 503, 1233, 4498, 0], [4378, 2479, 1103, 4610, 3582, 6070, 6099, 1342, 4121, 140, 5, 5842, 1924, 3533, 2568, 0, 0, 0, 0], [1765, 4238, 4908, 728, 2543, 3879, 5299, 6846, 6701, 1393, 3195, 2731, 3918, 5866, 5773, 0, 0, 0, 0], [6120, 6986, 4238, 494, 5120, 6480, 846, 4710, 1059, 1819, 3052, 6210, 4875, 649, 2987, 0, 0, 0, 0], [3918, 100, 3622, 6084, 4793, 5330, 4610, 3582, 4887, 3415, 5993, 2479, 1403, 5425, 1236, 2138, 0, 0, 0], [2479, 5239, 5145, 5120, 3747, 6885, 2293, 6312, 4594, 3834, 2500, 3463, 5734, 653, 1095, 0, 0, 0, 0], [5000, 712, 6739, 3713, 2822, 4604, 584, 2227, 5993, 5063, 2479, 5233, 1012, 529, 4962, 838, 0, 0, 0], [2255, 3104, 4078, 5239, 5145, 919, 2206, 3466, 5993, 911, 2069,



Train...




Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.800000011921
Baseline:  0.8
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.800000011921
Baseline:  0.7
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.899999976158
Baseline:  0.8
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.899999976158
Baseline:  0.8
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch

Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.800000011921
Baseline:  0.8
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 1.0
Baseline:  0.9
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 1.0
Baseline:  0.8
Build model...
Train...
Train on 80 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.699999988079
Baseline:  0.5
