In [2]:
import warnings
warnings.filterwarnings("ignore") # Ignore some unimportant warnings

import pandas as pd
import numpy as np
import nltk
import gensim
from string import punctuation
from nltk.corpus import stopwords
from itertools import chain
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec, Doc2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

# in case some packages are not properly installed
nltk.download('gutenberg')
nltk.download('reuters')
nltk.download('stopwords')
nltk.download("punkt")

class DocumentSequence:
    def __init__(self, raw_docs, clean=False, sw=None, punct=None):
        """
        an instance featuring difference representations of a doc sequence

        public methods are:
            self.get_dictionary()
            self.get_tokenized()
            self.get_tagged()
            self.get_bow()

        :param raw_docs: numpy.ndarray[str]
            each string for a document
        :param clean: bool
            whether to clean stopwords and punctuations
        :param sw: list[str]
            list of stopwords, only works if `clean` is True, default is empty
        :param punct: str
            string of punctuations, only works if `clean` is True, default is empty

        """
        self.raw_docs = raw_docs
        self._set_tokenized(clean=clean, sw=sw, punct=punct)
        self._set_tagged()

    def _set_tokenized(self, clean=False, sw=None, punct=None):
        """
        set self._tokenized to list[list[str]]: each string for a token
        :param clean: bool, whether to clean stopwords and punctuations
        :param sw: list[str], list of stopwords, only works if `clean` is True, default is empty
        :param punct: str, string of punctuations, only works if `clean` is True, default is empty
        """
        print("converting raw docs into tokens")

        # lower-casing all documents in the first step
        self._tokenized = [nltk.word_tokenize(doc.lower()) for doc in self.raw_docs]

        if clean:  # if clean is set to True, stopwords and punctuations are removed
            print("cleaning up stopwords and punctuations")
            # hashing stopwords and punctuations speeds up look-up computation
            if sw is None:  # default value of sw is None, corresponding to an empty list
                sw = []
            if punct is None:  # default value of punct is None, corresponding to an empty list
                punct = []
            skip_tokens = set(chain(sw, punct))
            print("all tokens to be skipped are: {}".format(skip_tokens))
            # retain only meaningful tokens, while preserving the structure
            self._tokenized = [[token for token in doc if token not in skip_tokens] for doc in self._tokenized]

    def _set_tagged(self):
        """set self._set_tagged to list[TaggedDocument] each TaggedDocument has a tag of [index]"""
        print("listing tagged documents in memory")
        self._tagged = [TaggedDocument(doc, tags=[index]) for index, doc in enumerate(self._tokenized)]

    def _set_dictionary(self):
        """stores the dictionary of current corpus"""
        self._dictionary = Dictionary(self._tokenized)

    def _set_bow(self):
        """set self._bow to list[list[tuple]], where each tuple is (word_id, word_frequency)"""
        if not hasattr(self, '_dictionary'):  # check whether dictionary is set or not
            print("dictionary is not set for {}, setting dictionary automatically".format(self))
            self._set_dictionary()
        self._bow = [self._dictionary.doc2bow(doc) for doc in self._tokenized]

    def get_dictionary(self):
        """getter for class attribute dictionary"""
        if not hasattr(self, "_dictionary"):  # self._dictionary is only computed once
            self._set_dictionary()

        # the previous method is only called once
        return self._dictionary

    def get_tokenized(self):
        """getter for tokenized documents, cleaned as desired"""
        return self._tokenized

    def get_tagged(self):
        """getter for list of TaggedDocuments"""
        return self._tagged

    def get_bow(self):
        """getter for bag-of-words representation of documents"""
        if not hasattr(self, '_bow'):  # self._bow is only computed lazily
            self._set_bow()

        # the previous method is only called once
        return self._bow


class DocumentEmbedder:
    def __init__(self, docs: DocumentSequence, pretrained_word2vec=None):
        """
        This class features interfaces to different methods of computing document embeddings.
        Supported embedding mechanisms are:
            Dov2Vec:                               see self.get_doc2vec()
            Naive Doc2Vec:                         see self.get_naive_doc2vec()
            One-Hot Sum:                           see self.get_onehot()
            Attention is all you need              To be implemented
            FastText                               To be implemented

        :param docs: a DocumentSequence instance
        :pretrained_word2vec: path to pretrained word2vec model, in .bin format
        """
        self.docs = docs
        self.pretrained = pretrained_word2vec

    def _set_word2vec(self):
        if self.pretrained is None:
            raise ValueError("Pretrained word2vec path is not specified during instantiation")
        self._w2v = KeyedVectors.load_word2vec_format(self.pretrained, binary=True)

    def _set_doc2vec(self, vector_size=300, window=5, min_count=5, dm=1, epochs=20):
        # instantiate a Doc2Vec model, setting pretrained GoogleNews Vector
        self._d2v = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, dm=dm, epochs=epochs,
                            pretrained=self.pretrained)
        # build vocabulary from corpus
        self._d2v.build_vocab(self.docs.get_tagged())

        # somehow, the training won't start automatically, and must be manually started
        self._d2v.train(self.docs.get_tagged(), total_examples=self._d2v.corpus_count, epochs=epochs)

        # list document embeddings by order of their tags
        self._d2v_embedding = [self._d2v.docvecs[index]
                               for index in range(len(self.docs.get_tagged()))]

    def get_doc2vec(self, vectors_size=300, window=5, min_count=5, dm=1, epochs=20):
        """
        get the doc2vec embeddings with word vectors pretrained on GoogleNews task
        :param vectors_size: size for document embeddings, should be 300 if using GoogleNews pretrained word vectors
        :param window: number of tokens to be include in both directions
        :param min_count: lower threshold for a token to be included
        :param dm: using distributed memory or not
            if 1, use distributed memory
            if 0, use distributed bag of words
        :param epochs: number of epochs for training, usually < 20
        :return: a list of document embeddings, vector size can be specified
        """
        if vectors_size != 300:
            print("Warning: pretrained Google News vecs have length 300, got vec-size={} ".format(vectors_size))

        if not hasattr(self, '_d2v_embedding'):
            self._set_doc2vec(vector_size=vectors_size, window=window, min_count=min_count, dm=dm, epochs=epochs)

        return self._d2v_embedding

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\86720\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\86720\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86720\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86720\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Methods above are used as tools for preprocessing the raw data.

In [4]:
import pandas as pd
import numpy as np
import os
from nltk.corpus import stopwords
from string import punctuation

# load the raw data set and Google pretrained w2v model
df = pd.read_csv("./fake_or_real_news.csv")
pretrained = "./pretrained/GoogleNews-vectors-negative300.bin"
# obtain the raw news texts and titles
raw_texts = df['text'].values
raw_title = df['title'].values

# obtain the raw label data
def trans_labels(labels):
    for idx in range(len(labels)):
        if labels[idx] == 'FAKE':
            labels[idx] = 0
        else:
            labels[idx] = 1
    return np.array(labels, dtype=int)

labels = trans_labels(df['label'].values)

# build two instances for preprocessing raw data
texts = DocumentSequence(raw_texts, clean=True, sw=stopwords.words('english'), punct=punctuation)
titles = DocumentSequence(raw_title, clean=True, sw=stopwords.words('english'), punct=punctuation)

# build two instances for producing document embeddings
text_embedder = DocumentEmbedder(texts, pretrained_word2vec=pretrained)
titles_embedder = DocumentEmbedder(titles, pretrained_word2vec=pretrained)

# vectors_size: Number of dimensions for the embedding model
# window: Number of context words to observe in each direction within a document
# min_count: Minimum frequency for words included in model
# dm (distributed memory): '0' indicates DBOW model; '1' indicates DM
# epoches: Number of epochs to train the model for
text_embeddings = text_embedder.get_doc2vec(vectors_size=300,
                                            window=13,
                                            min_count=5,
                                            dm=0,
                                            epochs=100)

title_embeddings = titles_embedder.get_doc2vec(vectors_size=300,
                                               window=13,
                                               min_count=5,
                                               dm=0,
                                               epochs=100)

# if the embeddings is in a list, stack them into a 2-D numpy array
def trans_list_to_array(embeddings):
    if isinstance(embeddings, list): 
        try:
            embeddings = np.stack(emb if isinstance(emb, np.ndarray) else np.zeros(300) for emb in embeddings)
        except ValueError as e:
            print(e)
    return embeddings

# change text_embeddings and title_embeddings into 2-D numpy array
text_embeddings = trans_list_to_array(text_embeddings)
title_embeddings = trans_list_to_array(title_embeddings)

# concatenate text matrix and title matrix as a whole for training
news_embeddings = np.concatenate((title_embeddings, text_embeddings), axis=1)

converting raw docs into tokens
cleaning up stopwords and punctuations
all tokens to be skipped are: {'until', 'our', 'am', "hasn't", 'y', 'own', '+', 'not', 'below', 'from', 'be', 'and', 'at', 'as', 'ma', 'yourselves', "shouldn't", 'you', 'her', '}', '%', 'itself', 'been', "isn't", '~', "you've", 'while', 'those', 'had', 'most', 'haven', 'me', 'him', 'if', 'she', 'have', 'after', "aren't", 's', 'i', 'will', 'what', 'ours', "hadn't", '\\', 'their', "you'd", 'll', 'weren', 'few', '@', 'up', 'all', 'o', 'couldn', "needn't", "wasn't", 'no', 'yourself', ']', 'hadn', ',', 're', '/', 'for', 'which', '$', '=', 'this', 'doesn', 'on', 'who', '!', 'he', 'wouldn', '&', 'it', 'other', 'same', 'has', 'don', "didn't", ')', 'can', 'or', 'by', 'd', 'mustn', '-', 'in', "don't", "doesn't", '*', 'were', ':', '<', '[', 'does', "couldn't", 'being', 'just', "she's", 'herself', 'do', 'a', 'but', '`', ';', "won't", 'each', 'more', 'having', "wouldn't", 'an', 'ain', 'very', 'is', 'the', 'didn', 'are', 'against

The process above is used to get embeddings of preprocessed data.

In [7]:
from embedding_visualizer import visualize_embeddings

# visualize the news embeddings in the graph
# MUST run in command line "tensorboard --logdir visual/" and visit localhost:6006 to see the visualization
visualize_embeddings(embedding_values=news_embeddings, label_values=labels)

currently setting metadata_path to metadata.tsv. Due to tensorboard version reasons, if prompted 'metadata not found' when visiting tensorboard server page, please manually edit metadata_path in projector_config.pbtxt to visual\metadata.tsv or the absolute path for `metadata.tsv` and restart tensorboard
If your tensorboard version is 1.7.0, you probably should not worry about this
Embeddings are available now. Please start your tensorboard server with commandline `tensorboard --logdir visual` and visit http://localhost:6006 to see the visualization


The method above is used to visualize the news embeddings

In [1]:
import pickle as pkl
import os

# store the d2v model in files
save_embeddings_path = "d2v(vecsize={}, winsize={}, mincount={}, {}, epochs={}).pkl".format(
    300, 13, 5, "dbow", 100)
save_embeddings_path = "./pretrained/title_text-" + save_embeddings_path
# dump the data into files
with open(save_embeddings_path, "wb") as file:
    print("storing embeddings in {}".format(save_embeddings_path))
    pkl.dump(news_embeddings, file)
    print("embeddings stored")
    
# store the labels in files
save_labels_path = "./pretrained/labels.pkl"
# dump the data into files
with open(save_labels_path, "wb") as file:
    print("storing labels in {}".format(save_labels_path))
    pkl.dump(labels, file)
    print("labels stored")

FileNotFoundError: [Errno 2] No such file or directory: './pretrained/title_text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl'

The process above is used to store d2v embeddings.

In [28]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection._search import BaseSearchCV
import pickle as pkl

# obtain the embeddings from files.
def get_file(path):
    try:
        with open(path, "rb") as f:
            item = pkl.load(f)
    except FileNotFoundError as e:
        print("unable to load {}, see stack trace below".format(path))
        print("double check that you have the file saved {}".format(path))
        print(e)
        return None

    return item

# load pretrained data
embeddings = get_file("./pretrained/title_text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl")
labels = get_file("./pretrained/labels.pkl")

# perform the split which gets us the train data and the test data
news_train, news_test, labels_train, labels_test = train_test_split(news_embeddings, labels,
                                                                    test_size=0.25,
                                                                    random_state=0,
                                                                    stratify=labels)

The process above is used to split the original data into two parts, one for training, the other for testing.

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint
from scipy.stats.distributions import uniform
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np

# MLP classifier
mlp = MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.8,
                    beta_2=0.9, early_stopping=False, epsilon=1e-08,
                    hidden_layer_sizes=(600, 300), learning_rate='constant',
                    learning_rate_init=0.0001, max_iter=200, momentum=0.9,
                    nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
                    solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
                    warm_start=False)

# KNN classifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
                           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
                           weights='distance')

# QDA classifier
qda = QuadraticDiscriminantAnalysis(priors=np.array([0.5, 0.5]),
                                    reg_param=0.6531083254653984, store_covariance=False,
                                    store_covariances=None, tol=0.0001)

# GDB classifier
gdb = GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                 learning_rate=0.1, loss='exponential', max_depth=10,
                                 max_features='log2', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=0.0012436966435001434,
                                 min_samples_split=100, min_weight_fraction_leaf=0.0,
                                 n_estimators=200, presort='auto', random_state=0,
                                 subsample=0.8, verbose=0, warm_start=False)

# SVC classifier
svc = SVC(C=0.8, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
          max_iter=-1, probability=False, random_state=0, shrinking=True,
          tol=0.001, verbose=False)

# GNB classifier
gnb = GaussianNB(priors=None)

# RF classifier
rf = RandomForestClassifier(bootstrap=False, class_weight=None,
                            criterion='entropy', max_depth=10, max_features=7,
                            max_leaf_nodes=None, min_impurity_decrease=0.0,
                            min_impurity_split=None, min_samples_leaf=9,
                            min_samples_split=6, min_weight_fraction_leaf=0.0,
                            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
                            verbose=0, warm_start=False)

# LGR classifer
lgr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=74.9222112826074,
                         fit_intercept=True, intercept_scaling=1,
                         class_weight=None, random_state=None,
                         solver='liblinear', max_iter=100, multi_class='ovr', 
                         verbose=0, warm_start=False, n_jobs=1)

# All the parameters of the classifiers above are optimal in our experiments
# The list below is used to store every classifier instance
classifiers_list = [mlp, knn, qda, gdb, svc, gnb, rf, lgr]

The process above is used to build every classifier with almost optimal parameters in our experiments.

![a](resources/models_with_best_performance.jpg)

In [27]:
from sklearn.metrics import classification_report

# print details of testing results
for model in classifiers_list:
    model.fit(news_train, labels_train)
    labels_pred = model.predict(news_test)
    
    # Report the metrics
    target_names = ['Fake', 'Real']
    print(str(model))
    print(classification_report(y_true=labels_test, y_pred=labels_pred, target_names=target_names, digits=3))

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.8,
       beta_2=0.9, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(600, 300), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
             precision    recall  f1-score   support

       Fake      0.944     0.942     0.943       791
       Real      0.942     0.945     0.943       793

avg / total      0.943     0.943     0.943      1584

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance')
             precision    recall  f1-score   support

       Fake      0.905     0.839     0.871       791
       Real      0.851     0.912     0.880       793

avg / total      0.878     0.