# Themes: UCB Fake News Detector
## Group: Hollis lab group 2
### Members: Shuheng Liu, Qiaoyi Yin, Yuyuan  Fang
____

# Getting Set Up
#### Tools for preprocessing the raw data.
(with tools.DocumentSequence and tools.DocumentEmbedder provided by Shuheng Liu)

In [30]:
import warnings
warnings.filterwarnings("ignore") # Ignore some unimportant warnings

import pandas as pd
import numpy as np
import nltk
import gensim
from string import punctuation
from nltk.corpus import stopwords
from itertools import chain
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec, Doc2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

# a very popular graph plotting library 
import matplotlib.pyplot as plt 
import seaborn
%matplotlib inline

# in case some packages are not properly installed
nltk.download('gutenberg')
nltk.download('reuters')
nltk.download('stopwords')
nltk.download("punkt")

from tools import DocumentSequence,DocumentEmbedder

## Save and load file
import pickle as pkl

def get_file(path):
    print('Trying to load file at:{}'.format(path))
    try:
        with open(path, "rb") as f:
            item = pkl.load(f)
    except FileNotFoundError as e:
        print("unable to load {}, see stack trace below".format(path))
        print("double check that you have the file saved {}".format(path))
        print(e)
        return None
    print('Loading success')
    return item

def save_file(path, file):
    with open(path,"wb") as f:
        print("Storing item in {}".format(save_embeddings_path))
        pkl.dump(file,f)
        print("Item stored")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load the dataset into pandas: fake_or_real_news.csv

In [19]:
import pandas as pd
import numpy as np
import os
from nltk.corpus import stopwords
from string import punctuation

# load the raw data set and Google pretrained w2v model
df = pd.read_csv("./fake_or_real_news.csv")
pretrained = "./pretrained/GoogleNews-vectors-negative300.bin"

# obtain the raw label data    {'FAKE':1, 'REAL':0}
def trans_labels(labels):           
    for idx in range(len(labels)):
        if labels[idx] == 'FAKE':
            labels[idx] = 1
        else:
            labels[idx] = 0
    return np.array(labels, dtype=int)

labels = trans_labels(df['label'].values)

df.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label,title_vectors
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1,[ 1.1533764e-02 4.2144405e-03 1.9692603e-02 ...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,[ 0.11267698 0.02518966 -0.00212591 0.021095...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,[ 0.04253004 0.04300297 0.01848392 0.048672...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,[ 0.10801624 0.11583211 0.02874823 0.061732...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,[ 1.69016439e-02 7.13498285e-03 -7.81233795e-...


#### Tokenize and clean raw data

In [24]:
# obtain the raw news texts and titles
raw_texts = df['text'].values
raw_title = df['title'].values

# build two instances for preprocessing raw data
texts = DocumentSequence(raw_texts, clean=True, sw=stopwords.words('english'), punct=punctuation)
titles = DocumentSequence(raw_title, clean=True, sw=stopwords.words('english'), punct=punctuation)

# save these two data
save_file('./pretrained/texts.pkl',texts)
save_file('./pretrained/titles.pkl',titles)

converting raw docs into tokens
cleaning up stopwords and punctuations
all tokens to be skipped are: {'why', 'didn', 'he', 'mustn', 'did', 'being', 'hadn', 'yours', 'whom', 'same', '^', '=', 'too', 'not', 'from', 'again', 'then', 'ours', ';', 'they', 'in', 'aren', "didn't", "you're", 'them', '!', '/', 'during', 'between', "weren't", 'should', 'doing', 'y', 'through', 'the', 'because', "shan't", 'up', 'isn', 't', "hadn't", 'me', 'herself', 'than', 'just', 'both', 'doesn', 'yourselves', 'with', 'we', "you'll", 'any', '"', '>', 'she', "you'd", 'its', 'over', 'm', 'now', 'which', 'where', "couldn't", '+', 'an', 'shouldn', 'shan', "wouldn't", 'very', 'their', 'll', '<', 'have', 'there', 'these', '|', ',', 'will', 'after', "wasn't", 'was', 'once', 'nor', "mustn't", 'does', 'his', "doesn't", 'that', '&', 'into', 'my', 'by', 'this', 'when', 'as', 'needn', 'her', 's', 'under', 'ain', 'down', "aren't", '-', ':', '`', 'can', 'theirs', 'i', 'be', 'won', 'of', 'ourselves', "you've", 'before', 'some

# Get embeddings
#### How do we get embeddings:
1. Text:   

| Embeddings | Parameters | 
| ------ | ------ | 
| Dov2Vec | (Min_count = 5,Winsize = 13, DBOW/DM) | 
| Naive Doc2Vec | Normalizer = L2/Mean/None | 
| One-Hot Sum |(Rawcount/TF-IDF, Normalized/None) | 
| Attention is all you need | To be implemented |
| FastText | To be implemented |
            
2. Title:   
    .....  
3. Title concatenated with Text: Concatenate Title_d2v and Text_d2v together(for D2V)





In [None]:
# build two instances for producing document embeddings
text_embedder = DocumentEmbedder(texts, pretrained_word2vec=pretrained)
titles_embedder = DocumentEmbedder(titles, pretrained_word2vec=pretrained)

# vectors_size: Number of dimensions for the embedding model
# window: Number of context words to observe in each direction within a document
# min_count: Minimum frequency for words included in model
# dm (distributed memory): '0' indicates DBOW model; '1' indicates DM
# epoches: Number of epochs to train the model for
text_embeddings = text_embedder.get_doc2vec(vectors_size=300,
                                            window=13,
                                            min_count=5,
                                            dm=0,
                                            epochs=100)

title_embeddings = titles_embedder.get_doc2vec(vectors_size=300,
                                               window=13,
                                               min_count=5,
                                               dm=0,
                                               epochs=100)

# if the embeddings is in a list, stack them into a 2-D numpy array
def trans_list_to_array(embeddings):
    if isinstance(embeddings, list): 
        try:
            embeddings = np.stack(emb if isinstance(emb, np.ndarray) else np.zeros(300) for emb in embeddings)
        except ValueError as e:
            print(e)
    return embeddings

# change text_embeddings and title_embeddings into 2-D numpy array
text_embeddings = trans_list_to_array(text_embeddings)
title_embeddings = trans_list_to_array(title_embeddings)

# concatenate text matrix and title matrix as a whole for training
news_embeddings = np.concatenate((title_embeddings, text_embeddings), axis=1)

Save embeddings 

In [23]:
import pickle as pkl
import os

# store the d2v model in files
save_embeddings_path_tail = "d2v(vecsize={}, winsize={}, mincount={}, {}, epochs={}).pkl".format(
    300, 13, 5, "dbow", 100)

# store the text_embeddings in files
save_embeddings_path = "./pretrained/text-" + save_embeddings_path_tail
save_file(save_embeddings_path,text_embeddings)

# store the title_embeddings in files
save_embeddings_path = "./pretrained/title-" + save_embeddings_path_tail
save_file(save_embeddings_path,title_embeddings)

# store the text_embeddings in files
save_embeddings_path = "./pretrained/title_text-" + save_embeddings_path_tail
save_file(save_embeddings_path,news_embeddings)

# store the labels in files
save_labels_path = "./pretrained/labels.pkl"
save_file(save_labels_path,labels)

# # store the d2v model in files
# save_embeddings_path_tail = "d2v(vecsize={}, winsize={}, mincount={}, {}, epochs={}).pkl".format(
#     300, 13, 5, "dbow", 100)

# # get the text_embeddings in files
# save_embeddings_path = "./pretrained/text-" + save_embeddings_path_tail
# text_embeddings = get_file(save_embeddings_path)

# # get the title_embeddings in files
# save_embeddings_path = "./pretrained/title-" + save_embeddings_path_tail
# title_embeddings = get_file(save_embeddings_path)

# # get the text_embeddings in files
# save_embeddings_path = "./pretrained/title_text-" + save_embeddings_path_tail
# news_embeddings = get_file(save_embeddings_path)


Storing item in ./pretrained/title_text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl
Item stored
Trying to load file at:./pretrained/text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl
Loading success
Trying to load file at:./pretrained/title-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl
Loading success
Trying to load file at:./pretrained/title_text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl
Loading success


#### Visualizing the news embeddings
(with visualize_embeddings.embedding_visualizer provided by Shuheng Liu. 
Tensorflow needed)

In [58]:
from embedding_visualizer import visualize_embeddings

# visualize the news embeddings in the graph
# MUST run in command line "tensorboard --logdir visual/" and visit localhost:6006 to see the visualization
visualize_embeddings(embedding_values=news_embeddings, label_values=labels)

currently setting metadata_path to metadata.tsv. Due to tensorboard version reasons, if prompted 'metadata not found' when visiting tensorboard server page, please manually edit metadata_path in projector_config.pbtxt to visual\metadata.tsv or the absolute path for `metadata.tsv` and restart tensorboard
If your tensorboard version is 1.7.0, you probably should not worry about this
Embeddings are available now. Please start your tensorboard server with commandline `tensorboard --logdir visual` and visit http://localhost:6006 to see the visualization


In [None]:
!tensorboard --logdir visual/

### 2D visualizing 
- Red:Fake 
- Blue:Real

![a](resources/T-SNE_2D.jpg)

### 3D visualizing 
- Red:Fake 
- Blue:Real

![a](resources/T-SNE_3D.jpg)

# Classification process

### For Doc2Vec:

#### Split the dataset (with 75% of data for 5-fold Randomsearching, 25% for testing)

In [22]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection._search import BaseSearchCV
import pickle as pkl

# load pretrained data
news_embeddings = get_file("./pretrained/title_text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl")
labels = get_file("./pretrained/labels.pkl")

# perform the split which gets us the train data and the test data
news_train, news_test, labels_train, labels_test = train_test_split(news_embeddings, labels,
                                                                    test_size=0.25,
                                                                    random_state=0,
                                                                    stratify=labels)

Trying to load file at:./pretrained/title_text-d2v(vecsize=300, winsize=13, mincount=5, dbow, epochs=100).pkl
Loading success
Trying to load file at:./pretrained/labels.pkl
Loading success


#### Classifier score and comparement 
We used RandomSearch on different datasets to get the best hyper-parameters.    
The following exhibits every classifier with almost optimal parameters in our experiments.   
The RandomSearch process is omitted.

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint
from scipy.stats.distributions import uniform
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np

# MLP classifier
mlp = MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.8,
                    beta_2=0.9, early_stopping=False, epsilon=1e-08,
                    hidden_layer_sizes=(600, 300), learning_rate='constant',
                    learning_rate_init=0.0001, max_iter=200, momentum=0.9,
                    nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
                    solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
                    warm_start=False)

# KNN classifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
                           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
                           weights='distance')

# QDA classifier
qda = QuadraticDiscriminantAnalysis(priors=np.array([0.5, 0.5]),
                                    reg_param=0.6531083254653984, store_covariance=False,
                                    store_covariances=None, tol=0.0001)

# GDB classifier
gdb = GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                 learning_rate=0.1, loss='exponential', max_depth=10,
                                 max_features='log2', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=0.0012436966435001434,
                                 min_samples_split=100, min_weight_fraction_leaf=0.0,
                                 n_estimators=200, presort='auto', random_state=0,
                                 subsample=0.8, verbose=0, warm_start=False)

# SVC classifier
svc = SVC(C=0.8, cache_size=200, class_weight=None, coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
          max_iter=-1, probability=False, random_state=0, shrinking=True,
          tol=0.001, verbose=False)

# GNB classifier
gnb = GaussianNB(priors=None)

# RF classifier
rf = RandomForestClassifier(bootstrap=False, class_weight=None,
                            criterion='entropy', max_depth=10, max_features=7,
                            max_leaf_nodes=None, min_impurity_decrease=0.0,
                            min_impurity_split=None, min_samples_leaf=9,
                            min_samples_split=6, min_weight_fraction_leaf=0.0,
                            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
                            verbose=0, warm_start=False)


# All the parameters of the classifiers above are optimal in our experiments
# The list below is used to store every classifier instance
classifiers_list = [mlp, knn, qda, gdb, svc, gnb, rf]

### Histogram of scores achieved by different classifiers

![a](resources/models_with_best_performance.jpg)

In [8]:
from sklearn.metrics import classification_report

# print details of testing results
for model in classifiers_list:
    model.fit(news_train, labels_train)
    labels_pred = model.predict(news_test)
    
    # Report the metrics
    target_names = ['Real', 'Fake']
    print(str(model))
    print(classification_report(y_true=labels_test, y_pred=labels_pred, target_names=target_names, digits=3))

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.8,
       beta_2=0.9, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(600, 300), learning_rate='constant',
       learning_rate_init=0.0001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
             precision    recall  f1-score   support

       Fake      0.944     0.932     0.938       791
       Real      0.933     0.945     0.939       793

avg / total      0.938     0.938     0.938      1584

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='distance')
             precision    recall  f1-score   support

       Fake      0.907     0.814     0.858       791
       Real      0.832     0.917     0.872       793

avg / total      0.869     0.