In [15]:
#Imports
import os
import pandas as pd
import nltk
import string
import gensim
import timeit
import numpy as np
from gensim import corpora, models, similarities

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

import math

from matplotlib import pyplot as plt

import nltk
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk import FreqDist
from nltk import bigrams
from nltk import word_tokenize
from nltk import tokenize

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

ImportError: cannot import name 'Doc2Vec'

### Reading and Cleaning

In [None]:
fakenews_df = pd.read_csv("fake.csv")
fake_real_df = pd.read_csv("fake_and_real_news.csv")

In [None]:
len(fakenews_df)

In [None]:
len(fake_real_df)

In [None]:
fakenews_df =  fakenews_df.drop(['author','domain_rank','main_img_url'], axis=1) 

In [None]:
fakenews_df = fakenews_df.dropna(subset=['title','text'])

In [None]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token
                token = token.strip('»') if self.strip else token
                token = token.strip('…') if self.strip else token
                token = token.strip('“') if self.strip else token
                token = token.strip('’') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue
                  
                # If digit, ignore token and continue
                if token.isdigit():
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

preprocessor = NLTKPreprocessor()

In [None]:
fakenews_df['pre_title'] = preprocessor.fit_transform([title for title in fakenews_df.title])

In [None]:
fakenews_df['pre_text'] = preprocessor.fit_transform([text for text in fakenews_df.text])

In [None]:
fake_real_df['pre_title'] = preprocessor.fit_transform([title for title in fake_real_df.title])

In [None]:
fake_real_df['pre_text'] = preprocessor.fit_transform([text for text in fake_real_df.text])

In [None]:
fakenews_df.to_csv('fakenews_processed.csv')

In [None]:
fake_real_df.to_csv('fake_real_processed.csv')

### Create text corpus for training word2vec

In [3]:
# Reload saved processed data
fake_real_df = pd.read_csv("src/fake_real_processed.csv")
fakenews_df = pd.read_csv("src/fakenews_processed.csv")

In [None]:
raw_fake = [x for x in fakenews_df['text']]

In [None]:
raw_fake = "".join(raw_fake)

In [None]:
# Token setting 
pattern = r'''(?x)  # set flag to allow verbose regexps
(?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
|\w+(?:[-']\w+)*    # words with optional internal hyphens
|\$?\d+(?:\.\d+)?   # currency, e.g. $12.80 
|\.\.\.             # elipses
|[.,;"'?()-_`]      # these are separate tokens
'''

In [None]:
tokenized_raw =" ".join( nltk.regexp_tokenize(raw_fake,pattern))

In [None]:
tokenized_raw=tokenize.sent_tokenize(tokenized_raw)

In [None]:
nopunct=[]
for sent in tokenized_raw:
        a = [w for w in sent.split() if w not in string.punctuation]
        nopunct.append(" ".join(a))

#word list
tok_corp = [nltk.word_tokenize(sent) for sent in nopunct]

In [None]:
# Unique Words in text
combined_list = [" ".join(w) for w in tok_corp]
unique_list = []
for sent in combined_list:
    unique_list.append([w for w in sent.split()])
unique_list=list(set([item for sublist in unique_list for item in sublist]))

In [None]:
# training gensim on corpus
fake_model = gensim.models.Word2Vec(tok_corp, min_count=1, size = 16, window=5)

In [None]:
# fake_model.save('fake_model.model')
fake_model = gensim.models.Word2Vec.load('src/fake_model.model')

In [None]:
google_model = gensim.models.KeyedVectors.load_word2vec_format('src/GoogleNews-vectors-negative300.bin', binary=True)

### Fun with Vectors

In [None]:
fake_model.most_similar(positive=['sanders'], topn=5)

In [None]:
# Vector Similarities data_frame
vectors_df = pd.read_csv('similarities.csv')

In [None]:
# TODO: write text process to compare, print embeddings

model = fake_model

for index, row in vectors_df.iterrows():

    # if positive not null
    positive = row[0].split(',')
    
    positive = list(map(str.strip, positive))
    
    # if negative not null
    if pd.notnull(row[1]):
        negative = row[1].split(',')
        results = model.most_similar(positive=positive, negative=negative, topn=5)
        # negative array = negative
        print("Positive Words: {}. Negative Words: {}.".format(', '.join([x for x in positive]), ', '.join([x for x in negative])))
        print(*results, sep='\n')
    else:
        results = model.most_similar(positive=positive, topn=5)
        print("Positive Words: {}.".format(', '.join([x for x in positive])))
#         print("Results:{} \n".format(results))
        print(*results, sep='\n')
        
    
#     for result in results:
#         print(result)
    



In [None]:
google_model.most_similar(positive=['King',"woman"],negative=[], topn=5)

### Dimensionality Reduction

In [None]:
## List of Vectors for unique words in text
vectors_list = []
for word in unique_list:
    vectors_list.append(fake_model[word])

In [None]:
# BUG: kernel crashes with too many vectors (most tried: ~20,000)
vectors_list

In [None]:
# Subsample vectors list 1/10
vectors_list_sample = vectors_list[::10]

In [None]:
data_embed=TSNE(n_components=2, perplexity=50, verbose=2, method='barnes_hut').fit_transform(vectors_list_sample)

In [None]:
x_axis = data_embed[:,0]
y_axis = data_embed[:,1]

In [None]:
plt.scatter(x_axis, y_axis, s=1)

In [None]:
plt.show()

### doc2vec

https://medium.com/@klintcho/doc2vec-tutorial-using-gensim-ab3ac03d3a1

In [4]:
# fake_real_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,text,label,pre_title,pre_text
0,0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,"['smell', 'hillary', 'fear']","['daniel', 'greenfield', 'shillman', 'journali..."
1,1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,"['watch', 'exact', 'moment', 'paul', 'ryan', '...","['google', 'pinterest', 'digg', 'linkedin', 'r..."
2,2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,"['kerry', 'go', 'paris', 'gesture', 'sympathy']","['u', 'secretary', 'state', 'john', 'f', 'kerr..."
3,3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,"['bernie', 'supporter', 'twitter', 'erupt', 'a...","['—', 'kaydee', 'king', 'kaydeeking', 'novembe..."
4,4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,"['battle', 'new', 'york', 'primary', 'matter']","['primary', 'day', 'new', 'york', 'front', 'ru..."


In [9]:
# Create Labels
docLabels = fake_real_df['title'].tolist()

In [11]:
# Get Data
data = fake_real_df['text'].tolist()

In [25]:
class DocIterator(object):
    def __init__(self, doc_list, labels_list):
       self.labels_list = labels_list
       self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield DocIterator(words=doc.split(),labels=[self.labels_list[idx]])

In [26]:
# iterator object
it = DocIterator(data, docLabels)

In [29]:
model.build_vocab(it)

2017-04-30 20:04:30,268 : INFO : collecting all words and their counts


TypeError: __init__() got an unexpected keyword argument 'words'

In [56]:
def read_corpus(doc, tokens_only=False):
#     for i, line in enumerate(doc.split('\n')):
    for i, line in enumerate(doc):
        if tokens_only:
            yield gensim.utils.simple_preprocess(line)
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [72]:
train_corpus = list(read_corpus(data))

In [95]:
model = gensim.models.doc2vec.Doc2Vec(size=16, min_count=2, iter=55)

2017-04-30 21:58:56,903 : DEBUG : Fast version of gensim.models.word2vec is being used


In [96]:
model.build_vocab(train_corpus)

2017-04-30 21:58:59,722 : INFO : collecting all words and their counts
2017-04-30 21:58:59,724 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-04-30 21:59:00,967 : INFO : collected 65388 word types and 6335 unique tags from a corpus of 6335 examples and 4746764 words
2017-04-30 21:59:00,968 : INFO : Loading a fresh vocabulary
2017-04-30 21:59:01,086 : INFO : min_count=2 retains 42804 unique words (65% of original 65388, drops 22584)
2017-04-30 21:59:01,087 : INFO : min_count=2 leaves 4724180 word corpus (99% of original 4746764, drops 22584)
2017-04-30 21:59:01,198 : INFO : deleting the raw counts dictionary of 65388 items
2017-04-30 21:59:01,200 : INFO : sample=0.001 downsamples 39 most-common words
2017-04-30 21:59:01,201 : INFO : downsampling leaves estimated 3756542 word corpus (79.5% of prior 4724180)
2017-04-30 21:59:01,202 : INFO : estimated required memory for 42804 words and 16 dimensions: 27286352 bytes
2017-04-30 21:59:01,385 : INFO : res

In [97]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)

2017-04-30 21:59:01,966 : INFO : training model with 3 workers on 42804 vocabulary and 16 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-04-30 21:59:01,969 : DEBUG : queueing job #0 (9794 words, 15 sentences) at alpha 0.02500
2017-04-30 21:59:01,971 : DEBUG : queueing job #1 (9774 words, 11 sentences) at alpha 0.02500
2017-04-30 21:59:01,973 : DEBUG : queueing job #2 (9422 words, 20 sentences) at alpha 0.02500
2017-04-30 21:59:01,976 : DEBUG : queueing job #3 (9441 words, 18 sentences) at alpha 0.02500
2017-04-30 21:59:01,982 : DEBUG : queueing job #4 (9383 words, 13 sentences) at alpha 0.02500
2017-04-30 21:59:01,987 : DEBUG : queueing job #5 (9753 words, 9 sentences) at alpha 0.02499
2017-04-30 21:59:01,994 : DEBUG : queueing job #6 (9796 words, 12 sentences) at alpha 0.02499
2017-04-30 21:59:02,004 : DEBUG : queueing job #7 (9409 words, 12 sentences) at alpha 0.02499
2017-04-30 21:59:02,006 : DEBUG : queueing job #8 (9666 words, 16 sentences) at alpha 0.02499
2017-0

CPU times: user 8min 7s, sys: 21.2 s, total: 8min 28s
Wall time: 4min 14s


206427608

In [98]:
len(model.docvecs)

6335

In [99]:
len(fake_real_df)

6335

In [100]:
model.save('fake_real_docVecs.model')

2017-04-30 22:32:06,852 : INFO : saving Doc2Vec object under fake_real_docVecs.model, separately None
2017-04-30 22:32:06,856 : INFO : not storing attribute syn0norm
2017-04-30 22:32:06,857 : INFO : not storing attribute cum_table
2017-04-30 22:32:07,315 : INFO : saved fake_real_docVecs.model


In [101]:
vectors = list(model.docvecs)

In [103]:
vectors_df = pd.DataFrame(vectors)

In [106]:
vectors_df.to_csv('fake_real_docVecs.csv')

In [92]:
fake_real_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,text,label,pre_title,pre_text,vectors
0,0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,"['smell', 'hillary', 'fear']","['daniel', 'greenfield', 'shillman', 'journali...","[1.86644, -0.600091, -1.89861, 0.0277527, 0.97..."
1,1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,"['watch', 'exact', 'moment', 'paul', 'ryan', '...","['google', 'pinterest', 'digg', 'linkedin', 'r...","[1.7042, -2.41558, -0.794482, 1.08904, 0.95250..."
2,2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,"['kerry', 'go', 'paris', 'gesture', 'sympathy']","['u', 'secretary', 'state', 'john', 'f', 'kerr...","[-0.576178, 1.15601, 0.818085, 0.744736, 2.363..."
3,3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,"['bernie', 'supporter', 'twitter', 'erupt', 'a...","['—', 'kaydee', 'king', 'kaydeeking', 'novembe...","[-1.77581, -2.02436, -0.876729, 1.64908, 1.490..."
4,4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,"['battle', 'new', 'york', 'primary', 'matter']","['primary', 'day', 'new', 'york', 'front', 'ru...","[0.251354, 1.65583, 0.773402, 0.804722, 0.0867..."
5,5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE,"['tehran', 'usa']","['immigrant', 'grandparent', 'year', 'ago', 'a...","[-2.99305, 1.40977, -4.13671, 0.155265, 1.9998..."
6,6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE,"['girl', 'horrified', 'watch', 'boyfriend', 'l...","['share', 'baylee', 'luciani', 'leave', 'scree...","[3.45064, -0.75968, -2.12559, -0.445087, -1.10..."
7,7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL,"['‘', 'britain', 'schindler', 'die']","['czech', 'stockbroker', 'save', 'jewish', 'ch...","[-0.358347, -0.522781, -1.37426, -0.234077, -0..."
8,8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL,"['fact', 'check', 'trump', 'clinton', 'command...","['hillary', 'clinton', 'donald', 'trump', 'mak...","[-2.18008, 1.57515, 0.672199, 0.308268, -0.253..."
9,9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL,"['iran', 'reportedly', 'make', 'new', 'push', ...","['iranian', 'negotiator', 'reportedly', 'make'...","[-0.130085, 2.36362, 0.682668, -5.05137, 1.119..."


In [93]:
fake_real_df['vectors'][0]

array([ 1.86644268, -0.60009092, -1.89860678,  0.02775274,  0.97697639,
        1.66069853,  2.04857254, -0.27863249, -2.39358354, -0.0135734 ,
       -2.2194562 , -0.13207346,  0.69616896,  1.96375251, -1.24584687,
       -0.73789799, -1.81084371,  0.97853881, -3.71052456, -1.06838608,
       -0.45857069,  0.68638921, -1.29756284, -4.31388092, -1.13982379,
       -2.14487696,  0.44357163, -0.74665552,  0.0655511 , -0.07896062,
        1.28122175,  1.04448283,  1.52298987,  2.38131142,  0.38440508,
       -1.23368347,  0.32400516, -0.48490149,  3.427351  , -2.53891754,
        1.05500352,  1.43910635, -2.25527668, -4.34939003,  2.06018043,
       -4.25589561,  1.8297224 , -1.77836549,  3.15529895, -0.95061392,
       -0.40989038, -1.35180902, -0.24904236,  1.17847133, -2.36190128,
        1.48101723, -0.04179225, -1.57934654,  1.37584603, -0.93348432], dtype=float32)