In [23]:
#Imports
import os
import pandas as pd
import nltk
import string
import gensim
import timeit
import numpy as np
from gensim import corpora, models, similarities

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

import nltk
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk import FreqDist
from nltk import bigrams
from nltk import word_tokenize
from nltk import tokenize

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

### Reading and Cleaning

In [2]:
fakenews_df = pd.read_csv("fake.csv")
fake_real_df = pd.read_csv("fake_and_real_news.csv")

In [3]:
len(fakenews_df)

12999

In [4]:
len(fake_real_df)

6335

In [5]:
fakenews_df =  fakenews_df.drop(['author','domain_rank','main_img_url'], axis=1) 

In [6]:
fakenews_df = fakenews_df.dropna(subset=['title','text'])

In [7]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token
                token = token.strip('»') if self.strip else token
                token = token.strip('…') if self.strip else token
                token = token.strip('“') if self.strip else token
                token = token.strip('’') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue
                  
                # If digit, ignore token and continue
                if token.isdigit():
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

preprocessor = NLTKPreprocessor()

In [8]:
fakenews_df['pre_title'] = preprocessor.fit_transform([title for title in fakenews_df.title])

In [9]:
fakenews_df['pre_text'] = preprocessor.fit_transform([text for text in fakenews_df.text])

In [11]:
fake_real_df['pre_title'] = preprocessor.fit_transform([title for title in fake_real_df.title])

In [12]:
fake_real_df['pre_text'] = preprocessor.fit_transform([text for text in fake_real_df.text])

In [15]:
fakenews_df.to_csv('fakenews_processed.csv')

In [16]:
fake_real_df.to_csv('fake_real_processed.csv')

### Create text corpus for training word2vec

In [37]:
# Reload saved processed data
fake_real_df = pd.read_csv("src/fake_real_processed.csv")
fakenews_df = pd.read_csv("src/fakenews_processed.csv")

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,text,label,pre_title,pre_text
0,0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,"['smell', 'hillary', 'fear']","['daniel', 'greenfield', 'shillman', 'journali..."
1,1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,"['watch', 'exact', 'moment', 'paul', 'ryan', '...","['google', 'pinterest', 'digg', 'linkedin', 'r..."
2,2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,"['kerry', 'go', 'paris', 'gesture', 'sympathy']","['u', 'secretary', 'state', 'john', 'f', 'kerr..."
3,3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,"['bernie', 'supporter', 'twitter', 'erupt', 'a...","['—', 'kaydee', 'king', 'kaydeeking', 'novembe..."
4,4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,"['battle', 'new', 'york', 'primary', 'matter']","['primary', 'day', 'new', 'york', 'front', 'ru..."


### Combine text into one big string

In [13]:
raw_fake = [x for x in fakenews_df['text']]

In [15]:
raw_fake = "".join(raw_fake)

In [10]:
# Token setting 
pattern = r'''(?x)  # set flag to allow verbose regexps
(?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
|\w+(?:[-']\w+)*    # words with optional internal hyphens
|\$?\d+(?:\.\d+)?   # currency, e.g. $12.80 
|\.\.\.             # elipses
|[.,;"'?()-_`]      # these are separate tokens
'''

In [18]:
tokenized_raw =" ".join( nltk.regexp_tokenize(raw_fake,pattern))

In [24]:
tokenized_raw=tokenize.sent_tokenize(tokenized_raw)

In [25]:
nopunct=[]
for sent in tokenized_raw:
        a = [w for w in sent.split() if w not in string.punctuation]
        nopunct.append(" ".join(a))

#word list
tok_corp = [nltk.word_tokenize(sent) for sent in nopunct]

In [26]:
# Unique Words in text
combined_list = [" ".join(w) for w in tok_corp]
unique_list = []
for sent in combined_list:
    unique_list.append([w for w in sent.split()])
unique_list=list(set([item for sublist in unique_list for item in sublist]))

In [27]:
# training gensim on corpus
fake_model = gensim.models.Word2Vec(tok_corp, min_count=1, size = 16, window=5)

2017-04-27 18:48:04,373 : DEBUG : Fast version of gensim.models.word2vec is being used
2017-04-27 18:48:04,376 : INFO : collecting all words and their counts
2017-04-27 18:48:04,378 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-27 18:48:04,440 : INFO : PROGRESS: at sentence #10000, processed 210655 words, keeping 21349 word types
2017-04-27 18:48:04,501 : INFO : PROGRESS: at sentence #20000, processed 432453 words, keeping 30710 word types
2017-04-27 18:48:04,548 : INFO : PROGRESS: at sentence #30000, processed 655137 words, keeping 35965 word types
2017-04-27 18:48:04,604 : INFO : PROGRESS: at sentence #40000, processed 864748 words, keeping 42263 word types
2017-04-27 18:48:04,662 : INFO : PROGRESS: at sentence #50000, processed 1080840 words, keeping 47208 word types
2017-04-27 18:48:04,721 : INFO : PROGRESS: at sentence #60000, processed 1298931 words, keeping 53473 word types
2017-04-27 18:48:04,772 : INFO : PROGRESS: at sentence #70000, proces

In [38]:
google_model = gensim.models.KeyedVectors.load_word2vec_format('src/GoogleNews-vectors-negative300.bin', binary=True)

2017-04-27 19:41:15,964 : INFO : loading projection weights from src/GoogleNews-vectors-negative300.bin
2017-04-27 19:42:00,191 : INFO : loaded (3000000, 300) matrix from src/GoogleNews-vectors-negative300.bin


In [42]:
model.most_similar(positive=['Clinton'], topn=5)

[('Hillary', 0.9294048547744751),
 ('campaign', 0.9005018472671509),
 ('Trump', 0.8691877722740173),
 ('Clintons', 0.8475222587585449),
 ('DNC', 0.8457835912704468)]

In [40]:
google_model.most_similar(positive=['truth'], topn=5)

[('truths', 0.6511818170547485),
 ('falsehood', 0.5787492990493774),
 ('veritas_Latin', 0.5751311779022217),
 ('Fatma_Trad_veiled', 0.5659777522087097),
 ('truthful', 0.5630574226379395)]

### doc2vec