In [7]:
import pandas as pd
import numpy as np 
import re
% matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter

In [8]:
import pickle
import time

In [9]:
import spacy
nlp = spacy.load('en')

In [4]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [6]:
# Update puncuation list in spacy
nlp.vocab["$"].is_punct = True
nlp.vocab["|"].is_punct = True
nlp.vocab["+"].is_punct = True
nlp.vocab["<"].is_punct = True
nlp.vocab[">"].is_punct = True
nlp.vocab["="].is_punct = True
nlp.vocab["^"].is_punct = True
nlp.vocab["`"].is_punct = True
nlp.vocab["~"].is_punct = True

### Load in Models

In [10]:
model = KeyedVectors.load_word2vec_format('~/Documents/Capstone/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
trigram = pickle.load(open('trigram2.sav', 'rb'))

In [49]:
lr_model = pickle.load(open('lr_model.sav', 'rb'))
nb_model = pickle.load(open('tfidf_nb_model.sav', 'rb'))
rf_model = pickle.load(open('tfidf_rf_model.sav', 'rb'))
svm_model = pickle.load(open('tfidf_svm_model.sav', 'rb'))
wv_lr_model = pickle.load(open('wv_lr_model.sav', 'rb'))
wv_rf_model = pickle.load(open('wv_rf_model.sav', 'rb'))
wv_svm_model = pickle.load(open('wv_svm_model.sav', 'rb'))

### Write Function to Determine Fake vs. Real

In [192]:
# Read in article text
with open("C:/Users/jgoldste/Documents/Capstone/code/fake_news_test8.txt", 'r') as f:
    article = ""
    for l in f:
        l = l.rstrip()
        article += str(l)

In [194]:
# See article text
article

"?Police have identified James Alex Fields as the driver of the car that smashed into a crowd of anti-fascist protesters and another vehicle during Charlottesville rallies. Fields faces second degree murder charges for killing a woman and injuring 19 other people. The 20-year-old man was taken into custody following the deadly crash on Saturday afternoon. The suspect has been charged with one count of second degree murder, three counts of malicious wounding and one count of failure to prevent a hit-and-run incident.                Although police have not disclosed any details concerning the prime suspect’s identity, media reported that he was the owner of the grey Dodge Challenger, a vehicle that was filmed by the witnesses speeding up and plowing into the crowd of anti-fascist protesters who flocked to downtown Charlottesville on Saturday to oppose a large far-right rally there.           The guy arrested over the car plowing incident isn't even on the record as owning the Dodge Chal

In [177]:
# helper function to eliminate tokens that are pure punctuation, whitespace, or stopword
# can be updated based on desired filtering 

def process_txt(token):
    return token.is_punct or token.is_space or token.is_stop or token.like_num

In [178]:
# function to take array of articles and turn them into nested list of tokens

def lemmatize_txt(array):
    lemma = []
    
    doc = nlp(array)

    lemma.append([n.lemma_ for n in doc if not process_txt(n)])
    
    return lemma

In [179]:
# function to recombine nested list of tokens into full articles 

def lemma_combine(lis):
    parsed_articles = []
    
    for i in range(len(lis)):
        concat_art = ' '.join(lis[i])
        parsed_articles.append(concat_art)
    
    return parsed_articles

In [180]:
def infer_vector(text):
    text = text.split()
    vector = []
    for i in text:
        try:
            vector.append(model.word_vec(i))
        except:
            pass
    return list(np.mean(vector, axis=0))

In [181]:
def fake_detector(article):
    article = re.sub('[^\x00-\x7F]+', "", article) #remove non-ascii characters
    article = re.sub('(\r)+', "",  article) #remove newline characters
    article = re.sub(r'@([A-Za-z0-9_]+)', "",  article) # remove twitter handles
    article = re.sub(r"(https|http)\S+", "",  article) # remove hyperlinks

    # tokenize article text
    lem = lemmatize_txt(article)
    
    # add text to trigram model
    trigram.add_vocab(lem)
    trigram_lem = list(trigram[lem])
    
    # recombine tokens
    tri_lem_comb = lemma_combine(trigram_lem)
    
    # format text for w2v model
    tri_lem = pd.Series([x for x in tri_lem_comb])
    vec_text = tri_lem.apply(infer_vector)
    vec_text = pd.DataFrame(list(map(lambda x: list(x), vec_text)))
    
    # make predictions
    lr_predicted = lr_model.predict(tri_lem_comb)
    nb_predicted = nb_model.predict(tri_lem_comb)
    rf_predicted = rf_model.predict(tri_lem_comb)
    svm_predicted = svm_model.predict(tri_lem_comb)
    wv_lr_predicted = wv_lr_model.predict(vec_text)
    wv_rf_predicted = wv_rf_model.predict(vec_text)
    wv_svm_predicted = wv_svm_model.predict(vec_text)

    print ("TFIDF NB Prediction:" , lr_predicted)
    print ("TFIDF LR Prediction:" , nb_predicted)
    print ("TFIDF RF Prediction:" , rf_predicted)
    print ("TFIDF SVM Prediction:" , svm_predicted)
    print ("W2V LR Prediction:" , wv_lr_predicted)
    print ("W2V RF Prediction:" , wv_rf_predicted)
    print ("W2V SVM Prediction:" , wv_svm_predicted)

In [182]:
fake_detector(article) # predictions for real article about Hurricane Irma



TFIDF NB Prediction: [0]
TFIDF LR Prediction: [0]
TFIDF RF Prediction: [0]
TFIDF SVM Prediction: [0]
W2V LR Prediction: [0]
W2V RF Prediction: [0]
W2V SVM Prediction: [0]


In [163]:
fake_detector(article) # predictions for real article about Brexit



TFIDF NB Prediction: [0]
TFIDF LR Prediction: [0]
TFIDF RF Prediction: [0]
TFIDF SVM Prediction: [0]
W2V LR Prediction: [0]
W2V RF Prediction: [0]
W2V SVM Prediction: [0]


In [188]:
fake_detector(article) # predictions for fake article about Clinton foundation



TFIDF NB Prediction: [1]
TFIDF LR Prediction: [1]
TFIDF RF Prediction: [1]
TFIDF SVM Prediction: [1]
W2V LR Prediction: [1]
W2V RF Prediction: [0]
W2V SVM Prediction: [1]


In [191]:
fake_detector(article) # predictions for fake article about Seth Rich conspiracy



TFIDF NB Prediction: [1]
TFIDF LR Prediction: [1]
TFIDF RF Prediction: [1]
TFIDF SVM Prediction: [1]
W2V LR Prediction: [1]
W2V RF Prediction: [1]
W2V SVM Prediction: [1]


In [193]:
fake_detector(article) # predictions for fake article about Charlottesville driver



TFIDF NB Prediction: [1]
TFIDF LR Prediction: [0]
TFIDF RF Prediction: [1]
TFIDF SVM Prediction: [1]
W2V LR Prediction: [1]
W2V RF Prediction: [0]
W2V SVM Prediction: [1]
