In [2]:
import pandas as pd
import numpy as np 
import itertools as it
% matplotlib inline
import matplotlib.pyplot as plt

In [3]:
import pickle
import time

In [4]:
from gensim.models import Word2Vec



In [5]:
from gensim.models.keyedvectors import KeyedVectors
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [None]:
# Load the pre-trained Google News vectors (binary word2vec format).
# `Word2Vec.load_word2vec_format(..., norm_only=True)` belongs to an older
# gensim API than the `.wv` accessor used originally; KeyedVectors is the
# loader that goes with the newer API.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Equivalent of the former `norm_only=True`: L2-normalise the vectors in place.
model.init_sims(replace=True)

# Attribute names differ between gensim releases
# (index2word/syn0 in older ones, index_to_key/vectors in newer ones).
vocab = getattr(model, 'index_to_key', None)
if vocab is None:
    vocab = model.index2word
vectors = getattr(model, 'vectors', None)
if vectors is None:
    vectors = model.syn0

# Plain dict: word -> embedding vector, consumed by MeanEmbeddingVectorizer.
w2v = dict(zip(vocab, vectors))

In [6]:
# Read the cleaned news data set; the first CSV column is used as the index.
clean_final = pd.read_csv(
    "clean_final_news.csv",
    index_col=0,
    encoding="utf8",
)

In [10]:
# Article text column (assumes the CSV provides a `text` column).
text = clean_final["text"]

In [None]:
# Integer label: 0 when response is "Not fake", 1 for every other value.
clean_final["binary_response"] = np.where(
    clean_final["response"].eq("Not fake"),
    0,
    1,
)

In [None]:
# Target vector as a plain NumPy array.
y_response = clean_final["binary_response"].values

In [None]:
# Load the pickled trigram text; the context manager closes the file handle
# (the original left it open).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('trigram_text', 'rb') as fileObject:
    tri_lem_comb2 = pickle.load(fileObject)  ## Need to save uncombined version and load here instead

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document by the mean of its word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector. The dimensionality is taken
        from the first vector in the mapping.

    Raises
    ------
    ValueError
        If ``word2vec`` is empty, since the embedding size cannot be inferred.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors.
        # next(iter(...)) works on Python 2 and 3, unlike itervalues().next().
        try:
            first_vector = next(iter(word2vec.values()))
        except StopIteration:
            raise ValueError(
                "word2vec mapping is empty; cannot infer the embedding dimension"
            )
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """No-op fit; returns self so the object can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return one row per document in ``X``.

        Each document is an iterable of words. Words missing from the mapping
        are skipped; a document with no known words becomes a zero vector of
        length ``self.dim``.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
def evaluate_model(clf, actual, predicted):
    """Print evaluation diagnostics for a fitted classifier; return the test log-loss.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``.
    actual : true labels, compared against ``predicted``.
    predicted : predicted labels.

    Returns
    -------
    The log-loss computed on the notebook-level ``x_test`` / ``y_test``.

    Note
    ----
    The log-loss values are computed from the notebook-level ``x_train``,
    ``y_train``, ``x_test`` and ``y_test`` variables, not from ``actual`` /
    ``predicted``; those must exist before this function is called.
    """
    class_names = ["0", "1"]
    summary = metrics.classification_report(actual, predicted, target_names=class_names)
    conf_mat = confusion_matrix(actual, predicted)

    # Log-loss of the predicted probabilities on each split.
    loss_on_train = metrics.log_loss(y_train, clf.predict_proba(x_train))
    loss_on_test = metrics.log_loss(y_test, clf.predict_proba(x_test))

    for label, value in (
        ("Train score:", loss_on_train),
        ("Test score:", loss_on_test),
        ("Confusion Matrix\n", conf_mat),
        ("Classification Report:\n", summary),
    ):
        print(label, value)

    return loss_on_test

In [None]:
# Hold out 20% of the documents for testing, stratified on the label.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    tri_lem_comb2,
    y_response,
    test_size=0.2,
    stratify=y_response,
    random_state=42,
)

In [None]:
# Mean word2vec features -> [0, 1] scaling -> multinomial naive Bayes.
# MultinomialNB refuses negative feature values, and averaged word vectors
# contain them, so the features are rescaled before the classifier.
w2v_pipe_nb = Pipeline([
    ('w2c', MeanEmbeddingVectorizer(w2v)),
    ('scale', MinMaxScaler()),
    ('nb', MultinomialNB()),
])

In [None]:
# Fit the whole pipeline on the training split.
w2v_pipe_nb.fit(X=x_train, y=y_train)

In [None]:
# Hyper-parameter grid: only the naive Bayes smoothing term is searched.
parameters = {
    'nb__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001),
}

In [None]:
# Establish grid search instance and run it on the training split.
# The search was previously never fitted, so best_params_ and predict
# could not work on it.
wv_nb = GridSearchCV(w2v_pipe_nb, parameters, n_jobs=-1, scoring='f1')
wv_nb.fit(x_train, y_train)

In [None]:
# Show the hyper-parameter combination selected by the grid search.
# NOTE(review): best_params_ is only available once wv_nb has been fitted.
wv_nb.best_params_

In [None]:
# Label the held-out documents with the grid search's predict method.
wv_nb_predicted = wv_nb.predict(X=x_test)

In [None]:
# Print log-loss, confusion matrix and classification report for the
# word2vec + naive Bayes model; the cell output is the returned test log-loss.
evaluate_model(clf=wv_nb, actual=y_test, predicted=wv_nb_predicted)