In [13]:
import spacy
import wmd
from utils.ArticlesHandler import ArticlesHandler
from utils import Config
from utils import knn_similarities, solve, get_rate, accuracy, precision, recall, f1_score
import numpy as np

In [5]:
nlp = spacy.load('en_core_web_md')

In [9]:
config = Config(file='config')


articles = ArticlesHandler(config)
articles = articles.articles.original_articles

In [12]:
nlp_articles = {
    'fake': list(map(lambda a: nlp(a), articles['fake'])),
    'real': list(map(lambda a: nlp(a), articles['real'])),
}

In [86]:
all_articles = [('fake', a) for a in nlp_articles['fake']] + [('real', a) for a in nlp_articles['real']]

np.random.shuffle(all_articles)

labels, articles = zip(*all_articles)

In [87]:
all_labels = np.array([1 if label == 'real' else -1 for label in labels])

In [88]:
n_neighbours = 5
n_unknown = 150

In [97]:
graph = knn_similarities(articles, n_neighbours)

In [89]:
indexes = np.arange(0, len(labels), dtype=int)
to_remove = np.random.choice(indexes, n_unknown)
labels = all_labels.copy()
labels[to_remove] = 0

In [98]:
graph = np.array(graph)

In [91]:
beliefs = solve(graph, labels)

In [92]:
beliefs[beliefs > 0] = 1
beliefs[beliefs < 0] = -1

print(beliefs)

TP, TN, FP, FN = get_rate(beliefs, labels, all_labels)
acc = accuracy(TP, TN, FP, FN)
prec = precision(TP, FP)
rec = recall(TP, FN)
f1 = f1_score(prec, rec)
print("return int belief", beliefs)
print("labels correct", all_labels)
print("labels to complete", labels)
print("% Correct (accuracy, precision, recall, f1_score)", 100 * acc, prec * 100, rec * 100, f1 * 100)
print(100 * float(len(np.array(list(labels)) == 0.))/float(len(list(labels))), '% of labels')

[ 1.  1. -1.  1.  1. -1.  1.  1.  1. -1.  1. -1.  1.  1.  1. -1.  1. -1.
  1. -1.  1. -1. -1. -1.  1.  1. -1.  1.  1. -1.  1.  1.  1.  1.  1. -1.
 -1. -1.  1.  1. -1.  1. -1. -1.  1. -1. -1.  1.  1.  1. -1.  1. -1.  1.
 -1.  1.  1. -1.  1. -1. -1. -1.  1.  1.  1.  1. -1. -1.  1. -1. -1. -1.
  1.  1.  1.  1.  1.  1. -1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
 -1.  1.  1.  1. -1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.
  1.  1. -1.  1.  1. -1. -1.  1.  1.  1.  1.  1. -1.  1. -1.  1. -1.  1.
  1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1. -1.  1.  1.
  1.  1.  1.  1. -1. -1. -1.  1.  1.  1.  1.  1.  1. -1.  1.  1. -1. -1.
  1.  1. -1. -1. -1.  1.  1.  1.  1. -1.  1. -1. -1. -1.  1. -1.  1. -1.
 -1.  1.  1.  1. -1. -1. -1. -1. -1.  1. -1.  1.  1.  1.  1. -1. -1.  1.
  1.  1.  1.  1. -1. -1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.
  1. -1.  1.  1.  1. -1.  1.  1.  1.  1. -1. -1. -1. -1.  1.  1.  1.  1.
 -1. -1.  1.  1. -1. -1.  1. -1.  1.  1.  1. -1.  1

In [99]:
n_tries = 1000
n_unknown = 10

acc_mean = np.zeros(n_tries)
prec_mean = np.zeros(n_tries)
recall_mean = np.zeros(n_tries)
f1_mean = np.zeros(n_tries)

for k in range(n_tries):
    g = graph.copy()
    all_articles = [('fake', a) for a in nlp_articles['fake']] + [('real', a) for a in nlp_articles['real']]

    np.random.shuffle(all_articles)

    labels, articles = zip(*all_articles)
    all_labels = np.array([1 if label == 'real' else -1 for label in labels])

    indexes = np.arange(0, len(labels), dtype=int)
    to_remove = np.random.choice(indexes, n_unknown)
    labels = all_labels.copy()
    labels[to_remove] = 0

    beliefs = solve(g, labels)

    beliefs[beliefs > 0] = 1
    beliefs[beliefs < 0] = -1


    TP, TN, FP, FN = get_rate(beliefs, labels, all_labels)
    acc_mean[k] = accuracy(TP, TN, FP, FN)
    prec_mean[k] = precision(TP, FP)
    recall_mean[k] = recall(TP, FN)
    f1_mean[k] = f1_score(prec, rec)
    
print("% Correct (accuracy, precision, recall, f1_score)", 100 * acc_mean.mean(), prec_mean.mean() * 100, recall_mean.mean() * 100, f1 * 100)

% Correct (accuracy, precision, recall, f1_score) 48.27916666666666 41.25992063492064 47.980714285714285 61.016949152542374
