<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/scripts/compare_corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of Clustering with the Universal Sentence Encoder on Different Corpora


*   `title`: just the title
*   `content`: just the content
*   `text`: title + content




In [0]:
# %%capture
# # Install the latest Tensorflow version.
# !pip3 install --upgrade tensorflow-gpu
# # Install TF-Hub.
# !pip3 install tensorflow-hub
# !pip3 install seaborn
#
# NOTE: optional environment setup, left commented out so running all cells
# does not reinstall packages. To use it, uncomment the lines above (keeping
# `%%capture` as the first line of the cell).
# NOTE(review): seaborn is installed here but is not imported by the import
# cell of this notebook — confirm whether it is still needed.

In [0]:
# import statements 

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

from collections import Counter

## Import Data

In [2]:
# read the annotated evaluation articles into a data frame
eval_data = pd.read_csv(filepath_or_buffer="news_filter/data/eval_data.csv")

# display (number of rows, number of columns)
eval_data.shape

(200, 13)

In [3]:
# preview the first five rows of the evaluation set
eval_data.head(n=5)

Unnamed: 0.1,index,Unnamed: 0,id,title,publication,author,date,year,month,url,content,label,text
0,74496,77946,118473,"Chaos in the Family, Chaos in the State: The W...",National Review,Kevin D. Williamson,2016-03-17,2016.0,3.0,http://www.nationalreview.com/article/432876/d...,Michael Brendan Dougherty is bitter. I think t...,0,"Chaos in the Family, Chaos in the State: The W..."
1,71184,74592,113594,US Civil Rights Commission Will Observe Standi...,Buzzfeed News,Nidhi Subbaraman,2016-12-08,2016.0,12.0,https://web.archive.org/web/20161208153906/htt...,WASHINGTON — The US Commission on Civil Ri...,0,US Civil Rights Commission Will Observe Standi...
2,120205,123668,184574,"Venezuela hunts rogue helicopter attackers, Ma...",Reuters,Andrew Cawthorne and Victoria Ramirez,2017-06-29,2017.0,6.0,http://www.reuters.com/article/us-venezuela-po...,The Venezuelan government hunted on Wednesday...,0,"Venezuela hunts rogue helicopter attackers, Ma..."
3,128977,132440,199665,Fruit juice isn’t much better for you than sod...,Vox,Julia Belluz,2016/3/25,2016.0,3.0,http://www.vox.com/2016/3/25/11305614/soda-jui...,One of the biggest public health wins of rece...,0,Fruit juice isn’t much better for you than sod...
4,134837,138300,208223,Sessions won’t testify at congressional budget...,Washington Post,Sari Horwitz,2017-06-10,2017.0,6.0,https://web.archive.org/web/20170611000758/htt...,"Attorney General Jeff Sessions, who had agree...",1,Sessions won’t testify at congressional budget...


## Compare Corpora for the Universal Sentence Encoder

### Output Precision and Recall

In [0]:
# example user inputs; the topic at position k (0-based) is scored against annotation label k + 1
_DEFAULT_TOPICS = (
  "Russian interference with election",
  "Wikileaks hacking Hillary Clinton's email",
  "democratic campaigns",
  "Trump against Obamacare",
  "Trump's Campaign",
)

# encoders already loaded in this kernel, keyed by path, so repeated calls skip the reload
_ENCODER_CACHE = {}


def _load_encoder(model_path):
  """Return the encoder stored at `model_path`, loading it at most once per kernel."""
  if model_path not in _ENCODER_CACHE:
    _ENCODER_CACHE[model_path] = hub.load(model_path)
  return _ENCODER_CACHE[model_path]


def _top_n_columns(similarity, n):
  """Return, for every row, the column positions of its n largest values, best first."""
  # a stable sort on the negated scores keeps the earliest column first among ties,
  # which is the same order heapq.nlargest produces
  return np.argsort(-similarity, axis=1, kind="stable")[:, :n]


def prec_recall(corpus, n=10, input_topics=None, labels=None, model_path="news_filter/tmp"):
  """Average precision and recall of topic clusters built from sentence embeddings.

  Every document in `corpus` and every topic is embedded with the encoder stored
  at `model_path`. For each topic, the `n` documents with the highest cosine
  similarity form its cluster. The topic at position k (0-based) is scored
  against annotation label k + 1.

  Args:
    corpus: iterable of strings, one per article (e.g. a data frame column).
    n: how many articles per cluster.
    input_topics: iterable of topic strings; defaults to the five example topics.
    labels: annotation label of each article, aligned by position with `corpus`;
      defaults to `eval_data.label`.
    model_path: local folder holding the model downloaded from
      https://tfhub.dev/google/universal-sentence-encoder/4.

  Returns:
    Tuple `(avg_precision, avg_recall)`. Precision of a cluster is its true
    positives divided by `n`; recall is its true positives divided by the number
    of articles annotated with that label, so recall cannot exceed
    `n / (articles with that label)`.

  Raises:
    ValueError: if `n` is smaller than 1 or `corpus` and `labels` differ in length.
    KeyError: if a topic has no article annotated with its label.
  """
  if n < 1:
    raise ValueError("n must be at least 1, got {}".format(n))

  topics = list(_DEFAULT_TOPICS) if input_topics is None else list(input_topics)

  # positional array of labels: the similarity columns are positions, not index labels
  label_values = np.asarray(eval_data.label if labels is None else labels)

  documents = list(corpus)
  if len(documents) != len(label_values):
    raise ValueError(
      "corpus has {} documents but there are {} labels".format(len(documents), len(label_values))
    )

  # reduce logging output
  logging.set_verbosity(logging.ERROR)

  model = _load_encoder(model_path)

  # compute embeddings for each article and for each user input
  eval_embeddings = model(documents)
  input_embeddings = model(topics)

  # semantic similarities: one row per topic, one column per article
  similarity = np.asarray(cosine_similarity(input_embeddings, eval_embeddings))

  # positions of the n most similar articles for each topic
  top_ind = _top_n_columns(similarity, n)

  # original annotations of articles in each cluster (true and false positives)
  top_lab = label_values[top_ind]

  # how many articles land in the correct cluster (true positives)
  expected = np.arange(1, len(topics) + 1)
  true_pos = (top_lab == expected[:, None]).sum(axis=1)

  # how many articles are annotated with each label (ground truth)
  num_lab = Counter(label_values.tolist())
  missing = [int(k) for k in expected if num_lab[int(k)] == 0]
  if missing:
    raise KeyError("no articles are annotated with label(s) {}".format(missing))

  # percent of correct predictions out of original assignments (recall)
  recall = true_pos / np.array([num_lab[int(k)] for k in expected], dtype=float)

  # percent of correct predictions out of all predictions (precision)
  precision = true_pos / n

  return (float(np.mean(precision)), float(np.mean(recall)))



In [8]:
# titles only: average (precision, recall) over the topic clusters
prec_recall(eval_data["title"])

(0.32, 0.55)

In [9]:
# article bodies only: average (precision, recall) over the topic clusters
prec_recall(eval_data["content"])

(0.42000000000000004, 0.6583333333333333)

In [7]:
# "text" column: average (precision, recall) over the topic clusters
# NOTE(review): the recorded result is identical to the content-only run — confirm this is expected
prec_recall(eval_data["text"])

(0.42000000000000004, 0.6583333333333333)