<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/scripts/clustering_univencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of Clustering with Universal Sentence Encoder 

In [0]:
# %%capture
# # Install the latest Tensorflow version.
# !pip3 install --upgrade tensorflow-gpu
# # Install TF-Hub.
# !pip3 install tensorflow-hub
# !pip3 install seaborn

In [0]:
# import statements 

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

from collections import Counter

## Import Eval Data 

In [3]:
# Load the annotated evaluation set; the path is relative to the working directory.
eval_data = pd.read_csv("news_filter/data/eval_data.csv")

# (rows, columns) of the loaded frame, displayed as the cell output
eval_data.shape

(200, 13)

In [4]:
# preview the first rows of the evaluation frame (columns include title, text and label)
eval_data.head()

Unnamed: 0.1,index,Unnamed: 0,id,title,publication,author,date,year,month,url,content,label,text
0,74496,77946,118473,"Chaos in the Family, Chaos in the State: The W...",National Review,Kevin D. Williamson,2016-03-17,2016.0,3.0,http://www.nationalreview.com/article/432876/d...,Michael Brendan Dougherty is bitter. I think t...,0,"Chaos in the Family, Chaos in the State: The W..."
1,71184,74592,113594,US Civil Rights Commission Will Observe Standi...,Buzzfeed News,Nidhi Subbaraman,2016-12-08,2016.0,12.0,https://web.archive.org/web/20161208153906/htt...,WASHINGTON — The US Commission on Civil Ri...,0,US Civil Rights Commission Will Observe Standi...
2,120205,123668,184574,"Venezuela hunts rogue helicopter attackers, Ma...",Reuters,Andrew Cawthorne and Victoria Ramirez,2017-06-29,2017.0,6.0,http://www.reuters.com/article/us-venezuela-po...,The Venezuelan government hunted on Wednesday...,0,"Venezuela hunts rogue helicopter attackers, Ma..."
3,128977,132440,199665,Fruit juice isn’t much better for you than sod...,Vox,Julia Belluz,2016/3/25,2016.0,3.0,http://www.vox.com/2016/3/25/11305614/soda-jui...,One of the biggest public health wins of rece...,0,Fruit juice isn’t much better for you than sod...
4,134837,138300,208223,Sessions won’t testify at congressional budget...,Washington Post,Sari Horwitz,2017-06-10,2017.0,6.0,https://web.archive.org/web/20170611000758/htt...,"Attorney General Jeff Sessions, who had agree...",1,Sessions won’t testify at congressional budget...


## Create Embeddings of Eval Data

In [0]:
# Load the sentence encoder from the local directory news_filter/tmp.
# Prerequisite: download the model from
# https://tfhub.dev/google/universal-sentence-encoder/4 and save it to that directory.
model = hub.load("news_filter/tmp")

In [0]:
# reduce logging output: set the absl logging threshold to ERROR
logging.set_verbosity(logging.ERROR)

# compute embeddings for each article by passing the `text` column to the encoder
# NOTE(review): presumably yields one embedding row per article -- confirm the output shape
eval_embeddings = model(eval_data.text)

In [7]:
# sanity check: cosine similarity of the eval embeddings against themselves;
# the diagonal (each article compared with itself) should be ~1 up to float32 rounding
cosine_similarity(eval_embeddings, eval_embeddings)

array([[1.0000001 , 0.2914571 , 0.33693594, ..., 0.28300524, 0.32328773,
        0.33720085],
       [0.2914571 , 0.99999994, 0.5439817 , ..., 0.39472055, 0.45258018,
        0.32834154],
       [0.33693594, 0.5439817 , 1.0000002 , ..., 0.41472793, 0.4813952 ,
        0.35500538],
       ...,
       [0.28300518, 0.3947206 , 0.41472793, ..., 1.        , 0.26588523,
        0.28309864],
       [0.3232877 , 0.45258015, 0.48139524, ..., 0.26588523, 1.0000002 ,
        0.32311845],
       [0.33720073, 0.32834148, 0.35500535, ..., 0.28309864, 0.32311845,
        1.0000002 ]], dtype=float32)

## Assign Cluster to Inputs 

In [0]:
# example user inputs: one free-text topic per cluster to retrieve
input_topics = [
    "Russian interference with election",
    "Wikileaks hacking Hillary Clinton's email",
    "democratic campaigns",
    "Trump against Obamacare",
    "Trump's Campaign",
]

In [0]:
# create embeddings for each user input, using the same encoder as for the articles
input_embeddings = model(input_topics)

In [10]:
# similarity table: one row per input topic, one column per article title,
# each cell holding the cosine similarity between that topic and that article
cos_df = pd.DataFrame(
    cosine_similarity(input_embeddings, eval_embeddings),
    index=input_topics,
    columns=eval_data.title,
)

cos_df.shape

(5, 200)

In [11]:
def find_topind(df, i, n):
  """Return the column positions of the n largest values in row i of df.

  Args:
    df: pandas DataFrame of scores.
    i: positional (iloc) index of the row to rank.
    n: number of column positions to return.

  Returns:
    A list of integer column positions ordered from the largest value down.
    It holds fewer than n entries when the row has fewer than n columns, and
    it is empty when n <= 0 or the row has no columns.
  """
  # (position, value) pairs of the n highest-scoring columns in the row
  ranked = heapq.nlargest(n, enumerate(df.iloc[i, :]), key=operator.itemgetter(1))
  # Unpack pair by pair instead of indexing zip(*ranked)[0], which raises
  # IndexError when nothing was selected.
  return [position for position, _ in ranked]

def find_top(lst, ind):
  """Return the elements of lst located at the positions listed in ind.

  Args:
    lst: a sequence (e.g. a list) or a pandas Series to pick elements from.
    ind: iterable of integer positions, e.g. the output of find_topind.

  Returns:
    A list with one element of lst per entry of ind, in the order of ind.
  """
  # Series[int] is a label lookup, so positions from enumerate() would hit the
  # wrong rows (or raise KeyError) whenever the index is not 0..n-1; .iloc
  # makes the lookup positional. Other sequences are indexed directly.
  source = lst.iloc if isinstance(lst, pd.Series) else lst
  return [source[i] for i in ind]

# size of every predicted cluster (articles kept per input topic)
n = 10

# for each input topic (row of cos_df): positions of its n most similar titles
top_ind = Parallel(n_jobs=16)(
    delayed(find_topind)(cos_df, row, n) for row in range(len(cos_df))
)

# predicted clusters: the titles found at those positions
top = Parallel(n_jobs=16)(
    delayed(find_top)(eval_data.title, positions) for positions in top_ind
)

# topics: "Russian interference with election", "Wikileaks hacking Hillary Clinton's email", "democratic campaigns", "Trump against Obamacare", "Trump's Campaign"
top[:5]

[['US kicks out dozens of Russian diplomats over election\xa0hacking',
  'Use the Electoral College Properly or Lose It',
  'Officials ’identify White House person of interest’ in Trump-Russia investigation',
  'Trump knew for weeks Michael Flynn misled over Russia contact',
  'Jill Stein files for recount in\xa0Wisconsin',
  'Obama’s State Department Sponsored 2016 Meeting of Russian Ambassador with Sen. Sessions ',
  'Exclusive: Say goodbye to OPEC, powerful Putin pal predicts',
  'FBI Director asked Justice Department to reject Trump’s wiretapping claim — they haven’t listened',
  'Russia, Iran sanctions bill hits roadblock in U.S. House',
  'Emmanuel Macron Declared Next French President'],
 ['TRUMP: Adversaries of US ’almost certainly have a blackmail file’ on Clinton',
  'Hillary Clinton Campaign Meltdown: Top-Secret Emails ’Innocuous,’ Should Be Released ',
  'AP: Gov’t Declares 22 Clinton Emails ’Top Secret’',
  'FBI: Clinton ’extremely careless’ but no charges recommended',
  

In [12]:
# annotated labels of the articles placed in each predicted cluster
# (a label equal to the cluster's own label is a true positive, any other a false positive)
top_lab = Parallel(n_jobs=16)(
    delayed(find_top)(eval_data.label, positions) for positions in top_ind
)

top_lab[:5]

[[1, 3, 1, 1, 0, 1, 0, 1, 0, 0],
 [2, 2, 2, 2, 3, 5, 3, 0, 1, 0],
 [3, 1, 3, 3, 0, 3, 3, 3, 0, 2],
 [4, 0, 0, 0, 0, 0, 0, 1, 5, 5],
 [3, 5, 5, 5, 5, 2, 0, 5, 0, 0]]

In [0]:
# ground truth: number of eval articles annotated with each label, as a {label: count} dict
num_lab = dict(Counter(eval_data.label))

In [14]:
# true positives: for each cluster, the number of its articles whose annotated
# label equals the cluster's own label (clusters are numbered 1, 2, ... in topic order)
true_pos = [
    sum([label == cluster_id for label in cluster])
    for cluster_id, cluster in enumerate(top_lab, start=1)
]
true_pos

[5, 4, 6, 1, 5]

In [15]:
# recall per cluster: true positives divided by the number of articles that
# carry the cluster's label in the ground truth (labels start at 1)
recall = [
    hits / num_lab[label]
    for label, hits in enumerate(true_pos, start=1)
]

# mean recall over all clusters
np.mean(recall)

0.6583333333333333

In [16]:
# precision per cluster: true positives divided by the cluster size n
precision = [hits / n for hits in true_pos]

# mean precision over all clusters
np.mean(precision)

0.42000000000000004