<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/notebooks/cluster_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster Training Data with Universal Sentence Encoder on Article Content

In [0]:
# %%capture
# # Install the latest Tensorflow version.
# !pip3 install --upgrade tensorflow-gpu
# # Install TF-Hub.
# !pip3 install tensorflow-hub
# !pip3 install seaborn

In [0]:
# import statements 

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

## Import Training Data

In [2]:
# Load the filtered training data used throughout the rest of the notebook.
# Unfiltered alternative: pd.read_csv("news_filter/data/small_data.csv")
small_data = pd.read_csv("news_filter/data/filter_small_data.csv") # filtered data


# (rows, columns) of the loaded frame
small_data.shape

(13000, 15)

In [3]:
# preview the first rows to check the available columns (id, title, content, ...)
small_data.head()

Unnamed: 0.1,index,Unnamed: 0,id,title,publication,author,date,year,month,url,content,pub_bias,polarity,subjectivity,text
0,101379,104839,153712,Has the age of quantum computing arrived?,Guardian,Andrew Anthony,2016-05-22,2016.0,5.0,https://www.theguardian.com/technology/2016/ma...,"Ever since Charles Babbage’s conceptual, unrea...",left-center,0.136875,0.497199,Has the age of quantum computing arrived? Ever...
1,65920,69328,98425,Sanders Projected To Win The Wisconsin Democra...,Talking Points Memo,,2016-04-06,2016.0,4.0,https://web.archive.org/web/20160406060830/htt...,Sen. Bernie Sanders ( ) was projected to win t...,left-center,0.319638,0.418182,Sanders Projected To Win The Wisconsin Democra...
2,35078,35798,54621,Toxic foam floods the streets of Bangalore,CNN,,2017-05-31,2017.0,5.0,,New Delhi (CNN) In the southern Indian city of...,left-center,0.028901,0.362504,Toxic foam floods the streets of Bangalore New...
3,101226,104686,153505,"Soul star Sharon Jones dies, aged 60",Guardian,Michael Hann,2016-11-18,2016.0,11.0,https://www.theguardian.com/music/2016/nov/19/...,"The soul singer Sharon Jones has died, aged 60...",left-center,0.158978,0.453571,"Soul star Sharon Jones dies, aged 60 The soul ..."
4,62873,66194,91357,GOPers Make Last-Ditch Attempt To Tie Georgia ...,Talking Points Memo,Caitlin MacNeal,,,,https://web.archive.org/web/20170407001946/htt...,With just under two weeks to go until a specia...,left-center,0.115283,0.4622,GOPers Make Last-Ditch Attempt To Tie Georgia ...


## Create Embeddings on Training Data

In [0]:
# Load the Universal Sentence Encoder from a local copy of the TF-Hub module.
# Prerequisite: download the model from
# https://tfhub.dev/google/universal-sentence-encoder/4 and save it under news_filter/tmp.
model = hub.load("news_filter/tmp")

In [0]:
# reduce logging output: only show absl messages at ERROR severity or above
logging.set_verbosity(logging.ERROR)

# compute embeddings for each article from its `content` column
# NOTE(review): the whole column is encoded in a single model call; larger datasets may
# need batching, and missing (NaN) content would presumably fail -- confirm the input is clean.
train_embeddings = model(small_data.content)

In [6]:
# sanity check of cosine similarity of training embeddings:
# article-vs-article matrix, so the diagonal should be ~1 and the matrix symmetric.
# Note that the result holds one entry per pair of articles (rows x rows).
cosine_similarity(train_embeddings, train_embeddings)

array([[0.9999999 , 0.12882592, 0.23755008, ..., 0.19744748, 0.23914221,
        0.1751818 ],
       [0.12882592, 1.        , 0.12856285, ..., 0.17623353, 0.26177713,
        0.11302789],
       [0.23755008, 0.12856285, 1.0000001 , ..., 0.31207043, 0.26491994,
        0.1968452 ],
       ...,
       [0.19744748, 0.17623353, 0.31207043, ..., 1.0000002 , 0.43059224,
        0.19811964],
       [0.23914221, 0.26177713, 0.26491994, ..., 0.43059224, 1.0000001 ,
        0.16268337],
       [0.1751818 , 0.11302789, 0.1968452 , ..., 0.19811964, 0.16268337,
        1.0000002 ]], dtype=float32)

## Create Embeddings on Example Input Data

In [0]:
# example user inputs: one query string per cluster to build
input_topics = [
  "Russian interference with election",
  "Immigration and customs enforcement",
  "Ariana Grande Manchester bombing",
  "UC Berkeley student protests",
  "Suicide Squad movie",
]

In [0]:
# create embeddings for each user input with the same model used for the articles,
# so that topic and article vectors can be compared with cosine similarity
input_embeddings = model(input_topics)

In [30]:
# similarity table: one row per input topic, one column per training article
# (columns are labelled with the article title)
cos_df = pd.DataFrame(
  cosine_similarity(input_embeddings, train_embeddings),
  index=input_topics,
  columns=small_data.title,
)

cos_df.shape

(5, 13000)

## Create Clusters for Each Input

In [31]:
# function to return the column index of the top n values in a row of a dataframe
def find_topind(df, i, n):
  """Return the column positions of the ``n`` largest values in row ``i`` of ``df``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose row is ranked.
  i : int
      Positional index of the row to inspect (read with ``df.iloc``).
  n : int
      Maximum number of positions to return.

  Returns
  -------
  list of int
      Column positions ordered from the largest value downwards. Contains
      fewer than ``n`` entries when the row has fewer than ``n`` columns and
      is empty when ``n <= 0`` or the row has no columns.
  """
  ranked = heapq.nlargest(n, enumerate(df.iloc[i, :]), key=operator.itemgetter(1))
  # Unpack the (position, value) pairs directly; transposing with zip(*...) and
  # taking element [0] raised IndexError whenever nothing was selected.
  return [position for position, _ in ranked]

# function to return the items of a sequence located at the given positions
def find_top(lst, ind):
  """Return the elements of ``lst`` found at the positions listed in ``ind``.

  Parameters
  ----------
  lst : sequence or pandas.Series
      Container to read from.
  ind : iterable of int
      Zero-based positions, e.g. the output of ``find_topind``.

  Returns
  -------
  list
      The selected elements, in the order given by ``ind``.
  """
  # The positions are zero-based offsets, but plain [] on a pandas Series looks
  # up index *labels*; go through .iloc when it exists so a Series whose index
  # is not 0..n-1 still yields the intended rows.
  values = getattr(lst, "iloc", lst)
  return [values[i] for i in ind]

# cluster size: number of articles kept for each input topic
n = 10

# for every input topic (row of cos_df), the column positions of its n highest similarities
top_ind = Parallel(n_jobs=16)(
  delayed(find_topind)(cos_df, row, n) for row in range(cos_df.shape[0])
)

# map those positions back to article titles -- predicted clusters
top = Parallel(n_jobs=16)(
  delayed(find_top)(small_data.title, positions) for positions in top_ind
)

top

[['Russian intelligence agencies behind US election hacks, lawmakers say',
  'US intelligence report: Putin ordered a hacking campaign to harm Hillary Clinton',
  '’They are totally embarrassed!’: Trump goes on hours-long tweetstorm over Russian hacking',
  'Graham: Russia’s ’trying to undermine democracies’',
  'Kellyanne Conway: Electors’ concern over Russian hacking allegations is ‘nonsense’',
  'Russian hacking: What we know and Trump doesn’t',
  'Trump Calls for Closer Relationship Between U.S. and Russia ',
  'US investigates if Russia may be trying to influence election – report',
  'Comey Repeatedly Emphasized The Gravity Of Russia’s Election Meddling',
  'Here’s how the West should respond to the Macron hack'],
 ['U.S. immigration arrests up nearly 40 percent under Trump',
  '’Sanctuary cities’ are not the problem',
  'Are undocumented immigrants committing a crime? Not necessarily ',
  'What immigrants’ advocates want you to know',
  'The Future of Deportations Under Trump',


In [32]:
# article ids at the same positions, one list per input topic
top_id = Parallel(n_jobs=16)(
  delayed(find_top)(small_data.id, positions) for positions in top_ind
)

top_id

[[57968, 64920, 64937, 59653, 217756, 213954, 17439, 152608, 93751, 207361],
 [182385, 64419, 51961, 51617, 74081, 51545, 172077, 177584, 147168, 205399],
 [207726, 54410, 54710, 208076, 67052, 207755, 164980, 54371, 207753, 66086],
 [206778, 161623, 204945, 18255, 148858, 53511, 207024, 58822, 97722, 66465],
 [56586, 79546, 68639, 210908, 153925, 153220, 61265, 82378, 51973, 22131]]

## Output Clusters

In [0]:
# make dataframe of clusters: one slice of small_data per entry of top_id,
# stacked in topic order. Rows inside a slice keep their order from small_data,
# not their similarity rank.

# Series.isin builds each row mask in one vectorised pass, and a single concat
# avoids re-copying the accumulated frame on every loop iteration.
clusters = pd.concat(
  [small_data.loc[small_data.id.isin(ids)] for ids in top_id],
  sort=False,
)

# drop the leftover index columns carried over from earlier CSV exports
clusters = clusters.drop(columns=['index', 'Unnamed: 0'])

In [34]:
# rows = articles selected across all clusters; columns = small_data columns minus the two dropped
clusters.shape

(50, 13)

In [0]:
# add column of cluster labels to dataframe
# Labels are 1-based and follow the order of top_id, matching the order in which
# the per-topic slices were stacked into `clusters`. The size of each slice is
# counted from the data instead of being hard-coded as 10 rows x 5 topics, so the
# cell keeps working when n or the number of topics changes, or when an id
# matches more than one row.
cluster_labs = [
  [label] * int(small_data.id.isin(ids).sum())
  for label, ids in enumerate(top_id, start=1)
]
clusters["cluster_labels"] = [y for x in cluster_labs for y in x]

In [36]:
# inspect the labelled clusters
clusters

Unnamed: 0,id,title,publication,author,date,year,month,url,content,pub_bias,polarity,subjectivity,text,cluster_labels
2652,207361,Here’s how the West should respond to the Macr...,Washington Post,Editorial Board,2017-05-08,2017.0,5.0,https://web.archive.org/web/20170509003603/htt...,THE MASSIVE leak of documents from the campai...,left-center,0.097736,0.34538,Here’s how the West should respond to the Macr...,1
3582,59653,Graham: Russia’s ’trying to undermine democrac...,CNN,Eugene Scott,2016-12-10,2016.0,12.0,,(CNN) Sen. Lindsey Graham said Saturday that ...,left-center,0.058119,0.383503,Graham: Russia’s ’trying to undermine democrac...,1
5571,57968,Russian intelligence agencies behind US electi...,CNN,Tal Kopan,2016-09-22,2016.0,9.0,,Washington (CNN) The top Democrats on Congress...,left-center,0.018389,0.290897,Russian intelligence agencies behind US electi...,1
9372,64937,’They are totally embarrassed!’: Trump goes on...,Business Insider,Jeremy Berke,2017-01-08,2017.0,1.0,,’ ’ ’ Donald Trump said Saturday that the...,center,0.095214,0.618889,’They are totally embarrassed!’: Trump goes on...,1
10207,17439,Trump Calls for Closer Relationship Between U....,New York Times,Nicholas Fandos,2017-01-08,2017.0,1.0,,WASHINGTON — A day after the release of a d...,left-center,0.036063,0.344684,Trump Calls for Closer Relationship Between U....,1
10250,213954,Russian hacking: What we know and Trump doesn’t,Washington Post,Jennifer Rubin,2016-07-28,2016.0,7.0,https://web.archive.org/web/20160729002345/htt...,"Last month, The Post reported: CrowdStr...",left-center,0.020886,0.420984,Russian hacking: What we know and Trump doesn’...,1
10430,217756,Kellyanne Conway: Electors’ concern over Russi...,Washington Post,Amy B Wang,2016-12-18,2016.0,12.0,https://web.archive.org/web/20161219001336/htt...,"Kellyanne Conway, senior adviser to ...",left-center,0.049756,0.373419,Kellyanne Conway: Electors’ concern over Russi...,1
10446,93751,Comey Repeatedly Emphasized The Gravity Of Rus...,Talking Points Memo,,,,,https://web.archive.org/web/20170609023648/htt...,"Throughout his testimony Thursday, former FBI ...",left-center,0.032477,0.412605,Comey Repeatedly Emphasized The Gravity Of Rus...,1
12650,64920,US intelligence report: Putin ordered a hackin...,Business Insider,Pamela Engel,2017-01-07,2017.0,1.0,,’ ’ ” A declassified version of a on Russi...,center,0.117451,0.301422,US intelligence report: Putin ordered a hackin...,1
12919,152608,US investigates if Russia may be trying to inf...,Guardian,Jamiles Lartey,2016-09-05,2016.0,9.0,https://www.theguardian.com/us-news/2016/sep/0...,US intelligence officials are investigating th...,left-center,0.080236,0.353075,US investigates if Russia may be trying to inf...,1


In [0]:
# export dataframe of clusters; index=False keeps the row index out of the file
# Unfiltered alternative target: clusters.to_csv("news_filter/data/clusters.csv", index=False)
clusters.to_csv("news_filter/data/filter_clusters.csv", index=False)