<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/notebooks/cluster_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster Training Data with Universal Sentence Encoder on Article Content

In [0]:
# %%capture
# # Install the latest Tensorflow version.
# !pip3 install --upgrade tensorflow-gpu
# # Install TF-Hub.
# !pip3 install tensorflow-hub
# !pip3 install seaborn

In [0]:
# import statements 

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

## Import Training Data

In [2]:
# Read the training articles into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
small_data = pd.read_csv("news_filter/data/small_data.csv")

# (rows, columns) of the loaded frame
small_data.shape

(13000, 12)

In [3]:
# preview the first rows to see which columns are available
small_data.head()

Unnamed: 0.1,index,Unnamed: 0,id,title,publication,author,date,year,month,url,content,text
0,74496,77946,118473,"Chaos in the Family, Chaos in the State: The W...",National Review,Kevin D. Williamson,2016-03-17,2016.0,3.0,http://www.nationalreview.com/article/432876/d...,Michael Brendan Dougherty is bitter. I think t...,"Chaos in the Family, Chaos in the State: The W..."
1,71184,74592,113594,US Civil Rights Commission Will Observe Stand...,Buzzfeed News,Nidhi Subbaraman,2016-12-08,2016.0,12.0,https://web.archive.org/web/20161208153906/htt...,WASHINGTON — The US Commission on Civil Ri...,US Civil Rights Commission Will Observe Stand...
2,120205,123668,184574,"Venezuela hunts rogue helicopter attackers, Ma...",Reuters,Andrew Cawthorne and Victoria Ramirez,2017-06-29,2017.0,6.0,http://www.reuters.com/article/us-venezuela-po...,The Venezuelan government hunted on Wednesday...,"Venezuela hunts rogue helicopter attackers, Ma..."
3,128977,132440,199665,Fruit juice isn’t much better for you than sod...,Vox,Julia Belluz,2016/3/25,2016.0,3.0,http://www.vox.com/2016/3/25/11305614/soda-jui...,One of the biggest public health wins of rece...,Fruit juice isn’t much better for you than sod...
4,134837,138300,208223,Sessions won’t testify at congressional budget...,Washington Post,Sari Horwitz,2017-06-10,2017.0,6.0,https://web.archive.org/web/20170611000758/htt...,"Attorney General Jeff Sessions, who had agree...",Sessions won’t testify at congressional budget...


## Create Embeddings on Training Data

In [0]:
# Load the sentence encoder from a local directory instead of fetching it at
# run time. Prerequisite: download the model from
# https://tfhub.dev/google/universal-sentence-encoder/4 and save it locally;
# this cell expects to find it under "news_filter/tmp".
model = hub.load("news_filter/tmp")

In [0]:
# reduce logging output: absl only emits messages at ERROR level from here on
logging.set_verbosity(logging.ERROR)

# compute embeddings for each article; the whole `content` column is handed
# to the encoder in a single call
train_embeddings = model(small_data.content)

In [6]:
# sanity check of cosine similarity of training embeddings: every article is
# compared with every other article, so the result is a square matrix whose
# diagonal (each article against itself) should be ~1
cosine_similarity(train_embeddings, train_embeddings)

array([[1.0000001 , 0.2899389 , 0.337429  , ..., 0.46606538, 0.2253764 ,
        0.3247815 ],
       [0.2899389 , 1.0000001 , 0.5517271 , ..., 0.27665898, 0.18718976,
        0.22730926],
       [0.337429  , 0.5517271 , 0.99999964, ..., 0.3719284 , 0.26888457,
        0.1751447 ],
       ...,
       [0.46606538, 0.27665898, 0.3719284 , ..., 0.99999994, 0.31106484,
        0.21267769],
       [0.2253764 , 0.18718976, 0.26888457, ..., 0.31106484, 1.        ,
        0.30073023],
       [0.3247815 , 0.22730926, 0.1751447 , ..., 0.21267769, 0.30073023,
        1.0000001 ]], dtype=float32)

## Create Embeddings on Example Input Data

In [0]:
# example user inputs: one free-text topic per entry; the order here is the
# order used for every per-topic result further down
input_topics = [
  "Russian interference with election",
  "Hillary Clinton WikiLeaks email Benghazi",
  "Women's march highlights",
  "Bernie Sander's Campaign",
  "ICE immigration policies",
]

In [0]:
# create embeddings for each user input, using the same `model` object that
# produced the article embeddings
input_embeddings = model(input_topics)

In [9]:
# Similarity table: one row per input topic, one column per article title.
# Each cell is the cosine similarity between that topic's embedding and that
# article's embedding.
cos_df = pd.DataFrame(
  cosine_similarity(input_embeddings, train_embeddings),
  index=input_topics,
  columns=small_data.title,
)

cos_df.shape

(5, 13000)

## Create Clusters for Each Input

In [10]:
# function to return the column index of the top n values in a row of a dataframe
def find_topind(df, i, n):
  """Return the column positions of the n largest values in row i of df.

  Args:
    df: DataFrame whose rows hold comparable scores.
    i: Positional (iloc) index of the row to inspect.
    n: How many positions to return.

  Returns:
    A list of integer column positions ordered from the largest value to the
    smallest. The list is shorter than n when the row has fewer than n
    columns, and empty when n <= 0 or the frame has no columns.
  """
  # Pair every value with its position so the position survives the ranking.
  ranked = heapq.nlargest(n, enumerate(df.iloc[i, :]), key=operator.itemgetter(1))
  # Unpack pair by pair; indexing into zip(*ranked) fails on an empty ranking.
  return [position for position, _ in ranked]

# function to return the top n values in a list
def find_top(lst, ind):
  """Return the elements of lst found at the positions listed in ind.

  Args:
    lst: A list or other sequence, or a pandas Series/Index.
    ind: Iterable of integer positions (0-based).

  Returns:
    A list with one element per entry of ind, in the same order as ind.
  """
  # Plain [] on a pandas Series looks up index *labels*, which only matches
  # positions when the index is the default 0..N-1. Use .iloc when it exists
  # so the lookup is always positional.
  items = lst.iloc if hasattr(lst, "iloc") else lst
  return [items[i] for i in ind]

# how many articles per cluster
n = 10

# find index of n most similar titles
# One cheap lookup per topic, so a plain comprehension is used: starting
# worker processes and shipping cos_df to each of them would cost more than
# the lookups themselves.
top_ind = [find_topind(cos_df, i, n) for i in range(len(cos_df))]

# show most similar titles -- predicted clusters
top = [find_top(small_data.title, ind) for ind in top_ind]

top
# topics: "Russian interference with election", "Hillary Clinton WikiLeaks email Benghazi", "Women's march highlights", "Bernie Sander's Campaign", "ICE immigration policies"

[['Clinton campaign backs effort to brief Electoral College members on Russian hacking',
  'Trump: Special Counsel Will Prove My Team Didn’t Collude With Russia',
  'Paul Ryan Backs President Obama’s Sanctions Over Alleged Russian Hacks ',
  'The US government is launching a massive effort to stop Russia from hacking the election',
  'Russian hacking activity continues unabated',
  'Top-secret NSA report: Russian hackers tried to breach US voting systems days before the election',
  'Putin’s chaos strategy is coming back to bite him',
  'Clinton Aide Jennifer Palmieri Blasts Trump for Being ‘Preoccupied’ with Russia',
  'Russia’s assault on America’s elections is just one example of a global threat',
  'Russia intervened to help Trump win election: intelligence officials'],
 ['Assange: ’We have more material related to the Hillary Clinton campaign’',
  'The GOP stoops for scandal',
  'Donald Trump: ’Good Job, Huma. Thank you, Anthony Weiner’ ',
  'Clinton team used special program to s

In [19]:
# id of most similar titles
# Same positions as the title lookup, applied to the id column. A plain
# comprehension is enough for one small lookup per topic.
top_id = [find_top(small_data.id, ind) for ind in top_ind]

top_id

[[72962, 92926, 45558, 72358, 59828, 67261, 72300, 31540, 205453, 194362],
 [56439, 213302, 40299, 85727, 39429, 92404, 70464, 86771, 68550, 49591],
 [60705, 103164, 122748, 73686, 147468, 161410, 38222, 34095, 74730, 48701],
 [96452, 49182, 68874, 96444, 67766, 55909, 97167, 199737, 117693, 39532],
 [163918, 28111, 205111, 46778, 34717, 80967, 120670, 214420, 45658, 163453]]

## Output Clusters

In [0]:
# make dataframe of clusters

# For every topic, keep the rows whose id is among that topic's top ids, then
# stack the per-topic frames in topic order with a single concat.
# Note: a boolean mask returns rows in small_data's own order, so within a
# cluster the rows are NOT sorted by similarity rank.
clusters = pd.concat(
  [small_data.loc[small_data.id.isin(ids)] for ids in top_id],
  sort=False,
).drop(columns=['index', 'Unnamed: 0'])

In [24]:
# (rows, columns) of the combined cluster frame
clusters.shape

(50, 10)

In [0]:
# add column of cluster labels to dataframe
# Labels are 1-based and follow the order of top_id (one entry per topic).
# The length of each label run is counted from the data, i.e. the number of
# small_data rows whose id belongs to that topic, instead of assuming
# 10 rows for each of 5 topics. The column therefore still lines up when `n`
# or the list of topics changes.
cluster_labs = [
  [label] * int(small_data.id.isin(ids).sum())
  for label, ids in enumerate(top_id, start=1)
]
clusters["cluster_labels"] = [y for x in cluster_labs for y in x]

In [34]:
# display the labelled clusters for a visual check
clusters

Unnamed: 0,id,title,publication,author,date,year,month,url,content,text,cluster_labels
3168,45558,Paul Ryan Backs President Obama’s Sanctions Ov...,Breitbart,Adelle Nazarian,2016-12-29,2016.0,12.0,,House Speaker Paul Ryan issued a statement on ...,Paul Ryan Backs President Obama’s Sanctions Ov...,1
7266,72300,Putin’s chaos strategy is coming back to bite him,Business Insider,,2016-10-31,2016.0,10.0,,"’ ’ ’ Back in March, when the U. S. electio...",Putin’s chaos strategy is coming back to bite ...,1
9885,194362,Russia intervened to help Trump win election: ...,Reuters,John Walcott,2016-12-10,2016.0,12.0,http://www.reuters.com/article/us-usa-election...,U. S. intelligence analysts have concluded th...,Russia intervened to help Trump win election: ...,1
10309,59828,Russian hacking activity continues unabated,CNN,Shimon Prokupecz,2016-12-16,2016.0,12.0,,(CNN) Russian cyberhacking activity has conti...,Russian hacking activity continues unabated (...,1
11376,92926,Trump: Special Counsel Will Prove My Team Didn...,Talking Points Memo,,,,,https://web.archive.org/web/20170518003300/htt...,President Donald Trump responded Wednesday to ...,Trump: Special Counsel Will Prove My Team Didn...,1
11974,205453,Russia’s assault on America’s elections is jus...,Washington Post,David Ignatius,2017-02-23,2017.0,2.0,https://web.archive.org/web/20170224004841/htt...,One of the most startling allegations in a Ja...,Russia’s assault on America’s elections is jus...,1
11990,67261,Top-secret NSA report: Russian hackers tried t...,Business Insider,Natasha Bertrand,2017-06-06,2017.0,6.0,,’ ’ ’ Hackers associated with Russia’’s mil...,Top-secret NSA report: Russian hackers tried t...,1
12028,72962,Clinton campaign backs effort to brief Elector...,Business Insider,Pamela Engel,2016-12-13,2016.0,12.0,,’ ’ ’ A top adviser to Hillary Clinton said...,Clinton campaign backs effort to brief Elector...,1
12237,72358,The US government is launching a massive effor...,Business Insider,Mark Abadi,2016-11-04,2016.0,11.0,,’ ’ ” The US government is launching a mass...,The US government is launching a massive effor...,1
12265,31540,Clinton Aide Jennifer Palmieri Blasts Trump fo...,Breitbart,Adam Shaw,2017-05-16,2017.0,5.0,,Former Clinton aide Jennifer Palmieri has blas...,Clinton Aide Jennifer Palmieri Blasts Trump fo...,1


In [0]:
# export dataframe of clusters; index=False keeps the DataFrame's row index
# out of the written file
clusters.to_csv("news_filter/data/clusters.csv", index=False)