<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/notebooks/cluster_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster Training Data with Universal Sentence Encoder on Article Content

In [0]:
# %%capture
# # Install the latest Tensorflow version.
# !pip3 install --upgrade tensorflow-gpu
# # Install TF-Hub.
# !pip3 install tensorflow-hub
# !pip3 install seaborn
# NOTE: this setup cell is disabled. To install the packages in a fresh runtime,
# uncomment the lines above, keeping `%%capture` as the very first line of the cell.

In [0]:
# import statements 

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

## Import Training Data

In [0]:
# Load the training articles; the path is relative to the working directory.
small_data = pd.read_csv(filepath_or_buffer="news_filter/data/small_data.csv")

# (rows, columns) of the loaded frame
small_data.shape

In [0]:
# Preview the first rows of the training data as a quick check of what was loaded.
small_data.head()

## Create Embeddings on Training Data

In [0]:
# download model from https://tfhub.dev/google/universal-sentence-encoder/4 and save locally
# (the encoder is then loaded from that local directory)
model = hub.load(handle="news_filter/tmp")

In [0]:
# Only let absl messages of severity ERROR or above through.
logging.set_verbosity(logging.ERROR)

# Run the encoder over the `content` column: one embedding per article.
train_embeddings = model(small_data["content"])

In [0]:
# Sanity check: pairwise cosine similarity of the training embeddings with
# themselves (omitting the second argument compares the input against itself).
cosine_similarity(train_embeddings)

## Create Embeddings on Example Input Data

In [0]:
# Example free-text queries a user might enter, one topic per entry.
input_topics = [
  "Russian interference with election",
  "Hillary Clinton WikiLeaks email Benghazi",
  "Women's march highlights",
  "Bernie Sander's Campaign",
  "ICE immigration policies",
]

In [0]:
# create embeddings for each user input
# (the same `model` object is called here as for the training articles)
input_embeddings = model(input_topics)

In [0]:
# Similarity table: one row per input topic, one column per training article
# (labelled by its title); each cell is the cosine similarity of the embeddings.
cos_df = pd.DataFrame(
  data=cosine_similarity(input_embeddings, train_embeddings),
  index=input_topics,
  columns=small_data.title,
)

# (number of topics, number of articles)
cos_df.shape

## Create Clusters for Each Input

In [0]:
# function to return the column index of the top n values in a row of a dataframe
def find_topind(df, i, n):
  """Return the column positions of the ``n`` largest values in row ``i`` of ``df``.

  Args:
    df: DataFrame of scores; the row is selected positionally via ``iloc``.
    i: Positional index of the row to inspect.
    n: Maximum number of column positions to return.

  Returns:
    List of integer column positions, ordered from the largest value to the
    smallest. The list is shorter than ``n`` when the row has fewer columns
    and empty when ``n <= 0`` or the row has no columns.
  """
  row = df.iloc[i, :]
  ranked = heapq.nlargest(n, enumerate(row), key=operator.itemgetter(1))
  # Unpack the (position, value) pairs directly: transposing with zip(*...) and
  # taking element [0] raises IndexError when nothing was selected.
  return [position for position, _ in ranked]

# function to return the items of a list (or pandas Series) at the given positions
def find_top(lst, ind):
  """Return the elements of ``lst`` found at the positions listed in ``ind``.

  Args:
    lst: A list-like sequence or a pandas Series.
    ind: Iterable of integer positions into ``lst``.

  Returns:
    List of the selected elements, in the same order as ``ind``.
  """
  # A Series resolves ``series[i]`` by index label, which only coincides with
  # the position for a default 0..N-1 index; .iloc is positional regardless.
  source = lst.iloc if isinstance(lst, pd.Series) else lst
  return [source[i] for i in ind]

# how many articles per cluster
n = 10

# Column positions of the n most similar articles, one list per row of cos_df.
# A plain comprehension replaces the former 16-worker joblib dispatch: the work
# is a handful of cheap lookups, so handing it to worker processes only added
# start-up and argument-transfer overhead. The resulting lists are the same.
top_ind = [find_topind(cos_df, i, n) for i in range(len(cos_df))]

# show most similar titles -- predicted clusters
top = [find_top(small_data.title, ind) for ind in top_ind]

# one list of titles per row of cos_df, in row order
top

In [0]:
# id of most similar titles
# (plain comprehension: these few cheap lookups do not benefit from being
# dispatched to 16 joblib workers, and the resulting lists are the same)
top_id = [find_top(small_data.id, ind) for ind in top_ind]

top_id

## Output Clusters

In [0]:
# Print, for every input topic, the rows of small_data whose id was selected
# for that topic's cluster (rows appear in their original small_data order).
for topic, cluster_ids in zip(input_topics, top_id):
  # Vectorised membership test instead of a Python-level scan of the id list
  # for every row.
  cluster = small_data[small_data["id"].isin(cluster_ids)]
  # Label each cluster so the printed frames can be told apart.
  print(topic)
  print(cluster)