<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/eval_similaritymatrix_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import heapq
import operator
import re

import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from absl import logging
from joblib import Parallel, delayed
from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# Load Universal Sentence Encoder and Training Data for Evaluation

In [2]:
# import the training data
# NOTE(review): the path below is an empty string, so this call cannot locate
# a file as written — TODO fill in the path to the training-data CSV.
# Later cells read the `content`, `title` and `id` columns of this frame.
small_data = pd.read_csv("")

AttributeError: ignored

In [0]:
# download model from https://tfhub.dev/google/universal-sentence-encoder/4 and save locally
# hub.load() restores the saved model from that local directory ("tmp");
# the returned object is called directly on text to produce embeddings.
emb_model = hub.load("tmp")

In [0]:
# keep absl quiet: verbosity is set to the ERROR level
logging.set_verbosity(logging.ERROR)

# embed the `content` column of the training data with the encoder
train_embeddings = emb_model(small_data["content"])

# Create Summaries for Clusters from Training Data

The helper functions below make this possible.

In [0]:
# Load the cluster table; later cells read its `cluster_labels`, `content`
# and `id` columns.
clusters = pd.read_csv("clusters.csv")

In [0]:
def read_content(piece):
  """Split an article into sentences and each sentence into word tokens.

  Sentences are delimited by ". ". Within a sentence every character that is
  not an ASCII letter is replaced by a space before tokenizing, so punctuation
  and digits never end up inside a token.

  input: article text (str)
  output: list of sentences, each a list of word strings; sentences that
          contain no letters are omitted
  """
  sentences = []

  for sentence in piece.split(". "):
    # str.replace() matches its argument literally, so the character class
    # has to be applied with re.sub() to take effect.
    words = re.sub(r"[^a-zA-Z]", " ", sentence).split()
    if words:
      sentences.append(words)

  return sentences

In [0]:
# measuring similarity
def sentence_similarity(sent1, sent2, stopwords=None):
  """Cosine similarity between the word-count vectors of two sentences.

  input: sent1, sent2 - lists of word tokens (lower-cased before comparing)
         stopwords - optional collection of words to leave out of the
                     counts; matched against the lower-cased tokens
  output: 1 - cosine_distance of the two count vectors, or 0.0 when either
          sentence has no countable words
  """
  ignored = set(stopwords) if stopwords is not None else set()

  sent1 = [w.lower() for w in sent1]
  sent2 = [w.lower() for w in sent2]

  # one vector slot per distinct word; the dict gives constant-time lookup
  # instead of a linear list.index() scan for every token
  slot = {w: k for k, w in enumerate(set(sent1 + sent2))}

  vector1 = [0] * len(slot)
  vector2 = [0] * len(slot)

  # build vector for the first sentence
  for w in sent1:
    if w not in ignored:
      vector1[slot[w]] += 1

  # build vector for second sentence
  for w in sent2:
    if w not in ignored:
      vector2[slot[w]] += 1

  # The cosine of an all-zero vector is undefined (its norm is zero), so
  # report "no similarity" instead of passing it on.
  if not any(vector1) or not any(vector2):
    return 0.0

  return 1 - cosine_distance(vector1, vector2)

In [0]:
# pairwise sentence similarity matrix
def build_sim_matrix(sentences, stop_words):
  """Return a square matrix of sentence-to-sentence similarities.

  input: sentences - list of tokenized sentences
         stop_words - words passed through to sentence_similarity
  output: len(sentences) x len(sentences) numpy array; entry [i][j] is
          sentence_similarity of sentences i and j, the diagonal stays 0
  """
  n_sent = len(sentences)
  sim_matrix = np.zeros((n_sent, n_sent))

  for row, left in enumerate(sentences):
    for col, right in enumerate(sentences):
      # a sentence is never compared with itself
      if row != col:
        sim_matrix[row][col] = sentence_similarity(left, right, stop_words)

  return sim_matrix

In [0]:
# generate summary method
def generate_summary(content, top_n=5):
  """Build an extractive summary from the highest-ranked sentences.

  The text is tokenized with read_content, a sentence similarity matrix is
  built with build_sim_matrix, and the sentences are ranked by running
  PageRank on the graph made from that matrix.

  input: content - article text (str)
         top_n - maximum number of sentences to keep (default 5)
  output: the selected sentences joined with ". ", best-ranked first; when
          the text has fewer than top_n sentences all of them are used

  Requires the NLTK 'stopwords' corpus to be installed
  (e.g. via nltk.download('stopwords')).
  """
  stop_words = stopwords.words('english')

  # first read in article and tokenize
  sentences = read_content(content)

  # second generate sim matrix across sentences
  sent_sim_mat = build_sim_matrix(sentences, stop_words)

  # third rank sentences in sim matrix
  sent_sim_graph = nx.from_numpy_array(sent_sim_mat)
  scores = nx.pagerank(sent_sim_graph, max_iter=5000)

  # fourth sort the rank and pick top sentences
  ranked_sent = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                       reverse=True)

  # Slicing (rather than indexing 0..top_n-1) tolerates texts with fewer
  # than top_n sentences; max() keeps a negative top_n from slicing
  # relative to the end of the list.
  chosen = ranked_sent[:max(top_n, 0)]
  summarized_text = [" ".join(words) for _, words in chosen]

  # fifth, output the summarized text
  return ". ".join(summarized_text)

In [0]:
def get_summaries(cluster_name):
  '''
  Summarize every article of a cluster, skipping the ones that fail.

  input: list of articles in cluster
  output: dictionary with valid summaries, keyed by the article's position
          in the input
  '''

  summa = {}
  for j, article in enumerate(cluster_name):
    try:
      summa[j] = generate_summary(article)
    except Exception as err:
      # Best effort: an article that cannot be summarized is left out.
      # Catching Exception instead of using a bare except lets
      # KeyboardInterrupt/SystemExit through, and the failure is logged
      # rather than silently dropped.
      logging.warning("Skipping article %d: %r", j, err)

  return summa


In [0]:
def summary_summaries(cluster):
  """
  Produce one summary from all article summaries of a cluster.

  Recall that each cluster is a dictionary
  Key: Article number
  Values: Text

  Thus, cluster - dictionary
  Returns the output of generate_summary on the joined texts.
  """
  # Join with the ". " sentence delimiter that read_content splits on.
  # Plain concatenation would fuse the last sentence of one summary with
  # the first sentence of the next.
  concat_ = ". ".join(cluster.values())

  return generate_summary(concat_)

# Generation of Summaries per Cluster


In [0]:
# Summarizing based on the given clusters

# One dictionary of article summaries per cluster label 1..5; each call
# receives the `content` column of the rows carrying that label.
cluster_summaries = [
  get_summaries(clusters[clusters.cluster_labels == label].reset_index().content)
  for label in range(1, 6)
]

In [0]:
# show the per-cluster dictionaries of article summaries
cluster_summaries

# Create Summary of Summaries for each Cluster
Each of the five clusters now has a summary for every relevant article that could be summarized. We concatenate those article summaries and summarize the result, which yields a single final summary per cluster.

In [0]:
# one final summary per cluster, built from that cluster's article summaries
summary_of_summaries = [
  summary_summaries(article_summaries)
  for article_summaries in cluster_summaries
]


In [0]:
# making sure that the number of clusters matches the number of summaries;
# the cell displays True when both lists have the same length
len(cluster_summaries) == len(summary_of_summaries)

In [0]:
# show the final summary text of each cluster
summary_of_summaries

# Evaluation of Summaries
Goal: Each summary is clustered with the original articles that were used to create the summaries.

In [0]:
# create embeddings for each summary
# (the same encoder object, emb_model, is applied to the article contents earlier)
summary_embeddings = emb_model(summary_of_summaries)

In [0]:
# create DF with titles on axis, and semantic similarities in cells
# rows: one per cluster summary; columns: one per training article.
# The article embeddings are stored as `train_embeddings` where they are
# computed, so that is the name compared against here.
cos_df = pd.DataFrame(cosine_similarity(summary_embeddings, train_embeddings))
cos_df.columns = small_data.title
# label each row with the first 50 characters of its summary
cos_df.index = [summary[:50] for summary in summary_of_summaries]

cos_df.shape

In [0]:
# func to return the column index of the top n values in a row of a dataframe

def find_topind(df, i, n):
  """Return the column positions of the n largest values in row i of df.

  input: df - DataFrame, i - row position, n - how many values to pick
  output: list of column positions, largest value first; an empty list
          when n is 0 or the row has no columns
  """
  top_pairs = heapq.nlargest(n, enumerate(df.iloc[i, :]),
                             key=operator.itemgetter(1))
  # Unpacking the pairs directly (instead of zip(*pairs)[0]) also works
  # when there are no pairs at all.
  return [position for position, _ in top_pairs]

# look up the entries of a list at the given indices
def find_top(lst, ind):
  """Return [lst[k] for each k in ind], in the order given by ind."""
  picked = []
  for position in ind:
    picked.append(lst[position])
  return picked

# how many articles per cluster
n = 10

# for every row of cos_df, collect the column positions of its n most
# similar articles; the rows are dispatched as joblib jobs
row_jobs = (delayed(find_topind)(cos_df, row, n) for row in range(len(cos_df)))
top_ind = Parallel(n_jobs=16)(row_jobs)

In [0]:
# ids of most similar articles
# each list of positions in top_ind is mapped to the matching entries of
# small_data.id via find_top
top_id = Parallel(n_jobs=16)(delayed(find_top)(small_data.id, ind) for ind in top_ind)

In [0]:
# ids of original articles
# one list of article ids per cluster label 1..5
og_ids = [
  list(clusters[clusters.cluster_labels == label].id)
  for label in range(1, 6)
]

So, what accuracy does this give?

In [0]:
# proportion of original articles clustered with summaries
# For each cluster: the share of its original article ids that appear among
# the ids retrieved for its summary; the shares are then averaged.
# The denominator is the cluster's own size instead of a literal 10, so the
# value stays a proportion of the original articles for any cluster size.
np.mean([
  sum(article_id in top_id[i] for article_id in og_ids[i]) / len(og_ids[i])
  for i in range(len(og_ids))
])