<a href="https://colab.research.google.com/github/s-miramontes/News_Filter/blob/master/notebooks/eval_bert_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation of Summarization with BERT

In [0]:
# install libraries 

#!pip install bert-extractive-summarizer

#!pip install spacy==2.1.3
#!pip install transformers==2.2.2
#!pip install neuralcoref

#!pip install torch

In [0]:
# import statements 

import pandas as pd

from summarizer import Summarizer

from sklearn.metrics.pairwise import cosine_similarity

from joblib import Parallel, delayed

import heapq
import operator

from absl import logging

import tensorflow as tf
import tensorflow_hub as hub

## Create Summaries for Clusters from Training Data

In [0]:
# load the clustered articles from CSV; later cells read its
# `cluster_labels` and `content` columns

clusters = pd.read_csv("news_filter/data/clusters.csv")

In [0]:
# instantiate the summarizer with its default arguments; this notebook-level
# object is called both per article and on the joined per-cluster summaries
model = Summarizer()

# function to return summary of each article in cluster
def make_summaries(cluster, summarizer=None):
  """Summarize every article in a cluster.

  Args:
    cluster: DataFrame with a `content` column holding the article texts.
    summarizer: optional callable invoked as
      `summarizer(text, min_length=50, ratio=0.20)`. When omitted, the
      notebook-level `model` is used.

  Returns:
    A dict mapping each article's position in `cluster` (0, 1, 2, ...) to its
    summary string. An empty cluster gives an empty dict.
  """
  summarize = model if summarizer is None else summarizer
  result = {}
  # iterate over the values by position rather than looking up labels 0..n-1,
  # so a frame whose index was not reset (e.g. a filtered slice) still works
  for i, text in enumerate(cluster["content"]):
    summary = summarize(text, min_length=50, ratio=0.20)
    result[i] = ''.join(summary)
  return result

In [0]:
# summarize every article in clusters 1 through 5: one {position: summary}
# dict per cluster label, in label order
cluster_summaries = [
  make_summaries(clusters[clusters.cluster_labels == label].reset_index())
  for label in range(1, 6)
]

#Parallel(n_jobs=16)(delayed(make_summaries)(clusters[clusters.cluster_labels == i].reset_index()) for i in range(1,6)) # pickling error 

In [0]:
# show the list of per-cluster {position: summary} dicts
cluster_summaries

## Create Summary of Summaries for each Cluster

In [0]:
# condense each cluster: join its article summaries with spaces and run the
# summarizer (default settings) over the joined text
summary_of_summaries = [
  model(' '.join(article_summaries.values()))
  for article_summaries in cluster_summaries
]

In [0]:
# show the one-per-cluster summaries
summary_of_summaries

## Evaluate Summary of Summaries with Universal Sentence Encoder

Goal: each summary of summaries should be grouped with (i.e., be most similar to) the original articles that were used to create it.

In [0]:
# load the original training articles; later cells read its `content`,
# `title` and `id` columns
small_data = pd.read_csv("news_filter/data/small_data.csv")

In [0]:
# load the Universal Sentence Encoder from a local copy; first download it from
# https://tfhub.dev/google/universal-sentence-encoder/4 and save it under news_filter/tmp
emb_model = hub.load("news_filter/tmp")

In [0]:
# reduce logging output: only messages at ERROR level or above are shown
logging.set_verbosity(logging.ERROR)

# compute one embedding per training article from its `content` text
train_embeddings = emb_model(small_data.content)

In [0]:
# create one embedding per cluster-level summary (the summaries of summaries)
summary_embeddings = emb_model(summary_of_summaries)

In [0]:
# cosine similarity table: one row per cluster summary, one column per article
similarity = cosine_similarity(summary_embeddings, train_embeddings)
cos_df = pd.DataFrame(similarity)
# columns carry the article titles; rows carry the first 50 characters of each summary
cos_df.columns = small_data.title
cos_df.index = [text[:50] for text in summary_of_summaries]

cos_df.shape

In [0]:
# function to return the column index of the top n values in a row of a dataframe
def find_topind(df, i, n):
  """Return the column positions of the n largest values in row i of df.

  Args:
    df: DataFrame to scan; the row is selected by position via df.iloc[i, :].
    i: integer position of the row.
    n: number of positions to return.

  Returns:
    A list of integer column positions ordered from the largest value to the
    smallest. It is shorter than n when the row holds fewer than n values, and
    empty when n is 0 or the row is empty.
  """
  row = df.iloc[i, :]
  # nlargest yields (position, value) pairs ranked by value
  top_pairs = heapq.nlargest(n, enumerate(row), key=operator.itemgetter(1))
  # unpack the pairs directly: transposing with zip(*pairs) and taking [0]
  # raises IndexError when there are no pairs at all
  return [position for position, _ in top_pairs]

# function to return the top n values in a list
def find_top(lst, ind):
  """Return the elements of lst found at the positions listed in ind.

  Args:
    lst: a sequence such as a list, or a pandas Series.
    ind: iterable of integer positions.

  Returns:
    A list with one element per entry of ind, in the same order.
  """
  # A Series subscripted with an int is looked up by index label, which only
  # matches the position when the index is the default 0..n-1; go through
  # .iloc so positions are always read as positions.
  source = lst.iloc if isinstance(lst, pd.Series) else lst
  return [source[i] for i in ind]

# number of most-similar articles to keep for each summary
n = 10

# for every row of cos_df, collect the column positions of its n highest similarities
row_jobs = (delayed(find_topind)(cos_df, row, n) for row in range(len(cos_df)))
top_ind = Parallel(n_jobs=16)(row_jobs)

In [0]:
# translate each row's column positions into the ids of the matching articles
id_jobs = (delayed(find_top)(small_data.id, positions) for positions in top_ind)
top_id = Parallel(n_jobs=16)(id_jobs)

top_id