### Summarization using Sentence Weights

In [None]:
# Fetch the NLTK resources used further down: the English stop-word list
# (read through stopwords.words) and the Punkt tokenizer data that
# word_tokenize / sent_tokenize rely on.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# import required libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import re
import urllib.request 
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
import json

# Load the parsed JSON document into `data`. A later cell indexes it as
# data['1'] and iterates the result as a list of raw text segments.
# The context manager closes the file even when json.load raises, which the
# previous manual open()/close() pair did not guarantee.
with open('/content/class_text.json', encoding='utf-8') as f:
    data = json.load(f)


In [None]:
# Lemmatizer instance. In the visible notebook it is only referenced by the
# commented-out lemmatization line inside text_clean_preprocess.
wnl = WordNetLemmatizer()


def text_clean_preprocess(raw_text : str):
    """
    Keep only the ASCII-letter words of ``raw_text`` and normalise whitespace.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation,
    apostrophes, newlines, non-breaking spaces, accented letters, ...) is
    treated as a word separator. The surviving words are joined with single
    spaces and their case is left untouched.

    No stemming, lemmatization, stop-word removal or word-length filtering is
    performed here.

    Parameters
    ----------
    raw_text : str
        Text to clean.

    Returns
    -------
    str
        The letters-only words separated by single spaces; an empty string
        when ``raw_text`` contains no ASCII letters.
    """
    # A single substitution is sufficient: once every non-letter has been
    # replaced by a space there are no apostrophes, digits or other non-word
    # characters left for further passes to remove.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # split() without arguments collapses runs of whitespace and trims the ends.
    return " ".join(letters_only.split())

In [None]:
data['1']

['\n\xa0\nAdama takes the following precautions with the data it collects from   App Users:\nData collected from the App is anonymized;\nSecurity measures have been implemented to protect the data from     unauthorized access;\nAccess to the data collected from Users is restricted to     authorized personnel; and\nParties accessing data must agree to confidentiality terms at     least as restrictive as those contained in this privacy policy\nWhere necessary Adama shall take legal action to protect the     information it collects using the App',
 '\n\xa0Adama recognizes that the precautions it takes with User Data     cannot fully guarantee that malicious third parties will not access     the information used by the App',
 '\xa0 Although we perform routine backups of data, you are solely   responsible for all data that you transmit or that relates to any   activity you have undertaken using the App']

In [None]:
# Clean every raw text segment stored under key '1', preserving their order.
article = [text_clean_preprocess(raw_segment) for raw_segment in data['1']]



In [None]:
article

['Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App',
 'Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App',
 'Although we perform routine backups of data you are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the App']

In [None]:
# load data
# with open('data/BBC News Summary/News Articles/tech/001.txt') as f1:
#     article = f1.read()

# with open('data/BBC News Summary/Summaries/tech/001.txt') as f2:
#     summary = f2.read()

In [None]:
# pre-processing
#remove newlines
#article = article.replace('\n', '')

In [None]:
def create_dictionary_table(text_list) -> dict:
    """
    Build a ``stem -> occurrence count`` table for the given texts.

    The texts are joined with single spaces, tokenized with ``word_tokenize``
    and every kept token is reduced to its Porter stem. English stop words are
    excluded from the table.

    Parameters
    ----------
    text_list : iterable of str
        Text segments whose words are counted.

    Returns
    -------
    dict
        Maps each kept stem to the number of tokens that produced it.
    """
    text_string = " ".join(text_list)

    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    frequency_table = dict()
    for token in word_tokenize(text_string):
        # Test the raw token (lower-cased) BEFORE stemming. Checking only the
        # stem lets stop words through whenever stemming alters them
        # (e.g. "this" -> "thi", "was" -> "wa"), because the altered form is
        # not in the stop-word list.
        if token.lower() in stop_words:
            continue
        stemmed = stemmer.stem(token)
        # The post-stemming check is kept so that everything that was
        # excluded before this fix is still excluded.
        if stemmed in stop_words:
            continue
        frequency_table[stemmed] = frequency_table.get(stemmed, 0) + 1

    return frequency_table

In [None]:
def calculate_sentence_scores(sentences, frequency_table) -> dict:
    """
    Score each sentence by the mean frequency of the table terms it contains.

    A term from ``frequency_table`` counts as present in a sentence when it
    occurs as a substring of the lower-cased sentence. The score is the sum of
    the frequencies of the present terms divided by how many terms were
    present. A sentence containing no term scores ``0.0`` (previously this
    raised ``KeyError`` / ``ZeroDivisionError``).

    Parameters
    ----------
    sentences : iterable of str
        Sentences to score.
    frequency_table : dict
        Maps a term to its frequency.

    Returns
    -------
    dict
        Maps the first 7 characters of each sentence to its score. This key
        format is what ``get_article_summary`` looks up, so it is kept as is.

    Notes
    -----
    * Sentences that share the same 7-character prefix share one key; the
      score of the later sentence replaces the earlier one (scores are no
      longer accumulated across such sentences).
    * NOTE(review): substring matching means a short term such as "us" also
      matches inside "users". It is left unchanged here so existing scores are
      preserved.
    """
    sentence_weight = dict()

    for sentence in sentences:
        lowered = sentence.lower()
        # Frequencies of every table term that appears in this sentence.
        matched = [
            frequency
            for term, frequency in frequency_table.items()
            if term in lowered
        ]
        # Computed per sentence and assigned in one step so that an existing
        # entry under the same key is never added to.
        sentence_weight[sentence[:7]] = sum(matched) / len(matched) if matched else 0.0

    return sentence_weight

In [None]:
def calculate_average_score(sentence_weight) -> float:
    """
    Return the arithmetic mean of the sentence scores.

    Parameters
    ----------
    sentence_weight : dict
        Maps a sentence key to its numeric score.

    Returns
    -------
    float
        Mean of the values of ``sentence_weight``; ``0.0`` when the mapping is
        empty (previously this raised ``ZeroDivisionError``).
    """
    if not sentence_weight:
        return 0.0

    return sum(sentence_weight.values()) / len(sentence_weight)

In [None]:
def get_article_summary(sentences, sentence_weight, threshold):
    """
    Join the sentences whose score reaches ``threshold`` into a summary.

    Parameters
    ----------
    sentences : iterable of str
        Candidate sentences, in the order they should appear in the summary.
    sentence_weight : dict
        Maps the first 7 characters of a sentence to its score. Sentences
        whose key is missing are skipped.
    threshold : number
        Minimum score (inclusive) a sentence needs to be kept.

    Returns
    -------
    str
        The kept sentences separated by single spaces, with no leading space;
        an empty string when nothing qualifies.
    """
    selected = [
        sentence
        for sentence in sentences
        if sentence[:7] in sentence_weight
        and sentence_weight[sentence[:7]] >= threshold
    ]
    # join() avoids both the repeated string concatenation and the stray
    # leading space the previous `" " + sentence` accumulation produced.
    return " ".join(selected)

In [None]:
# Word-frequency table built over all cleaned text segments.
frequency_table = create_dictionary_table(article)

# `article` is already a list of cleaned segments, so each segment is used as
# one "sentence" instead of running sent_tokenize on a single string.
#sentences = sent_tokenize(article)

sentences = article

# Score every sentence from the frequencies of the table words it contains.
sentence_scores = calculate_sentence_scores(sentences, frequency_table)

# The mean sentence score serves as the selection threshold.
threshold = calculate_average_score(sentence_scores)

# Keep the sentences whose score reaches the threshold.
article_summary = get_article_summary(sentences, sentence_scores, threshold)

In [None]:
article_summary

' Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App'

In [None]:
article

['Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App',
 'Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App',
 'Although we perform routine backups of data you are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the App']

### Summarization using Graphs

In [None]:
# import os
# import pandas as pd
# path_, filename_, category_, article_or_summary_ = [],[],[],[]
# for dirname, _, filenames in os.walk('data\BBC News Summary'):
#     for filename in filenames:
#         path_.append(os.path.join(dirname, filename))
#         filename_.append(filename)
#         category_.append(dirname.split("\\")[-1])
#         article_or_summary_.append(dirname.split("\\")[-2])

In [None]:
# df = pd.DataFrame({"path":path_, "filename":filename_, "category":category_, "article_or_summary":article_or_summary_}, columns=["path", "filename", "category", "article_or_summary"])
# df.head()

In [None]:
pip install plotly_express

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Installing collected packages: plotly_express
Successfully installed plotly_express-0.4.1


In [None]:
# Plotting helpers. In the visible notebook `pe` is not referenced again and
# cufflinks is only needed by the commented-out `.iplot(...)` cells below.
import plotly_express as pe
import cufflinks as cf
# Put cufflinks into offline mode.
cf.go_offline()

In [None]:
# from collections import Counter

# ct = Counter(df[df['article_or_summary']=="News Articles"]["category"])
# pd.DataFrame({"category":ct.keys(), "value":ct.values()}).iplot(kind='bar', x='category', y='value')

In [None]:
# pd.DataFrame({"category":ct.keys(), "value":ct.values()}).iplot(kind='pie', labels="category", values='value')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import numpy as np
import networkx as nx
import re

In [None]:
# def read_article(text):        
#     sentences =[]        
#     sentences = sent_tokenize(text)    
#     for sentence in sentences:        
#         sentence.replace("[^a-zA-Z0-9]"," ")
#         sentence.replace("\n", " ")
#     return sentences

In [None]:
# file_path = df[df['article_or_summary']=='News Articles'].iloc[0]['path']
# with open(file_path, "r") as f:
#     article = f.read()

In [None]:
# sent_tok = read_article(article)

In [None]:
# spell correction

# from textblob import TextBlob
# mod_sent = []
# for tok in sent_tok:
#     blob_obj = TextBlob(tok)
#     correct_sent = str(blob_obj.correct())
#     print(f"\033[94m Original Token : {tok} \033[0m")
#     print(f"\033[92m Corrected Token: {correct_sent} \033[92m")
#     mod_sent.append(correct_sent)

In [None]:
# Reuse the cleaned segments from the first section as the sentence list for
# the graph-based summarizer (this is an alias of `article`, not a copy).
sent_tok = article

In [None]:
sent_tok

['Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App',
 'Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App',
 'Although we perform routine backups of data you are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the App']

In [None]:
import tensorflow_hub as hub

# Load the Universal Sentence Encoder (version 4) model from TF Hub.
# Below, `embed` is called with a list of strings and its result is indexed
# per input sentence.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def sentence_similarity(sent1,sent2,embed):
    """
    Cosine similarity between the embeddings of two sentences.

    The previous implementation returned ``1 - cosine_similarity`` (a
    distance), so a higher value meant the sentences were LESS alike even
    though the result is used and printed as a similarity score.

    Parameters
    ----------
    sent1, sent2 : str
        Sentences to compare.
    embed : callable
        Called once with ``[sent1, sent2]``; must return one vector per input
        sentence, in input order.

    Returns
    -------
    float
        Cosine of the angle between the two embedding vectors (1.0 for
        vectors pointing the same way), or ``0.0`` when either vector has
        zero length.
    """
    # One batched call instead of one call per sentence.
    vectors = np.asarray(embed([sent1, sent2]))
    vec_a, vec_b = vectors[0], vectors[1]

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # A zero vector has no direction; avoid dividing by zero.
        return 0.0

    return float(np.dot(vec_a, vec_b) / denominator)

In [None]:
# Show the first two sentences and the value sentence_similarity returns for
# them. "\033[92m" is an ANSI colour escape; no reset sequence is emitted.
print(f"\033[92m Sentence 1 : {sent_tok[0]}")
print(f"\033[92m Sentence 2 : {sent_tok[1]}")
print(f"\033[92m Similarity Score : {sentence_similarity(sent_tok[0], sent_tok[1], embed)}")

[92m Sentence 1 : Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App
[92m Sentence 2 : Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App
[92m Similarity Score : 0.4373796582221985


In [None]:
def build_similarity_matrix(sentences,embeds):
    """
    Pairwise cosine-similarity matrix of the given sentences.

    All sentences are embedded in a single call to ``embeds`` (the previous
    version embedded both sentences again for every ordered pair). Entry
    ``[i][j]`` is the cosine similarity of sentences ``i`` and ``j``, so a
    larger value means the sentences are more alike; the previous version
    stored ``1 - cosine_similarity`` here.

    Parameters
    ----------
    sentences : sequence of str
        Sentences to compare.
    embeds : callable
        Called once with the list of sentences; must return one vector per
        sentence, in input order.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n, n)`` float array with a zero diagonal and values in
        ``[0, 1]``.
    """
    count = len(sentences)
    if count == 0:
        return np.zeros((0, 0))

    vectors = np.asarray(embeds(list(sentences)), dtype=float)

    # Normalise each row to unit length; zero vectors are left as zeros
    # (dividing by 1) so they simply get similarity 0 with everything.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms

    # Negative cosines are clipped to 0 so every entry is a non-negative
    # weight (assumption: the graph-ranking step that consumes this matrix
    # expects non-negative edge weights). The upper clip absorbs rounding
    # error slightly above 1.
    similarity_matrix = np.clip(unit_vectors @ unit_vectors.T, 0.0, 1.0)

    # A sentence is not compared with itself.
    np.fill_diagonal(similarity_matrix, 0.0)
    return similarity_matrix

In [None]:
# Pairwise score matrix for the cleaned sentences (the diagonal stays 0).
sim_mat = build_similarity_matrix(sent_tok, embed)

In [None]:
sim_mat

array([[0.        , 0.43737966, 0.59911197],
       [0.43737966, 0.        , 0.54380095],
       [0.59911197, 0.54380095, 0.        ]])

In [None]:
# from bokeh.io import output_notebook, show, save
# from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
# from bokeh.plotting import figure
# from bokeh.plotting import from_networkx
# import networkx
# from bokeh.io import output_notebook, show, save

# output_notebook()

# g = nx.Graph()

# for i in range(sim_mat.shape[0]):
#     for j in range(sim_mat.shape[1]):
#         if sim_mat[i][j] >=.9:
#             g.add_edge(i, j)

# HOVER_TOOLTIPS = [("sent_tok", "@index")]
# plot = figure(tooltips = HOVER_TOOLTIPS, tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',x_range=Range1d(-10.1, 10.1), y_range=Range1d(-10.1, 10.1))

# network_graph = from_networkx(g, networkx.spring_layout, scale=7, center=(0, 0))
# network_graph.node_renderer.glyph = Circle(size=15,fill_color='green')
# network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)
# plot.renderers.append(network_graph)
# show(plot)

In [None]:
# # summarization

# file_path_summary = df[df['article_or_summary']=='Summaries'].iloc[0]['path']
# with open(file_path_summary, "r") as f:
#     actual_summary = f.read()

NameError: ignored

In [None]:
def generate_summary(text,top_n,embeds):
    """
    Pick the ``top_n`` highest-ranked sentences with PageRank.

    A graph is built from the pairwise matrix returned by
    ``build_similarity_matrix`` and every sentence is scored with
    ``nx.pagerank``.

    Parameters
    ----------
    text : sequence of str
        The sentences to rank (already split; no tokenization happens here).
    top_n : int
        Maximum number of sentences to return. Values larger than the number
        of sentences return all of them (previously this raised
        ``IndexError``).
    embeds : callable
        Embedding function forwarded to ``build_similarity_matrix``.

    Returns
    -------
    str
        The selected sentences joined with single spaces, ordered from the
        highest PageRank score downwards; an empty string when there is
        nothing to select.
    """
    sentences = list(text)
    if not sentences or top_n <= 0:
        return ""

    # The debug print of the full input that used to sit here was removed.
    sentence_similarity_matrix = build_similarity_matrix(sentences,embeds)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Highest score first; ties fall back to comparing the sentence text.
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)),reverse=True)

    # Slicing caps the selection at the number of available sentences.
    return " ".join(sentence for _, sentence in ranked_sentences[:top_n])

In [None]:
# Full cleaned text (kept for comparison) and a graph-based summary of the
# same segments limited to two sentences.
Original_Text = " ".join(article)
Summarized_Text = generate_summary(article, top_n=2, embeds=embed)

['Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App', 'Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App', 'Although we perform routine backups of data you are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the App']


In [None]:
Original_Text

'Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App Adama recognizes that the precautions it takes with User Data cannot fully guarantee that malicious third parties will not access the information used by the App Although we perform routine backups of data you are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the App'

In [None]:
Summarized_Text

'Although we perform routine backups of data you are solely responsible for all data that you transmit or that relates to any activity you have undertaken using the App Adama takes the following precautions with the data it collects from App Users Data collected from the App is anonymized Security measures have been implemented to protect the data from unauthorized access Access to the data collected from Users is restricted to authorized personnel and Parties accessing data must agree to confidentiality terms at least as restrictive as those contained in this privacy policy Where necessary Adama shall take legal action to protect the information it collects using the App'

In [None]:
# actual_summary

In [None]:
# import nltk

# hypothesis = Summarized_Text
# reference = actual_summary
# BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
# print(f"BLEUscore : {BLEUscore}")