<a href="https://colab.research.google.com/github/poojith18/Text_Summarization/blob/main/Text_Summarization_Cosine_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text summarization - Cosine similarity

# Preparing the environment

In [None]:
import re
import nltk
import string
import numpy as np
import networkx as nx
from nltk.cluster.util import cosine_distance

In [None]:
# Tokenizer models and the stop-word list used below. Recent NLTK releases
# load the Punkt tokenizer from the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so both are fetched to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words shipped with NLTK; preprocess() filters tokens against
# this list.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
def preprocess(text):
  """Normalize a text for bag-of-words comparison.

  The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
  found in the module-level ``stopwords`` list and tokens made up solely of
  punctuation characters are discarded.

  Args:
    text: Raw text, typically a single sentence.

  Returns:
    The remaining tokens joined by single spaces (an empty string when
    nothing is left).
  """
  stopword_set = set(stopwords)  # constant-time membership tests
  punctuation = set(string.punctuation)

  tokens = []
  for token in nltk.word_tokenize(text.lower()):
    if token in stopword_set:
      continue
    # Check every character: a plain `token in string.punctuation` is a
    # substring test, which lets multi-character punctuation tokens such as
    # '``', "''" or '...' slip through.
    if all(char in punctuation for char in token):
      continue
    tokens.append(token)

  return ' '.join(tokens)

In [None]:
original_text = """Artificial intelligence is human like intelligence. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations."""
# Collapse every run of whitespace (including the newlines and indentation of
# the triple-quoted literal) into a single space.
original_text = re.sub(r'\s+', ' ', original_text)
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

# Function to calculate similarity between sentences

- Cosine similarity (Wikipedia): https://en.wikipedia.org/wiki/Cosine_similarity
- Step by step calculations: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/

In [None]:
# Split the text into sentences with NLTK's sentence tokenizer.
original_sentences = list(nltk.sent_tokenize(original_text))
original_sentences

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

In [None]:
# Preprocessed counterpart of every sentence, in the same order.
formatted_sentences = list(map(preprocess, original_sentences))
formatted_sentences

['artificial intelligence human like intelligence',
 'study intelligent artificial agents',
 'science engineering produce intelligent machines',
 'solve problems intelligence',
 'related intelligent behavior',
 'developing reasoning machines',
 'learn mistakes successes',
 'artificial intelligence related reasoning everyday situations']

In [None]:
def calculate_sentence_similarity(sentence1, sentence2):
  """Return the cosine similarity of two sentences' bag-of-words vectors.

  Both sentences are tokenized with ``nltk.word_tokenize`` and turned into
  word-count vectors over their combined vocabulary.

  Args:
    sentence1: First sentence (callers in this notebook pass preprocessed
      sentences).
    sentence2: Second sentence (same expectations as ``sentence1``).

  Returns:
    ``1 - cosine_distance`` of the two count vectors, or ``0.0`` when either
    sentence contains no tokens.
  """
  words1 = nltk.word_tokenize(sentence1)
  words2 = nltk.word_tokenize(sentence2)

  # A sentence without tokens produces an all-zero vector, whose norm is zero
  # and for which the cosine is undefined; treat it as sharing nothing with
  # the other sentence instead of propagating an invalid value.
  if not words1 or not words2:
    return 0.0

  # Map each distinct word to its vector position once instead of calling
  # list.index() for every token.
  position = {word: index for index, word in enumerate(set(words1 + words2))}

  # Bag of words: count the occurrences of each vocabulary word.
  vector1 = [0] * len(position)
  vector2 = [0] * len(position)
  for word in words1:
    vector1[position[word]] += 1
  for word in words2:
    vector2[position[word]] += 1

  return 1 - cosine_distance(vector1, vector2)

In [None]:
calculate_sentence_similarity(formatted_sentences[0], formatted_sentences[1])

0.18898223650461365

In [None]:
# list.index returns the position of the first element equal to its argument.
test = ['human', 'study', 'intelligence', 'agents', 'intelligent', 'artificial', 'like']
test.index('agents')

3

# Function to create the similarity matrix

In [None]:
# The higher the value, the greater the similarity between the sentences
# The more words in common, the greater the similarity

In [None]:
def calculate_similarity_matrix(sentences):
  """Build the pairwise similarity matrix of a list of sentences.

  Args:
    sentences: Sentences to compare, as accepted by
      ``calculate_sentence_similarity``.

  Returns:
    A square ``numpy`` array whose entry ``[i][j]`` is the similarity of
    sentences ``i`` and ``j``; the diagonal is left at zero.
  """
  count = len(sentences)
  similarity_matrix = np.zeros((count, count))
  # Cosine similarity is symmetric, so each unordered pair is scored once and
  # mirrored across the diagonal.
  for i in range(count):
    for j in range(i + 1, count):
      similarity = calculate_sentence_similarity(sentences[i], sentences[j])
      similarity_matrix[i][j] = similarity
      similarity_matrix[j][i] = similarity
  return similarity_matrix

In [None]:
calculate_similarity_matrix(formatted_sentences)

array([[0.        , 0.18898224, 0.        , 0.43643578, 0.        ,
        0.        , 0.        , 0.46291005],
       [0.18898224, 0.        , 0.2236068 , 0.        , 0.28867513,
        0.        , 0.        , 0.20412415],
       [0.        , 0.2236068 , 0.        , 0.        , 0.25819889,
        0.25819889, 0.        , 0.        ],
       [0.43643578, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.23570226],
       [0.        , 0.28867513, 0.25819889, 0.        , 0.        ,
        0.        , 0.        , 0.23570226],
       [0.        , 0.        , 0.25819889, 0.        , 0.        ,
        0.        , 0.        , 0.23570226],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.46291005, 0.20412415, 0.        , 0.23570226, 0.23570226,
        0.23570226, 0.        , 0.        ]])

# Function to summarize the texts

- Pagerank algorithm: https://en.wikipedia.org/wiki/PageRank


In [None]:
# Show every sentence next to its index.
for i, sentence in enumerate(original_sentences):
  print(i, sentence)

0 Artificial intelligence is human like intelligence.
1 It is the study of intelligent artificial agents.
2 Science and engineering to produce intelligent machines.
3 Solve problems and have intelligence.
4 Related to intelligent behavior.
5 Developing of reasoning machines.
6 Learn from mistakes and successes.
7 Artificial intelligence is related to reasoning in everyday situations.


In [None]:
def summarize(text, number_of_sentences, percentage = 0):
  """Build an extractive summary by ranking sentences with PageRank.

  Sentences are preprocessed, compared pairwise, and the resulting similarity
  matrix is turned into a graph that ``nx.pagerank`` scores.

  Args:
    text: Text to summarize.
    number_of_sentences: Number of sentences to keep. Ignored when
      ``percentage`` is greater than zero.
    percentage: Fraction of the sentences to keep (e.g. ``0.2``); when
      greater than zero it overrides ``number_of_sentences``.

  Returns:
    A tuple ``(original_sentences, best_sentences, ordered_scores)``: the
    sentences of ``text`` in document order, the selected sentences from
    highest to lowest score, and every ``(score, sentence)`` pair sorted by
    descending score.
  """
  original_sentences = nltk.sent_tokenize(text)
  formatted_sentences = [preprocess(sentence) for sentence in original_sentences]
  similarity_matrix = calculate_similarity_matrix(formatted_sentences)

  # One node per sentence; matrix entries become the edges between them.
  similarity_graph = nx.from_numpy_array(similarity_matrix)
  scores = nx.pagerank(similarity_graph)
  ordered_scores = sorted(((scores[i], sentence) for i, sentence in enumerate(original_sentences)), reverse=True)

  if percentage > 0:
    number_of_sentences = int(len(formatted_sentences) * percentage)
  # Never ask for more sentences than exist (or fewer than none); indexing
  # past the end of ordered_scores would raise IndexError.
  number_of_sentences = max(0, min(number_of_sentences, len(ordered_scores)))

  best_sentences = [sentence for _, sentence in ordered_scores[:number_of_sentences]]

  return original_sentences, best_sentences, ordered_scores

In [None]:
original_sentences, best_sentences, scores = summarize(original_text, 3)

In [None]:
original_sentences

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

In [None]:
best_sentences

['Artificial intelligence is related to reasoning in everyday situations.',
 'Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.']

In [None]:
scores

[(0.21117425706958176,
  'Artificial intelligence is related to reasoning in everyday situations.'),
 (0.16732274004260209, 'Artificial intelligence is human like intelligence.'),
 (0.14555588797159139, 'It is the study of intelligent artificial agents.'),
 (0.12904008009949403, 'Related to intelligent behavior.'),
 (0.12753383065335558,
  'Science and engineering to produce intelligent machines.'),
 (0.10880185703391372, 'Solve problems and have intelligence.'),
 (0.08959232615044063, 'Developing of reasoning machines.'),
 (0.020979020979020983, 'Learn from mistakes and successes.')]

In [None]:
from IPython.display import HTML, display

def visualize(title, sentence_list, best_sentences):
  """Render the text as HTML with the summary sentences highlighted.

  Args:
    title: Heading shown after the "Summary - " prefix.
    sentence_list: All sentences, in the order they should be shown.
    best_sentences: Sentences to wrap in ``<mark>`` tags.
  """
  from html import escape  # local import keeps this cell self-contained

  selected = set(best_sentences)
  text = ''
  for sentence in sentence_list:
    # The sentences may come from scraped web pages: escape them so stray
    # markup is shown literally instead of being interpreted by the browser.
    safe_sentence = escape(str(sentence))
    if sentence in selected:
      text += ' ' + f"<mark>{safe_sentence}</mark>"
    else:
      text += ' ' + safe_sentence

  display(HTML(f'<h1>Summary - {escape(str(title))}</h1>'))
  display(HTML(f""" {text} """))

In [None]:
visualize('Artificial intelligence', original_sentences, best_sentences)

# Extracting texts from the Internet

In [None]:
!pip install goose3

In [None]:
# Fetch the page with Goose; the resulting article object provides the
# `title` and `cleaned_text` attributes used in the cells below.
from goose3 import Goose
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [None]:
article.cleaned_text

'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.\n\nIn addition to text, images and videos can also be summarized. Text summarization finds the most informative sentences in a document;[1] image summarization finds the most representative images within an image collection[citation needed]; video summarization extracts the most important frames from the video content.[2]\n\nThere are two general approaches to automatic summarization: extraction and abstraction.\n\nHere, content is extracted from the original data, but the extracted content is not modified in any way. Examples of extracted content include key-phrases that can be used to "tag" or index a text document, or key sentences (including headings) that collectively comprise an abstract, and representative images or video segments, as stated above. For text, extraction is analog

In [None]:
# NOTE: percentage=0.2 overrides number_of_sentences inside summarize(), so
# the 120 passed here is ignored and 20% of the sentences are selected.
original_sentences, best_sentences, scores = summarize(article.cleaned_text, 120, 0.2)

In [None]:
# Share of the article's sentences that ended up in the summary, in percent.
# Uses the actual summary length: the 120 passed to summarize() is ignored
# because a percentage was supplied.
(len(best_sentences) / len(original_sentences)) * 100

40.54054054054054

In [None]:
original_sentences

['Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.',
 'In addition to text, images and videos can also be summarized.',
 'Text summarization finds the most informative sentences in a document;[1] image summarization finds the most representative images within an image collection[citation needed]; video summarization extracts the most important frames from the video content.',
 '[2]\n\nThere are two general approaches to automatic summarization: extraction and abstraction.',
 'Here, content is extracted from the original data, but the extracted content is not modified in any way.',
 'Examples of extracted content include key-phrases that can be used to "tag" or index a text document, or key sentences (including headings) that collectively comprise an abstract, and representative images or video segments, as stated above.',
 'For text, 

In [None]:
best_sentences

['The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".',
 'Another important distinction is that TextRank was used for single document summarization, while LexRank has been applied to multi-document summarization.',
 '"Summarizing Conceptual Graphs for Automatic Summarization Task".',
 'Some unsupervised summarization approaches are based on finding a "centroid" sentence, which is the mean word vector of all the sentences in the document.',
 'Like keyphrase extraction, document summarization aims to identify the essence of a text.',
 'Image collection summarization is another application example of automatic summarization.',
 'An example of a summarization problem is document summarization, which attempts to automatically produce an abstract from a given document.',
 'Text summarization finds the most in

In [None]:
scores

[(0.008066410954445168,
  'The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".'),
 (0.007128879450849699,
  'Another important distinction is that TextRank was used for single document summarization, while LexRank has been applied to multi-document summarization.'),
 (0.006955931011572642,
  '"Summarizing Conceptual Graphs for Automatic Summarization Task".'),
 (0.006863732638140646,
  'Some unsupervised summarization approaches are based on finding a "centroid" sentence, which is the mean word vector of all the sentences in the document.'),
 (0.0066832688001826545,
  'Like keyphrase extraction, document summarization aims to identify the essence of a text.'),
 (0.006670697377283339,
  'Image collection summarization is another application example of automatic summarization.'),
 (0.006633247787878496,
 

In [None]:
visualize(article.title, original_sentences, best_sentences)