<a href="https://colab.research.google.com/github/poojith18/Text_Summarization/blob/main/Text_Summarization_Frequency_based_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text summarization - Frequency based algorithm

# Preprocessing the texts

In [None]:
import re # regular expression
import nltk # natural language toolkit
import string # string.punctuation is used to drop punctuation tokens

In [None]:
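# NOTE(review): the original note here says the word "machine" was added at the end of the last sentence, but the last sentence below ends with "situations" - confirm which is intended.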
original_text = """Artificial intelligence is human like intelligence. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations."""

In [None]:
original_text

'Artificial intelligence is human like intelligence. \n                   It is the study of intelligent artificial agents. \n                   Science and engineering to produce intelligent machines. \n                   Solve problems and have intelligence. \n                   Related to intelligent behavior. \n                   Developing of reasoning machines. \n                   Learn from mistakes and successes. \n                   Artificial intelligence is related to reasoning in everyday situations.'

In [None]:
# Collapse each run of whitespace (the newlines and indentation left by the
# triple-quoted literal) into a single space so the text becomes one line.
original_text = re.sub(r'\s+', ' ', original_text)

In [None]:
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

In [None]:
# NLTK 3.9 and later load the Punkt tokenizer from the 'punkt_tab' resource,
# while older releases read 'punkt'. Fetch both so the word_tokenize /
# sent_tokenize calls below work on either; 'punkt' is requested last so the
# cell still displays the result of that download.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop-word list from the NLTK corpus downloaded above. preprocess()
# below reads this module-level name when filtering tokens.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
len(stopwords)

179

In [None]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def preprocess(text):
  """Lowercase `text`, tokenize it and drop stop words and punctuation.

  Args:
    text: Raw text to clean.

  Returns:
    The remaining tokens joined by single spaces.

  Reads the module-level `stopwords` list.
  """
  # The NLTK stop-word list is a plain list; sets give constant-time lookups.
  stopword_set = set(stopwords)
  punctuation_chars = set(string.punctuation)

  kept_tokens = []
  for token in nltk.word_tokenize(text.lower()):
    if token in stopword_set:
      continue
    # `token in string.punctuation` is a substring test, so multi-character
    # punctuation tokens such as the `` and '' quote marks emitted by
    # word_tokenize used to slip through. Drop every token that consists
    # only of punctuation characters instead.
    if all(char in punctuation_chars for char in token):
      continue
    kept_tokens.append(token)

  return ' '.join(kept_tokens)

In [None]:
formatted_text = preprocess(original_text)
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

# Word frequency

In [None]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

FreqDist({'agents': 1,
          'artificial': 3,
          'behavior': 1,
          'developing': 1,
          'engineering': 1,
          'everyday': 1,
          'human': 1,
          'intelligence': 4,
          'intelligent': 3,
          'learn': 1,
          'like': 1,
          'machines': 2,
          'mistakes': 1,
          'problems': 1,
          'produce': 1,
          'reasoning': 2,
          'related': 2,
          'science': 1,
          'situations': 1,
          'solve': 1,
          'study': 1,
          'successes': 1})

In [None]:
word_frequency['intelligence']

4

In [None]:
word_frequency.keys()

dict_keys(['artificial', 'intelligence', 'human', 'like', 'study', 'intelligent', 'agents', 'science', 'engineering', 'produce', 'machines', 'solve', 'problems', 'related', 'behavior', 'developing', 'reasoning', 'learn', 'mistakes', 'successes', 'everyday', 'situations'])

In [None]:
len(word_frequency.keys())

22

In [None]:
highest_frequency = max(word_frequency.values())
highest_frequency

4

In [None]:
# Divide every count by the highest count, so the most frequent word ends up
# with a weight of 1.0. Iterates over a snapshot of the keys.
for word in list(word_frequency):
  word_frequency[word] /= highest_frequency

In [None]:
word_frequency

FreqDist({'agents': 0.25,
          'artificial': 0.75,
          'behavior': 0.25,
          'developing': 0.25,
          'engineering': 0.25,
          'everyday': 0.25,
          'human': 0.25,
          'intelligence': 1.0,
          'intelligent': 0.75,
          'learn': 0.25,
          'like': 0.25,
          'machines': 0.5,
          'mistakes': 0.25,
          'problems': 0.25,
          'produce': 0.25,
          'reasoning': 0.5,
          'related': 0.5,
          'science': 0.25,
          'situations': 0.25,
          'solve': 0.25,
          'study': 0.25,
          'successes': 0.25})

# Sentence tokenization

In [None]:
'Phd John went home. He arrived early.'.split('.')

['Phd John went home', ' He arrived early', '']

In [None]:
'Ph.d John went home. He arrived early.'.split('.')

['Ph', 'd John went home', ' He arrived early', '']

In [None]:
nltk.sent_tokenize('Ph.d John went home. He arrived early.')

['Ph.d John went home.', 'He arrived early.']

In [None]:
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

# Generate the summary (score for sentences)

In [None]:
word_frequency

FreqDist({'agents': 0.25,
          'artificial': 0.75,
          'behavior': 0.25,
          'developing': 0.25,
          'engineering': 0.25,
          'everyday': 0.25,
          'human': 0.25,
          'intelligence': 1.0,
          'intelligent': 0.75,
          'learn': 0.25,
          'like': 0.25,
          'machines': 0.5,
          'mistakes': 0.25,
          'problems': 0.25,
          'produce': 0.25,
          'reasoning': 0.5,
          'related': 0.5,
          'science': 0.25,
          'situations': 0.25,
          'solve': 0.25,
          'study': 0.25,
          'successes': 0.25})

In [None]:
# Score each sentence as the running sum of the weights of its lowercased
# tokens, looked up in word_frequency.
score_sentences = {}
for sentence in sentence_list:
  for word in nltk.word_tokenize(sentence.lower()):
    score_sentences[sentence] = score_sentences.get(sentence, 0) + word_frequency[word]

In [None]:
score_sentences

{'Artificial intelligence is human like intelligence.': 3.25,
 'Artificial intelligence is related to reasoning in everyday situations.': 3.25,
 'Developing of reasoning machines.': 1.25,
 'It is the study of intelligent artificial agents.': 2.0,
 'Learn from mistakes and successes.': 0.75,
 'Related to intelligent behavior.': 1.5,
 'Science and engineering to produce intelligent machines.': 2.0,
 'Solve problems and have intelligence.': 1.5}

In [None]:
score_sentences['Solve problems and have intelligence.']

1.5

In [None]:
score_sentences.keys()

dict_keys(['Artificial intelligence is human like intelligence.', 'It is the study of intelligent artificial agents.', 'Science and engineering to produce intelligent machines.', 'Solve problems and have intelligence.', 'Related to intelligent behavior.', 'Developing of reasoning machines.', 'Learn from mistakes and successes.', 'Artificial intelligence is related to reasoning in everyday situations.'])

In [None]:
import heapq
# Keep the three highest-scoring sentences, best first.
best_sentences = heapq.nlargest(
    3,
    score_sentences.keys(),
    key=lambda sentence_text: score_sentences[sentence_text],
)

In [None]:
best_sentences

['Artificial intelligence is human like intelligence.',
 'Artificial intelligence is related to reasoning in everyday situations.',
 'It is the study of intelligent artificial agents.']

In [None]:
summary = ' '.join(best_sentences)
summary

'Artificial intelligence is human like intelligence. Artificial intelligence is related to reasoning in everyday situations. It is the study of intelligent artificial agents.'

In [None]:
original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

# Visualizing the summary in HTML

In [None]:
# IPython.display is the public import location. Importing `display`
# explicitly also keeps the cells below from relying on the notebook
# front end injecting it as a builtin.
from IPython.display import HTML, display

In [None]:
display(HTML(f'<h2>Summary</h2>'))

# Rebuild the text sentence by sentence, wrapping the selected sentences in
# <mark> so they render highlighted.
text = ''
for sentence in sentence_list:
  text += ' ' + (f"<mark>{sentence}</mark>" if sentence in best_sentences else sentence)

display(HTML(f"""{text}"""))

# Extracting texts from the Internet

In [None]:
!pip install goose3

In [None]:
from goose3 import Goose

In [None]:
# Extract the article at `url` with Goose. The returned object exposes
# `infos`, `title` and `cleaned_text`, which the next cells inspect.
g = Goose()
url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
article = g.extract(url)

In [None]:
article.infos

{'authors': [],
 'cleaned_text': 'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.\n\nIn addition to text, images and videos can also be summarized. Text summarization finds the most informative sentences in a document;[1] image summarization finds the most representative images within an image collection[citation needed]; video summarization extracts the most important frames from the video content.[2]\n\nThere are two general approaches to automatic summarization: extraction and abstraction.\n\nHere, content is extracted from the original data, but the extracted content is not modified in any way. Examples of extracted content include key-phrases that can be used to "tag" or index a text document, or key sentences (including headings) that collectively comprise an abstract, and representative images or video segments, as stated abov

In [None]:
article.title

'Automatic summarization - Wikipedia'

In [None]:
article.cleaned_text

'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.\n\nIn addition to text, images and videos can also be summarized. Text summarization finds the most informative sentences in a document;[1] image summarization finds the most representative images within an image collection[citation needed]; video summarization extracts the most important frames from the video content.[2]\n\nThere are two general approaches to automatic summarization: extraction and abstraction.\n\nHere, content is extracted from the original data, but the extracted content is not modified in any way. Examples of extracted content include key-phrases that can be used to "tag" or index a text document, or key sentences (including headings) that collectively comprise an abstract, and representative images or video segments, as stated above. For text, extraction is analog

In [None]:
len(article.cleaned_text)

36000

In [None]:
formatted_article = preprocess(article.cleaned_text)
formatted_article

"automatic summarization process shortening set data computationally create subset summary represents important relevant information within original content addition text images videos also summarized text summarization finds informative sentences document 1 image summarization finds representative images within image collection citation needed video summarization extracts important frames video content 2 two general approaches automatic summarization extraction abstraction content extracted original data extracted content modified way examples extracted content include key-phrases used `` tag '' index text document key sentences including headings collectively comprise abstract representative images video segments stated text extraction analogous process skimming summary available headings subheadings figures first last paragraphs section optionally first last sentences paragraph read one chooses read entire document detail 3 examples extraction include key sequences text terms clinic

In [None]:
len(formatted_article)

27352

In [None]:
def summarize(text, number_of_sentences, percentage = 0):
  """Pick the highest-scoring sentences of `text` by normalized word frequency.

  Args:
    text: Raw text to summarize.
    number_of_sentences: How many sentences to select when `percentage` is
      not greater than 0.
    percentage: Optional fraction (e.g. 0.5) of the sentences to select. When
      greater than 0 it takes precedence over `number_of_sentences`.

  Returns:
    A tuple `(sentence_list, best_sentences, word_frequency, score_sentences)`:
    the sentences of `text` in order, the selected sentences ordered by
    descending score, the word counts scaled so the most frequent word is
    1.0, and a dict mapping each scored sentence to its score. Sentences
    without any counted word are absent from `score_sentences`.
  """
  import heapq

  original_text = text
  formatted_text = preprocess(original_text)

  word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
  # Empty or stop-word-only text yields no counts, and max() would raise on it.
  if word_frequency:
    highest_frequency = max(word_frequency.values())
    for word in list(word_frequency.keys()):
      word_frequency[word] = (word_frequency[word] / highest_frequency)
  sentence_list = nltk.sent_tokenize(original_text)

  score_sentences = {}
  for sentence in sentence_list:
    if sentence in score_sentences:
      # A repeated sentence keeps its score instead of accumulating it twice.
      continue
    # preprocess() lowercases the text before counting, so the tokens must be
    # lowercased here too; otherwise capitalized words never match a key and
    # contribute nothing to the score.
    known_words = [word for word in nltk.word_tokenize(sentence.lower())
                   if word in word_frequency]
    if known_words:
      score_sentences[sentence] = sum(word_frequency[word] for word in known_words)

  if percentage > 0:
    sentence_count = int(len(sentence_list) * percentage)
  else:
    sentence_count = number_of_sentences
  best_sentences = heapq.nlargest(sentence_count, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences

In [None]:
# Count the article's sentences directly: on a fresh top-to-bottom run
# `sentence_list` still holds the sentences of the toy text at this point,
# because it is only rebound to the article by the summarize() call below.
len(nltk.sent_tokenize(article.cleaned_text))

296

In [None]:
# 120 sentences expressed as a percentage of the article's sentence count.
# Tokenizes the article here rather than reading `sentence_list`, which is
# not yet rebound to the article on a fresh top-to-bottom run.
(120 / len(nltk.sent_tokenize(article.cleaned_text))) * 100

40.54054054054054

In [None]:
# NOTE: this rebinds sentence_list, best_sentences, word_frequency and
# score_sentences, replacing the toy-text results computed earlier.
sentence_list, best_sentences, word_frequency, score_sentences = summarize(article.cleaned_text, 100)

In [None]:
sentence_list

['Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.',
 'In addition to text, images and videos can also be summarized.',
 'Text summarization finds the most informative sentences in a document;[1] image summarization finds the most representative images within an image collection[citation needed]; video summarization extracts the most important frames from the video content.',
 '[2]\n\nThere are two general approaches to automatic summarization: extraction and abstraction.',
 'Here, content is extracted from the original data, but the extracted content is not modified in any way.',
 'Examples of extracted content include key-phrases that can be used to "tag" or index a text document, or key sentences (including headings) that collectively comprise an abstract, and representative images or video segments, as stated above.',
 'For text, 

In [None]:
best_sentences

['For example, in a text about machine learning, the unigram "learning" might co-occur with "machine", "supervised", "un-supervised", and "semi-supervised" in four different sentences.',
 'For example, if we rank unigrams and find that "advanced", "natural", "language", and "processing" all get high ranks, then we would look at the original text and see that these words appear consecutively and create a final keyphrase using all four together.',
 'Consider the example text from a news article:\n\nA keyphrase extractor might select "Army Corps of Engineers", "President Bush", "New Orleans", and "defective flood-control pumps" as keyphrases.',
 'The main difficulty in supervised extractive summarization is that the known summaries must be manually created by extracting sentences so the sentences in an original training document can be labeled as "in summary" or "not in summary".',
 'Similarly, if the text contains the phrase "supervised classification", then there would be an edge betwee

In [None]:
word_frequency

FreqDist({'automatic': 0.21428571428571427,
          'summarization': 1.0,
          'process': 0.08333333333333333,
          'shortening': 0.011904761904761904,
          'set': 0.2261904761904762,
          'data': 0.07142857142857142,
          'computationally': 0.023809523809523808,
          'create': 0.08333333333333333,
          'subset': 0.023809523809523808,
          'summary': 0.39285714285714285,
          'represents': 0.023809523809523808,
          'important': 0.14285714285714285,
          'relevant': 0.05952380952380952,
          'information': 0.20238095238095238,
          'within': 0.047619047619047616,
          'original': 0.13095238095238096,
          'content': 0.13095238095238096,
          'addition': 0.023809523809523808,
          'text': 0.5476190476190477,
          'images': 0.09523809523809523,
          'videos': 0.047619047619047616,
          'also': 0.27380952380952384,
          'summarized': 0.011904761904761904,
          'finds': 0.0238095

In [None]:
score_sentences

{'"Natural" and "processing" would also be linked because they would both appear in the same string of N words.': 3.7142857142857135,
 '"Paraphrasing" is even more difficult to apply to image and video, which is why most summarization systems are extractive.': 2.964285714285714,
 '"SemCluster: Unsupervised Automatic Keyphrase Extraction Using Affinity Propagation".': 1.4642857142857142,
 '"Summarizing Conceptual Graphs for Automatic Summarization Task".': 1.4642857142857142,
 '(An absorbing random walk is like a standard random walk, except some states are now absorbing states that act as "black holes" that cause the walk to end abruptly at that state.)': 2.273809523809523,
 ', Conceptual Structures for STEM Research and Education.': 0.13095238095238096,
 ', The GRASSHOPPER algorithm\n• None Miranda-Jiménez, Sabino, Gelbukh, Alexander, and Sidorov, Grigori (2013).': 0.45238095238095244,
 '222–235.': 0.14285714285714285,
 '245–253.': 0.14285714285714285,
 '650. pp.': 0.16666666666666669

In [None]:
def visualize(title, sentence_list, best_sentences):
  """Display the text as HTML with the summary sentences highlighted.

  Args:
    title: Heading text, shown as "Summary - <title>".
    sentence_list: All sentences, in display order.
    best_sentences: The sentences to wrap in <mark>.

  Emits two HTML outputs (heading, then body) and returns None.
  """
  from html import escape
  from IPython.display import HTML, display

  # The title and sentences may come from scraped web pages, so escape them
  # before interpolating them into markup.
  display(HTML(f'<h1>Summary - {escape(str(title))}</h1>'))

  highlighted = set(best_sentences)  # constant-time membership per sentence
  text = ''
  for sentence in sentence_list:
    safe_sentence = escape(str(sentence))
    if sentence in highlighted:
      text += ' ' + f"<mark>{safe_sentence}</mark>"
    else:
      text += ' ' + safe_sentence
  display(HTML(f""" {text} """))

In [None]:
visualize(article.title, sentence_list, best_sentences)

# Summarizing multiple texts

In [None]:
article_list = ['https://en.wikipedia.org/wiki/Automatic_summarization',
                'https://en.wikipedia.org/wiki/Natural_language_processing',
                'https://en.wikipedia.org/wiki/Lemmatisation']

In [None]:
# One Goose instance serves every URL instead of creating a new one per
# iteration; the context manager closes it once the loop is done.
with Goose() as g:
  for url in article_list:
    article = g.extract(url)
    # percentage=0.5 takes precedence over the positional sentence count.
    sentence_list, best_sentences, _, _ = summarize(article.cleaned_text, 100, percentage=0.5)
    visualize(article.title, sentence_list, best_sentences)