Extraction Text Summarizer
  Samuel Hu

Resource: https://www.activestate.com/blog/how-to-do-text-summarization-with-python/


In [209]:
from bs4 import BeautifulSoup
from requests import get

In [210]:
def get_only_text(url):
    page = get(url)
    soup = BeautifulSoup(page.content, "lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    title = ' '.join(soup.title.stripped_strings)
    return title, text

In [211]:
texts = get_only_text('https://www.theverge.com/2022/8/19/23313155/samsung-galaxy-s23-ultra-rumor-camera-200mp-sensor')


In [212]:
len(texts[1])

2126

In [213]:
texts[1]

'Filed under: Get ready for cameras with lots and lots of pixels from Samsung, Motorola, and even Apple According to a new report from Korean outlet ETNews spotted by Android Authority, it’s looking very likely that the Samsung Galaxy S23 Ultra will use the 200-megapixel camera sensor that the company launched last year. That is a whole lot of pixels, and Samsung isn’t alone in this newest megapixel arms race — Motorola beat the company to the punch with the Motorola X30 Pro. Even Apple, a 12-megapixel camera devotee, looks like it will finally move to higher resolution 48-megapixel camera sensors with the iPhone 14.  It’s not all about big numbers; moving to higher-pixel-count sensors has real image quality benefits. In this chapter of the megapixel race, it’s all about pixel binning. Samsung already employs this with its 108-megapixel sensor, and taking a super high-res photo isn’t the point — rather, combining individual pixels into four-by-four or two-by-two configurations is.  Thi

In [214]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [215]:
def summarize(text, per):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    tokens = [token.text for token in doc]
    word_frequencies={}
    
    # Create a dictionary that only extracted the key words from the text, removing the stop words
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if (word.text.lower() not in punctuation) and (word.text.lower() not in ['\n']):
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1

    #Finds the word with the max frequency
    max_frequency = max(word_frequencies.values())
    
    #Percentage of the word appearing in the article
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency

    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}

    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]

    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary

In [216]:
summarize(texts[1], 0.25)

'The 0.64μm pixels on Samsung’s 200-megapixel sensor are relatively small, considering that the pixels on Apple’s newest 12-megapixel sensor measure 1.9μm.Even Apple, a 12-megapixel camera devotee, looks like it will finally move to higher resolution 48-megapixel camera sensors with the iPhone 14.  Filed under: Get ready for cameras with lots and lots of pixels from Samsung, Motorola, and even Apple According to a new report from Korean outlet ETNews spotted by Android Authority, it’s looking very likely that the Samsung Galaxy S23 Ultra will use the 200-megapixel camera sensor that the company launched last year.Samsung already employs this with its 108-megapixel sensor, and taking a super high-res photo isn’t the point — rather, combining individual pixels into four-by-four or two-by-two configurations is.  '

nlp = spacy.load("en_core_web_sm")

extra_words=list(STOP_WORDS)+list(punctuation)+['\n']
#doc = """Your Text Content Here"""
doc = texts[1]
docx = nlp(doc)

all_words=[word.text for word in docx]
Freq_word ={}
for w in all_words:
      w1 = w.lower()
      if w1 not in extra_words and w1.isalpha():
        if w1 in Freq_word.keys():
              Freq_word[w1]+=1
        else:
              Freq_word[w1]=1

val=sorted(Freq_word.values())
max_freq=val[-3:]
print("Topic of document given :-")

for word,freq in Freq_word.items():
      if freq in max_freq:
          print(word ,end=" ")
      else:
          continue

for word in Freq_word.keys():
       Freq_word[word] = (Freq_word[word]/max_freq[-1])

sent_strength={}
for sent in docx.sents:
      for word in sent :
            if word.text.lower() in Freq_word.keys():
                if sent in sent_strength.keys():
                     sent_strength[sent]+=Freq_word[word.text.lower()]
                else:
                     sent_strength[sent]=Freq_word[word.text.lower()]
            else: 
                continue

top_sentences=(sorted(sent_strength.values())[::-1])
top30percent_sentence=int(0.3*len(top_sentences))
top_sent=top_sentences[:top30percent_sentence]

summary=[]
for sent,strength in sent_strength.items():
       if strength in top_sent:
          summary.append(sent)
       else:
          continue

for i in summary:
    print(i,end="")
