In [28]:
# Creating an Article Summarizer

import bs4 as bs
import urllib.request
import re
import nltk
nltk.download('stopwords')
import heapq

[nltk_data] Downloading package stopwords to C:\Users\Shashank
[nltk_data]     Prakash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Getting the data
source = urllib.request.urlopen("https://en.wikipedia.org/wiki/Global_warming").read()
source



In [3]:
soup = bs.BeautifulSoup(source,'lxml')
soup #just seems like a cleaner text vesion to me.
#Instead of source, here i could have used request.get("url").text

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Global warming - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"c1f2b73c-aad4-4851-b05c-add741870d78","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Global_warming","wgTitle":"Global warming","wgCurRevisionId":950495871,"wgRevisionId":950495871,"wgArticleId":5042951,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Harv and Sfn template errors","Articles with short description","Wikipedia indefinitely move-protected pages","Wikipedia indefinit

In [4]:
# Parsing the data

text = ""
for paragraph in soup.find_all('p'): #so, all text in p tag will be fetched and appended
    text += paragraph.text
text



In [7]:
# Preprocessing the text

text = re.sub(r'\[[0-9]*\]',' ',text)
text = re.sub(r'\s+',' ',text)
clean_text = text.lower()
clean_text = re.sub(r'\W',' ',clean_text)
clean_text = re.sub(r'\d',' ',clean_text)
clean_text = re.sub(r'\s+',' ',clean_text)

In [13]:
sentences = nltk.sent_tokenize(text)
stop_words = nltk.corpus.stopwords.words('english')
len(sentences)

381

In [18]:
# Building the histogram

word2count = {}
for word in nltk.word_tokenize(clean_text):
    if word not in stop_words:
        word2count[word] = word2count.get(word,0) + 1
word2count

{'global': 86,
 'warming': 102,
 'mainly': 4,
 'human': 18,
 'caused': 9,
 'rise': 19,
 'average': 7,
 'temperature': 31,
 'earth': 21,
 'climate': 119,
 'system': 14,
 'demonstrated': 2,
 'direct': 4,
 'measurements': 4,
 'various': 7,
 'effects': 25,
 'major': 9,
 'aspect': 2,
 'change': 76,
 'addition': 4,
 'rising': 10,
 'surface': 27,
 'temperatures': 19,
 'also': 23,
 'includes': 2,
 'changes': 22,
 'precipitation': 6,
 'prehistoric': 1,
 'periods': 1,
 'observed': 15,
 'since': 16,
 'mid': 7,
 'th': 11,
 'century': 21,
 'unprecedented': 4,
 'rate': 13,
 'scale': 7,
 'intergovernmental': 4,
 'panel': 4,
 'ipcc': 8,
 'concluded': 2,
 'influence': 4,
 'dominant': 5,
 'cause': 9,
 'findings': 1,
 'recognized': 1,
 'national': 7,
 'science': 2,
 'academies': 2,
 'nations': 8,
 'disputed': 2,
 'scientific': 14,
 'body': 2,
 'international': 4,
 'standing': 2,
 'largest': 2,
 'emission': 10,
 'greenhouse': 44,
 'gases': 28,
 'carbon': 39,
 'dioxide': 11,
 'methane': 8,
 'nitrous': 3,
 

In [19]:
# Weighted histogram

for key in word2count.keys():
    word2count[key] = word2count[key]/max(word2count.values())
word2count

{'global': 0.7226890756302521,
 'warming': 0.8571428571428571,
 'mainly': 0.03361344537815126,
 'human': 0.15126050420168066,
 'caused': 0.07563025210084033,
 'rise': 0.15966386554621848,
 'average': 0.058823529411764705,
 'temperature': 0.2605042016806723,
 'earth': 0.17647058823529413,
 'climate': 1.0,
 'system': 0.18421052631578946,
 'demonstrated': 0.02631578947368421,
 'direct': 0.05263157894736842,
 'measurements': 0.05263157894736842,
 'various': 0.09210526315789473,
 'effects': 0.32894736842105265,
 'major': 0.11842105263157894,
 'aspect': 0.02631578947368421,
 'change': 1.0,
 'addition': 0.08163265306122448,
 'rising': 0.20408163265306123,
 'surface': 0.5510204081632653,
 'temperatures': 0.3877551020408163,
 'also': 0.46938775510204084,
 'includes': 0.04081632653061224,
 'changes': 0.4489795918367347,
 'precipitation': 0.12244897959183673,
 'prehistoric': 0.02040816326530612,
 'periods': 0.02040816326530612,
 'observed': 0.30612244897959184,
 'since': 0.32653061224489793,
 'mi

In [26]:
# Calculating the sentence scores

sent2score = {}
for sentence in sentences:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word2count.keys():
            if len(sentence.split(' ')) < 30: #we don't want long sentences, just small important ones
                sent2score[sentence] = sent2score.get(sentence,word2count[word]) + word2count[word]
                #uncomment below line to see how much importance the words hold.
                #print("sentence:",sentence,"\nword:",word,"\nscore:",sent2score[sentence])
sent2score

{'It is a major aspect of climate change which, in addition to rising global surface temperatures, also includes its effects, such as changes in precipitation.': 5.620916787767738,
 'While there have been prehistoric periods of global warming, observed changes since the mid-20th century have been unprecedented in rate and scale.': 3.6410564225690276,
 'The Intergovernmental Panel on Climate Change (IPCC) concluded that, "human influence on climate has been the dominant cause of observed warming since the mid-20th century".': 5.885954381752701,
 'These findings have been recognized by the national science academies of major nations and are not disputed by any scientific body of national or international standing.': 1.2000537056928036,
 'The largest human influence has been the emission of greenhouse gases such as carbon dioxide, methane, and nitrous oxide.': 3.2941176470588234,
 'Fossil fuel burning is the principal source of these gases, with agricultural emissions and deforestation al

In [27]:
len(sent2score) #all original sentences scored with length less than 30 words

303

In [33]:
best_sentences = heapq.nlargest(25,sent2score,key=sent2score.get)
for s in best_sentences:
    print(s) #final summary

Various scientists, politicians and news media have adopted the terms climate crisis or a climate emergency to talk about climate change, while using global heating instead of global warming.
Climate change can be mitigated through the reduction of greenhouse gas emissions or the enhancement of the capacity of carbon sinks to absorb greenhouse gases from the atmosphere.
Global anthropogenic greenhouse gas emissions in 2018 excluding land use change were equivalent to 52 billion tonnes of carbon dioxide.
The major causes of current climate change are primarily greenhouse gases, and secondarily land use changes, and aerosols and soot.
Regions dependent on glacier water, regions that are already dry, and small islands are also at increased risk of water stress due to climate change.
Public attention increased over the summer, and global warming became the dominant popular term, commonly used both by the press and in public discourse.
While aerosols typically limit global warming by reflec