In [1]:
import re
import urllib.request
from collections import Counter

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

### Importing and cleaning the article

In [2]:
# Download the raw HTML of the article. The context manager closes the HTTP
# response as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Global_warming') as response:
    source = response.read()

# Preview only the first bytes; echoing the whole page floods the notebook output.
source[:500]



In [3]:
# Parse the downloaded HTML with BeautifulSoup's built-in 'html.parser' backend.
soup = BeautifulSoup(source, 'html.parser')

# Print a bounded preview of the pretty-printed document instead of the whole page.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Global warming - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xl138wpAMNQAAH9LjfgAAACH","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Global_warming","wgTitle":"Global warming","wgCurRevisionId":943601042,"wgRevisionId":943601042,"wgArticleId":5042951,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: multiple names: authors list","Webarchive template w

In [4]:
# Collect the article body: concatenate the text of every <p> element in the
# parsed page, in document order.
# join() builds the string in one pass instead of growing it with repeated +=.
text = ''.join(paragraph.text for paragraph in soup.find_all('p'))

# Preview the beginning only; the full article is too long to echo.
text[:1000]



Wikipedia pages contain many reference markers (such as `[1]`), so we need to remove them from the text before cleaning it.

In [5]:
# Strip bracketed numeric reference markers, then collapse every run of
# whitespace into a single space.
reference_marker = re.compile(r'\[[0-9]*\]')
whitespace_run = re.compile(r'\s+')
text = whitespace_run.sub(' ', reference_marker.sub(' ', text))
text



In [6]:
def text_cleaning(text, stop_words=None):
    """Normalise raw text for word counting.

    The text is lower-cased, every digit and every non-word character is
    replaced by a space, and stop words are dropped.

    Parameters
    ----------
    text : str
        Text to clean.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop-word list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; a list is rescanned for every token.
    stop_words = set(stop_words)

    clean_text = text.lower()
    clean_text = re.sub(r'\d', ' ', clean_text)
    clean_text = re.sub(r'\W', ' ', clean_text)
    # split() followed by join() already collapses whitespace runs, so no extra
    # whitespace-normalising pass is needed afterwards.
    return ' '.join(word for word in clean_text.split() if word not in stop_words)

### Building the histogram
The histogram is calculated by dividing the number of occurrences of each word by the number of occurrences of the most frequent word in the document. It is assumed that the most frequently occurring words are an indication of the core subject of the text.

Note: Histogram is calculated on cleaned text

In [11]:
# Count the occurrences of each word in the cleaned document.
words = word_tokenize(text_cleaning(text))
word_counts = Counter(words)

# Show only the most frequent words; printing the whole Counter floods the output.
word_counts.most_common(20)



In [12]:
# Normalise each word count by the count of the most frequent word.
# The maximum is computed once instead of once per word; `default=1` keeps the
# cell from raising when no words were counted (the histogram is then empty).
max_count = max(word_counts.values(), default=1)
hist = {word: count / max_count for word, count in word_counts.items()}

hist

{'global': 0.7622950819672131,
 'warming': 0.8442622950819673,
 'long': 0.06557377049180328,
 'term': 0.11475409836065574,
 'rise': 0.12295081967213115,
 'average': 0.05737704918032787,
 'temperature': 0.2540983606557377,
 'earth': 0.16393442622950818,
 'climate': 1.0,
 'system': 0.12295081967213115,
 'major': 0.07377049180327869,
 'aspect': 0.01639344262295082,
 'change': 0.6229508196721312,
 'demonstrated': 0.00819672131147541,
 'direct': 0.03278688524590164,
 'measurements': 0.03278688524590164,
 'various': 0.040983606557377046,
 'effects': 0.20491803278688525,
 'often': 0.01639344262295082,
 'used': 0.08196721311475409,
 'interchangeably': 0.00819672131147541,
 'accurately': 0.00819672131147541,
 'mainly': 0.040983606557377046,
 'human': 0.13934426229508196,
 'caused': 0.08196721311475409,
 'increase': 0.1885245901639344,
 'surface': 0.20491803278688525,
 'temperatures': 0.1721311475409836,
 'projected': 0.01639344262295082,
 'continuation': 0.00819672131147541,
 'includes': 0.0163

### Calculating sentences scores
Sentences are scored using the word histogram calculated above: the score of a sentence is the sum of the histogram scores of the words it contains. The maximum sentence length is a parameter used to exclude long sentences, since summing word scores makes them more likely to get higher scores than shorter ones. On the other hand, this cut-off can itself be considered a bias against long sentences.

In [17]:
# Tokenise the original text into sentences (not the cleaned one, which has no
# punctuation left to split on).
sentences = sent_tokenize(text)
sent_scores = {}
max_length = 30

for sent in sentences:
    # Tokenise each sentence once; the tokens serve both the length check and
    # the scoring loop.
    tokens = word_tokenize(sent.lower())
    # Sentences with max_length tokens or more are not scored at all.
    if len(tokens) >= max_length:
        continue
    for word in tokens:
        if word in hist:
            # Only sentences containing at least one histogram word get an entry.
            sent_scores[sent] = sent_scores.get(sent, 0) + hist[word]

sent_scores

{" Global warming is the long-term rise in the average temperature of the Earth's climate system.": 3.3278688524590163,
 'It is a major aspect of climate change and has been demonstrated by direct temperature measurements and by measurements of various effects of the warming.': 3.1639344262295084,
 'Global warming and climate change are often used interchangeably.': 3.336065573770491,
 'While there have been prehistoric periods of global warming, observed changes since the mid-20th century have been unprecedented in rate and scale.': 2.3852459016393435,
 'The largest human influence has been the emission of greenhouse gases such as carbon dioxide, methane, and nitrous oxide.': 1.3770491803278686,
 'Fossil fuel burning is the dominant source of these gases, with agricultural emissions and deforestation also playing significant roles.': 1.3934426229508194,
 'These findings have been recognized by the national science academies of the major industrialized nations and are not disputed by a

In [18]:
# Rank the (sentence, score) pairs from highest to lowest score and keep the
# first five.
ranked_sentences = sorted(sent_scores.items(), key=lambda pair: pair[1], reverse=True)
imp_sent = ranked_sentences[:5]

### Finally, getting the summary

In [21]:
# Build the summary from the selected (sentence, score) pairs: each sentence is
# followed by a single space.
summary = ''.join(sentence + ' ' for sentence, _score in imp_sent)
summary

'People who regard climate change as catastrophic, irreversible, or rapid might label climate change as a climate crisis or a climate emergency. While aerosols typically limit global warming by reflecting sunlight, black carbon in soot that falls on snow or ice can contribute to global warming. Long-term effects of global warming: On the timescale of centuries to millennia, the magnitude of global warming will be determined primarily by anthropogenic CO2 emissions. Climate change can be mitigated through the reduction of greenhouse gas emissions or the enhancement of the capacity of carbon sinks to absorb greenhouse gases from the atmosphere. Additional disputes concern estimates of climate sensitivity, predictions of additional warming, what the consequences of global warming will be, and what to do about it. '