In [1]:
import re
import urllib.request
from collections import Counter

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

### Importing and cleaning the article

In [22]:
# Download the raw HTML of the article.
# A descriptive User-Agent is sent because Wikimedia's User-Agent policy allows
# requests carrying a generic library agent to be rejected, and the `with`
# block makes sure the HTTP response is closed once its body has been read.
request = urllib.request.Request(
    'https://en.wikipedia.org/wiki/Global_warming',
    headers={'User-Agent': 'text-summarisation-notebook/1.0 (educational use)'},
)
with urllib.request.urlopen(request) as response:
    source = response.read()

In [24]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend so
# the document can be searched by tag.
soup = BeautifulSoup(markup=source, features='html.parser')
#print(soup.prettify())

In [25]:
# Collect the article body: on Wikipedia the running text sits in <p> tags,
# so the text of every paragraph is concatenated in document order.
text = ''.join(paragraph.text for paragraph in soup.find_all('p'))

Wikipedia pages contain many bracketed reference markers (for example `[12]`), so we need to remove them from the text before cleaning it.

In [26]:
# Remove bracketed numeric reference markers such as "[12]" from the article,
# then squeeze every run of whitespace down to a single space.
citation_pattern = re.compile(r'\[[0-9]*\]')
whitespace_pattern = re.compile(r'\s+')
text = whitespace_pattern.sub(' ', citation_pattern.sub(' ', text))

In [27]:
def text_cleaning(text):
    """Normalise raw text for word counting.

    The text is lower-cased, every digit and every non-word character is
    replaced by a space, and English stop words are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining words joined by single spaces ('' when nothing is left).

    Note:
        Calls ``stopwords.words('english')`` from NLTK, so the NLTK stop-word
        corpus has to be available.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned linearly for every word of the article.
    stop_words = set(stopwords.words('english'))
    lowered = text.lower()
    # Digits first, then every remaining non-word character; both become
    # spaces so that neighbouring words are not glued together.
    words_only = re.sub(r'\W', ' ', re.sub(r'\d', ' ', lowered))
    # split()/join() already leaves exactly one space between words, so no
    # further whitespace normalisation is required.
    return ' '.join(word for word in words_only.split() if word not in stop_words)

### Building the histogram
The histogram is calculated by dividing the number of occurrences of each word by the number of occurrences of the most frequent word in the document. It is assumed that the most frequently occurring words indicate the core subject of the text.

Note: Histogram is calculated on cleaned text

In [29]:
# Tokenise the cleaned article and tally how often each word occurs.
cleaned_text = text_cleaning(text)
words = word_tokenize(cleaned_text)
word_counts = Counter(words)
#print(word_counts)

In [30]:
# Normalise each count by the count of the most frequent word, so the most
# frequent word gets a value of 1.0 and every other word a fraction of it.
# The maximum is computed once rather than on every iteration; `default=1`
# keeps an empty `word_counts` from raising and simply yields an empty histogram.
max_count = max(word_counts.values(), default=1)
hist = {word: count / max_count for word, count in word_counts.items()}
    
# print(hist)

### Calculating sentences scores
Sentences are scored using the word histogram calculated above: the score of a sentence is the sum of the histogram values of its words. Long sentences contain more words and are therefore likely to get higher scores than shorter ones, so a maximum sentence length is used as a cutoff and sentences that reach it are not scored. On the other hand, this cutoff can itself be considered a bias, this time against long sentences.

In [31]:
# Split the original text (not the cleaned one) into sentences, so the
# sentences that end up in the summary keep their punctuation and casing.
sentences = sent_tokenize(text)
sent_scores = {}
max_length = 30  # sentences with this many tokens or more are not scored

for sent in sentences:
    # Tokenise once per sentence: the token list is the same for every word
    # of the sentence, so there is no need to rebuild it inside the inner loop.
    tokens = word_tokenize(sent.lower())
    if len(tokens) >= max_length:
        continue
    for token in tokens:
        if token in hist:
            # A sentence only gets an entry once one of its words is found in
            # the histogram; its score is the sum of those words' values.
            sent_scores[sent] = sent_scores.get(sent, 0) + hist[token]
            
#sent_scores

In [32]:
# Rank the scored sentences and keep the five highest-scoring ones as
# (sentence, score) pairs, best first.
ranked_sentences = Counter(sent_scores)
imp_sent = ranked_sentences.most_common(5)

### Finally, getting the summary

In [33]:
# Build the summary from the selected sentences (the first item of each
# pair), each one followed by a single space.
summary = ''.join(sentence + ' ' for sentence, _score in imp_sent)
summary

'People who regard climate change as catastrophic, irreversible, or rapid might label climate change as a climate crisis or a climate emergency. While aerosols typically limit global warming by reflecting sunlight, black carbon in soot that falls on snow or ice can contribute to global warming. Long-term effects of global warming: On the timescale of centuries to millennia, the magnitude of global warming will be determined primarily by anthropogenic CO2 emissions. Climate change can be mitigated through the reduction of greenhouse gas emissions or the enhancement of the capacity of carbon sinks to absorb greenhouse gases from the atmosphere. Additional disputes concern estimates of climate sensitivity, predictions of additional warming, what the consequences of global warming will be, and what to do about it. '