## Text Summarization using NLP

In [1]:
from urllib2 import urlopen
import bs4 as bs
import re
import nltk
import heapq

### Gettings the data source

In [2]:
source = urlopen('https://en.wikipedia.org/wiki/Global_warming').read()

In [3]:
# Parsing the data/ creating BeautifulSoup object
soup = bs.BeautifulSoup(source,'lxml')

In [4]:
# Fetching the data
text = ""
for paragraph in soup.find_all('p'):
    text += paragraph.text

In [5]:
text



### Preprocessing the data

In [6]:
# Preprocessing the data
text = re.sub(r'\[[0-9]*\]',' ',text) # remove reference numbers
text = re.sub(r'\s+',' ',text)        # remove spaces
clean_text = text.lower()
clean_text = re.sub(r'\W',' ',clean_text) # remove one letter characters
clean_text = re.sub(r'\d',' ',clean_text) # remove digits
clean_text = re.sub(r'\s+',' ',clean_text)# remove extra spaces
clean_text



### Tokenize sentences

In [7]:
sentences = nltk.sent_tokenize(text)

In [8]:
len(sentences)

264

In [9]:
stop_words = nltk.corpus.stopwords.words('english')

### Word Counts

In [10]:
word2count = {}
for word in nltk.word_tokenize(clean_text):
    if word not in stop_words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [11]:
word2count

{u'limited': 3,
 u'asian': 1,
 u'whose': 2,
 u'paris': 1,
 u'risk': 3,
 u'regional': 2,
 u'updates': 1,
 u'summarized': 2,
 u'affect': 3,
 u'bringing': 1,
 u'crops': 1,
 u'companies': 3,
 u'humidity': 2,
 u'unrelated': 1,
 u'intensification': 1,
 u'enhance': 1,
 u'methane': 5,
 u'leaders': 1,
 u'disciplines': 2,
 u'consistent': 3,
 u'estimates': 6,
 u'direct': 3,
 u'likely': 10,
 u'estimated': 1,
 u'even': 6,
 u'established': 1,
 u'deliberate': 1,
 u'selected': 1,
 u'contributed': 3,
 u'asia': 1,
 u'glaciers': 2,
 u'resilient': 1,
 u'new': 2,
 u'net': 2,
 u'increasing': 9,
 u'consisting': 1,
 u'sinks': 2,
 u'never': 1,
 u'hundreds': 1,
 u'reported': 4,
 u'china': 1,
 u'stresses': 3,
 u'study': 12,
 u'reports': 2,
 u'controversy': 1,
 u'physiological': 1,
 u'adopting': 1,
 u'fourier': 1,
 u'changes': 27,
 u'svante': 1,
 u'highly': 1,
 u'total': 1,
 u'would': 16,
 u'negative': 3,
 u'therefore': 1,
 u'assessment': 3,
 u'populations': 3,
 u'ascribe': 1,
 u'disruption': 1,
 u'warm': 5,
 u'w

In [12]:
max(word2count.values())

93

In [13]:
# Converting counts to weights
for key in word2count.keys():
    word2count[key] = word2count[key]/float(max(word2count.values()))
word2count

{u'limited': 0.03225806451612903,
 u'asian': 0.010752688172043012,
 u'whose': 0.021505376344086023,
 u'paris': 0.010752688172043012,
 u'risk': 0.03225806451612903,
 u'regional': 0.021505376344086023,
 u'updates': 0.010752688172043012,
 u'summarized': 0.021505376344086023,
 u'affect': 0.03225806451612903,
 u'bringing': 0.010752688172043012,
 u'crops': 0.010752688172043012,
 u'companies': 0.03225806451612903,
 u'humidity': 0.021505376344086023,
 u'unrelated': 0.010752688172043012,
 u'intensification': 0.010752688172043012,
 u'enhance': 0.010752688172043012,
 u'methane': 0.053763440860215055,
 u'leaders': 0.010752688172043012,
 u'disciplines': 0.021505376344086023,
 u'consistent': 0.03225806451612903,
 u'estimates': 0.06451612903225806,
 u'direct': 0.03225806451612903,
 u'likely': 0.10752688172043011,
 u'estimated': 0.010752688172043012,
 u'even': 0.06451612903225806,
 u'established': 0.010752688172043012,
 u'deliberate': 0.010752688172043012,
 u'selected': 0.010752688172043012,
 u'contri

### Product sentence scores    

In [14]:
sent2score = {}
for sentence in sentences:
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word2count.keys():
            if len(sentence.split(' ')) < 30:
                if sentence not in sent2score.keys():
                    sent2score[sentence] = word2count[word]
                else:
                    sent2score[sentence] += word2count[word]

In [15]:
sent2score

{u" Global warming, also referred to as climate change, is the observed century-scale rise in the average temperature of the Earth's climate system and its related effects.": 6.24009799918334,
 u'2015 updates to account for differing methods of ocean surface temperature measurements show a positive trend over the recent decade.': 1.3227167551381516,
 u':289 Emissions scenarios, estimates of changes in future emission levels of greenhouse gases, have been projected that depend upon uncertain economic, sociological, technological, and natural developments.': 2.645977950183748,
 u':290 This mandate was sustained in the Kyoto Protocol to the Framework Convention, :290 which entered into legal effect in 2005.': 0.45569620253164556,
 u':5 At the 15th UNFCCC Conference of the Parties, held in 2009 at Copenhagen, several UNFCCC Parties produced the Copenhagen Accord.': 0.5214373213556555,
 u':71 Emissions can be attributed to different regions.': 0.7468354430379747,
 u'A 2015 report by Citiban

In [16]:
len(sent2score)

183

In [17]:
max(sent2score),max(sent2score.values())

(u'by NASA and the Royal Society.', 6.24009799918334)

In [18]:
# Gettings best 5 lines             
best_sentences = heapq.nlargest(5, sent2score, key=sent2score.get)

In [19]:
print("Text Summary:\n")

for sentence in best_sentences:
    print(sentence)

Text Summary:

 Global warming, also referred to as climate change, is the observed century-scale rise in the average temperature of the Earth's climate system and its related effects.
Possible societal responses to global warming include mitigation by emissions reduction, adaptation to its effects, building systems resilient to its effects, and possible future climate engineering.
By 2050, between 350 million and 600 million people are projected to experience increased water stress due to climate change (see Climate change in Africa).
Additional disputes concern estimates of climate sensitivity, predictions of additional warming, and what the consequences of global warming will be.
Mitigation of climate change are actions to reduce greenhouse gas emissions, or enhance the capacity of carbon sinks to absorb greenhouse gases from the atmosphere.
