In [1]:
# text retrieval

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
pageUrl = "https://arstechnica.com/science/2017/12/there-is-now-a-climate-model-of-the-world-of-game-of-thrones/"

In [4]:
page = requests.get(pageUrl)

In [6]:
parsed_page = BeautifulSoup(page.text, 'lxml')

In [7]:
print(parsed_page)

<!DOCTYPE html>
<html lang="en-us">
<head>
<title>This Game of Thrones climate model is what’s been missing from your life | Ars Technica</title>
<script type="text/javascript">
    ars = {"ASSETS":"https:\/\/cdn.arstechnica.net\/wp-content\/themes\/ars\/assets","HOME_URL":"https:\/\/arstechnica.com","LOGIN_URL":"https:\/\/arstechnica.com\/services\/login-desktop.html?v=1","CIVIS":"\/civis","THEME":"light","VIEW":"grid","MOBILE":false,"PREMIER":false,"LOGGED":false,"ENV":"production","AD":{"kw":["section_science","discipline","earth-science-2","science","culture"],"zone":"culture","queue":[]},"TOTAL":85237,"UNREAD":0,"RECENT":[1235341,1234151,1236117,1237041,1236977,1236817,1236905,1236917,1236863,1213093,1236821,1236811,1236721,1236723,1236671,1236445,1233429,1236479,1236441,1236661,1236459,1236439,1236461,1236281,1236021],"LOGINS":true,"CROSS":false,"PARSELY":"arstechnica.com","COMMENTS":false,"HOMEPAGE":false,"SITE":1,"READY":[],"SHOW_ADS":true,"IMG_PROXY":"https:\/\/cdn.arstechnica

In [24]:
page_paragraphs = parsed_page.find('section', {'class': 'article-guts'}).findAll('p')
paragraph_text = [paragraph.text for paragraph in page_paragraphs]
article_text = '\n'.join(paragraph_text)

In [25]:
print(article_text)

A central conceit of George R.R. Martin’s A Song of Ice and Fire books (and the popular HBO series Game of Thrones based on them) is that the seasons of the planet where they take place are not as predictable as the Earth’s annual cycle. Somehow the phrase “winter is coming” wouldn’t seem as foreboding if you could reply, “Yes, that usually happens in December through February.”
But how could a planet have unruly seasons? Earth’s seasons are due to the tilt of its axis. During one part of Earth’s orbit, the Northern Hemisphere is tilted away from the Sun, with the resulting indirect sunlight spread thin over the surface of the hemisphere, causing winter. On the opposite side of its orbit, summer comes as this hemisphere is tilted toward direct sunlight. There isn’t much room in such clockwork for randomness.
Well, if you’ve ever wanted to debate fan theories, here’s an excellent new resource for you to draw from: a real climate model simulation of Westeros and Essos.
The study appears 

In [26]:
# text tokenization and parsing

In [27]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [62]:
tokenized_sentences = sent_tokenize(article_text)
tokenized_words = word_tokenize(article_text.lower())

filter_list = stopwords.words('english') + list(punctuation) + ['’', "“", "”"]
filtered_words = [word for word in tokenized_words if word not in filter_list]

In [63]:
print(len(tokenized_words), len(filtered_words))

1055 505


In [64]:
# extraction and summarization

In [104]:
from nltk.probability import FreqDist
from heapq import nlargest

In [105]:
word_frequency = FreqDist(filtered_words)
top_10_words = nlargest(10, word_frequency, key=word_frequency.get)

In [106]:
print(top_10_words)
print([word_frequency[word] for word in top_10_words])

['earth', 'planet', 'climate', 'author', 'martin', 'seasons', 'winter', 'one', 'could', 'hemisphere']
[8, 7, 6, 6, 5, 5, 5, 5, 4, 4]


In [107]:
def score_sentence(sentence):
    """
    Only score words that are present in the top 10 list
    """
    score_words = (word for word in word_tokenize(sentence.lower()) if word in top_10_words)
    return sum(word_frequency[word] for word in score_words)

In [110]:
scored_sentences = [(score_sentence(sentence), sentence) for sentence in tokenized_sentences]

In [111]:
print(scored_sentences)

[(0, 'A central conceit of George R.R.'), (25, 'Martin’s A Song of Ice and Fire books (and the popular HBO series Game of Thrones based on them) is that the seasons of the planet where they take place are not as predictable as the Earth’s annual cycle.'), (25, 'Somehow the phrase “winter is coming” wouldn’t seem as foreboding if you could reply, “Yes, that usually happens in December through February.”\nBut how could a planet have unruly seasons?'), (13, 'Earth’s seasons are due to the tilt of its axis.'), (26, 'During one part of Earth’s orbit, the Northern Hemisphere is tilted away from the Sun, with the resulting indirect sunlight spread thin over the surface of the hemisphere, causing winter.'), (4, 'On the opposite side of its orbit, summer comes as this hemisphere is tilted toward direct sunlight.'), (0, 'There isn’t much room in such clockwork for randomness.'), (6, 'Well, if you’ve ever wanted to debate fan theories, here’s an excellent new resource for you to draw from: a real

In [116]:
top_scores = set(sentence[0] for sentence in nlargest(7, scored_sentences))
article_summary = '\n'.join(sentence[1] for sentence in scored_sentences if sentence[0] in top_scores)

In [118]:
print(article_summary)

Martin’s A Song of Ice and Fire books (and the popular HBO series Game of Thrones based on them) is that the seasons of the planet where they take place are not as predictable as the Earth’s annual cycle.
Somehow the phrase “winter is coming” wouldn’t seem as foreboding if you could reply, “Yes, that usually happens in December through February.”
But how could a planet have unruly seasons?
During one part of Earth’s orbit, the Northern Hemisphere is tilted away from the Sun, with the resulting indirect sunlight spread thin over the surface of the hemisphere, causing winter.
And while fan explanations for Martin’s strange and long-lasting seasons have included wild variations of greenhouse gases to volcanic activity to ocean circulation patterns, the author finds these lacking, as there is at least one mention of shortened days during the long winter.
The author’s idea is this: if the planet’s axis wobbled in a circle once per orbit, the summer hemisphere could stay pointed at its equiv