import re
import requests
from bs4 import BeautifulSoup
from models import Submission
def parse_out_title(text):
    """
    Split a Hacker News headline into its title and its trailing domain.

    In comes a title directly from the hacker news html page.
    This function removes the domain in parenthesis, and returns
    both the title (without the domain) and the bare domain.

    in -> "title (example.com)"
    out-> ("title", "example.com")

    When the text does not end with a parenthesised domain, the stripped
    text is returned unchanged together with an empty domain string.
    """
    # The domain is the last parenthesised group, anchored to the end of the
    # headline so that parentheses inside the title itself are left alone.
    match = re.search(r'\(([\w.-]+)\)$', text)
    if match is None:
        return text.strip(), ""

    just_domain = match.group(1)
    # Everything before the opening parenthesis is the title proper.
    title = text[:match.start()].strip()
    return title, just_domain
def _parse_comment_count(label):
    """Return the comment count a link label encodes, or None if it is not a comments link."""
    label = label.strip()
    if label == "discuss":
        # A story nobody has commented on yet.
        return 0
    parts = label.split()
    if len(parts) == 2 and parts[1] in ("comment", "comments") and parts[0].isdigit():
        return int(parts[0])
    return None


def parse_front_page(html):
    """
    In comes the HTML of the hacker news front page,
    out is the list of submissions in a nice list of dicts.

    Each dict has the keys 'current_rank', 'title', 'url', 'domain',
    'comments', 'submitter', 'points' and 'hn_id'.

    The page is scanned in three independent passes (title cells, subtext
    cells, comment links) and the results are paired up by position.
    NOTE(review): this assumes every story contributes exactly one entry to
    each pass; rows lacking a subtext or comments link would shift the
    pairing -- confirm against the live markup.
    """
    soup = BeautifulSoup(html)
    # `unicode` on Python 2, `str` on Python 3.
    text_type = type(u"")

    titles = []
    urls = []
    domains = []
    for cell in soup.find_all('td', class_='title'):
        cell_text = cell.text
        # ignore when the text ends with a period (that is the rank, not
        # relevant here) and ignore when no parenthesis (the "More" link)
        if cell_text.endswith('.') or "(" not in cell_text:
            continue
        just_title, domain = parse_out_title(cell_text.strip())
        # presumably the first anchor in the title cell is the story link
        link = cell.find('a')
        titles.append(just_title)
        domains.append(domain)
        urls.append(link.get('href', '') if link is not None else '')

    data = []
    for cell in soup.find_all('td', class_='subtext'):
        anchors = cell('a')
        if cell.span is None or not anchors:
            continue
        # Take the leading number rather than slicing off " points", so that
        # the singular "1 point" is handled as well.
        score_words = cell.span.text.split()
        points = score_words[0] if score_words else ''
        submitter = anchors[0].text
        if submitter and points:
            data.append((points, submitter))

    ids = []
    comment_counts = []
    for link in soup.find_all(href=re.compile(r"item\?id=(\d){7,8}")):
        # go through each link that looks like a comments link.
        count = _parse_comment_count(link.text)
        if count is None:
            continue
        id_match = re.search(r"item\?id=(\d+)", link['href'])
        comment_counts.append(count)
        ids.append(int(id_match.group(1)))

    submissions = []
    # zip stops at the shortest list, so a partially parsed page cannot
    # raise IndexError here.
    rows = zip(titles, urls, domains, comment_counts, data, ids)
    for rank, row in enumerate(rows, 1):
        title, url, domain, comments, (points, submitter), hn_id = row
        submissions.append({
            'current_rank': rank,
            'title': title.encode('ascii', 'xmlcharrefreplace'),
            'url': text_type(url),
            'domain': domain,
            'comments': comments,
            "submitter": text_type(submitter),
            "points": text_type(points),
            'hn_id': hn_id,
        })
    return submissions
def get_submissions():
    """
    Crawls the front page of hacker news and returns a list of all
    submissions found there, in the form produced by parse_front_page.

    Raises whatever `requests` raises on a connection failure or timeout.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    front = requests.get("https://news.ycombinator.com/", timeout=30).text
    return parse_front_page(front)
def crawl():
Fetch the lastest front page from HN, and then update the tables