Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
111 lines (83 sloc) 2.88 KB
import re
import requests
from bs4 import BeautifulSoup
from models import Submission
def parse_out_title(text):
    """
    Split a Hacker News front-page title into its title and its domain.

    In comes a title directly from the hacker news html page. This function
    removes the trailing domain in parentheses and returns both the bare
    title and the domain.

    in  -> "title (example.com)"
    out -> ("title", "example.com")

    If ``text`` does not end with a parenthesised domain, the stripped text
    is returned together with an empty domain string.
    """
    # Only a "(domain)" group that is the very last thing in the text counts,
    # so parentheses that belong to the title itself are left alone.
    # Hyphens are accepted because they are legal in host names.
    match = re.search(r'\(([\w.-]+)\)$', text)
    if match is None:
        return text.strip(), ""
    just_domain = match.group(1)
    # Everything before the opening parenthesis is the title.
    title = text[:match.start()].strip()
    return title, just_domain
def parse_front_page(html):
    """
    Parse the Hacker News front page into a list of submission dicts.

    In comes the HTML of the hacker news front page, out is the list of
    submissions in a nice list of dicts. Each dict has the keys
    ``current_rank``, ``title``, ``url``, ``domain``, ``comments``,
    ``submitter``, ``points`` and ``hn_id``.

    Titles, score/submitter pairs and comment links are collected in three
    separate passes over the page and then matched up purely by position;
    the result is truncated to the shortest of the collected lists.

    NOTE(review): several statements of this function were missing (the list
    appends, two ``else`` branches and the tail of the dict literal) and have
    been reconstructed from the surrounding code. In particular, taking the
    story URL from the first link inside the title cell is an assumption --
    confirm against the real page markup.
    """
    soup = BeautifulSoup(html)

    titles = []
    urls = []
    domains = []
    for cell in soup.find_all('td', class_='title'):
        cell_text = cell.text
        # ignore when there is a period present (that is the rank, not
        # relevant here) and ignore when no parenthesis (the "More" link)
        if cell_text.endswith('.') or "(" not in cell_text:
            continue
        just_title, domain = parse_out_title(cell_text.strip())
        story_link = cell.a
        titles.append(just_title)
        domains.append(domain)
        # NOTE(review): assumes the first <a> in the title cell is the story
        # link -- TODO confirm.
        urls.append(story_link['href'] if story_link is not None else '')

    data = []
    for cell in soup.find_all('td', class_='subtext'):
        # The score span is expected to read "<n> points"; taking the first
        # word (instead of chopping a fixed 7 characters) also keeps a
        # singular "1 point" score from collapsing to an empty string.
        score_words = cell.span.text.split() if cell.span else []
        points = score_words[0] if score_words else ''
        links = cell('a')  # calling a tag is shorthand for find_all
        submitter = links[0].text if links else ''
        if submitter and points:
            data.append([points, submitter])

    ids = []
    comment_counts = []
    comments_link = re.compile(r"item\?id=(\d){7,8}")
    for link in soup.find_all(href=comments_link):
        # go through each link that looks like a comments link.
        label = link.text
        if label == "discuss":
            count = 0
        elif 'comment' not in label:
            # some other link to the item page, not the comments link
            continue
        elif label == '1 comment':
            count = 1
        else:
            count = int(label[:-9])  # ignore trailing " comments"
        ids.append(int(link['href'][8:]))  # skip the leading "item?id="
        comment_counts.append(count)

    # ``unicode`` on Python 2, ``str`` on Python 3.
    text_type = type(u"")

    submissions = []
    rows = zip(titles, urls, domains, comment_counts, data, ids)
    for rank, row in enumerate(rows, 1):
        title, url, domain, count, (points, submitter), hn_id = row
        submissions.append({
            'current_rank': rank,
            'title': title.encode('ascii', 'xmlcharrefreplace'),
            'url': text_type(url),
            'domain': domain,
            'comments': count,
            "submitter": text_type(submitter),
            "points": text_type(points),
            'hn_id': hn_id,
        })
    return submissions
def get_submissions():
    """
    Crawls the front page of hacker news and returns a list of all
    submissions found on it, in the format produced by parse_front_page.
    """
    # NOTE(review): the URL literal was empty, which makes requests.get()
    # fail before any request is sent; the Hacker News front page address is
    # assumed from the docstring -- confirm.
    # The timeout keeps the crawler from hanging forever on a stalled
    # connection.
    front = requests.get("https://news.ycombinator.com/", timeout=30).text
    # For offline debugging, read a saved copy instead:
    #     front = open("html/front.html").read()
    return parse_front_page(front)
def crawl():
Fetch the lastest front page from HN, and then update the tables