-
Notifications
You must be signed in to change notification settings - Fork 0
/
html_generator.py
196 lines (176 loc) · 6.76 KB
/
html_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python
# coding=utf-8
# Local imports
from config import DBPATH, LINKEXPIRE, LINKFRESH, OUTPUTDIR, \
ARTICLES_FRONT, SOURCE_URLS, DISQUS, PENALIZE_N, \
MAINURL
# Library imports
from jinja2 import Environment, PackageLoader
import os
import sqlite3
import time
import dateutil.parser
import datetime
import calendar
from shutil import copyfile
from urlparse import urlparse
from math import log
import re
__author__ = "José María Mateos"
__license__ = "GPL"
def get_articles():
    """
    Returns the articles currently in the database, with the stored
    article date replaced by the age of the article.

    Every row of the `current` table is read, its datetime column is
    parsed with dateutil and then overwritten with the age of the
    article in seconds (current time minus article time), which is
    what the aging algorithm needs.

    Returns a list of [id, url, title, score, age] lists.
    """
    conn = sqlite3.connect(DBPATH)
    try:
        query = conn.execute("""SELECT id, url, title, score, datetime
                                FROM current""")
        # Rows come back as tuples; copy them into lists so that the
        # date can be overwritten in place.
        rows = [list(row) for row in query]
    finally:
        # Release the database handle even if the query fails.
        conn.close()
    # Seconds since the epoch. time.time() is used on purpose: pushing
    # a naive local datetime through utctimetuple() and time.mktime()
    # forces tm_isdst to 0, which skews the result by one hour while
    # daylight saving time is in effect.
    now = time.time()
    for row in rows:
        parsed = dateutil.parser.parse(row[4])
        # utctimetuple() honours an explicit UTC offset in the stored
        # string; values without an offset are taken as UTC.
        row[4] = now - calendar.timegm(parsed.utctimetuple())
    return rows
def get_top_level(url):
    """
    Returns the last two labels of the host name of this URL, which
    this script uses as the identifier of a site.
    For instance, well.blogs.nytimes.com -> nytimes.com.

    The port, any user:password@ prefix and the letter case of the
    host are ignored, so that http://WWW.Example.com:8080/ yields
    example.com. Hosts with fewer than two dots are returned whole,
    and a string without a network location yields "".

    NOTE: there is no public-suffix handling, so a host such as
    news.bbc.co.uk yields co.uk.
    """
    parsed = urlparse(url)
    # hostname is the lower-cased host without port or credentials;
    # fall back to the raw netloc when no host could be extracted.
    host = parsed.hostname or parsed.netloc
    return ".".join(host.split(".")[-2:])
def get_normalization_factor(articles, penalize_length=True):
    """
    Returns a dictionary mapping every site to its normalization
    factor.

    articles is a list of rows in which element 1 is the article URL
    and element 3 its score. Articles are grouped by the value of
    get_top_level() for their URL. For each site the factor is the
    mean score divided by log(maximum score + 1).

    If penalize_length is true (the default), the factor is also
    multiplied by log(number_articles + 1). In effect, that causes
    that sites that publish more quantity need also more quality to
    appear at the top of the ranking.

    A site whose highest score is not positive gets the neutral
    factor 1.0, because log(maximum score + 1) would be zero or
    undefined for it.
    """
    scores_by_site = {}
    for article in articles:
        site = get_top_level(article[1])
        scores_by_site.setdefault(site, []).append(article[3])
    factors = {}
    for site, scores in scores_by_site.items():
        top = max(scores)
        if top <= 0:
            # Avoid dividing by log(1) == 0 or taking the log of a
            # non-positive number.
            factors[site] = 1.0
            continue
        # Mean score, damped by the log of the best score of the site.
        factor = sum(scores) / float(len(scores)) / log(top + 1)
        if penalize_length:
            factor *= log(len(scores) + 1)
        factors[site] = factor
    return factors
def get_name(names, url):
    """
    Looks up the display name of the source an URL belongs to.

    The key used for the lookup is get_top_level(url). When names has
    no entry for that key, the key itself is returned, so the HTML
    page can still be generated after SOURCE_URLS has changed in a
    way that old articles in the database no longer match any of the
    current source names.
    """
    site = get_top_level(url)
    # Fall back to the site key when it has no configured name.
    return names.get(site, site)
def get_age_modifier(age):
    """
    Computes the multiplier applied to a score according to the age
    of the link.

    Links younger than LINKFRESH keep their full score (1.0). From
    then on the multiplier is
    (LINKEXPIRE - age + LINKFRESH) / LINKEXPIRE, which decreases
    linearly as the age grows.
    """
    if age < LINKFRESH:
        return 1.0
    remaining = LINKEXPIRE - age + LINKFRESH
    # float() avoids integer division when the settings are integers.
    return remaining / float(LINKEXPIRE)
def get_pagefile_from_title(title):
    """
    Builds the path of the single-article page (used by the
    commenting system) from the article title, in the form
    /pages/words-from-the-article-title.html.

    The title is lower-cased, every character other than a-z, 0-9
    and the space is dropped, and each remaining space becomes a
    hyphen.
    """
    slug = re.sub(r"[^a-z0-9 ]", "", title.lower()).replace(" ", "-")
    return "/pages/%s.html" % slug
def _write_utf8(path, text):
    """
    Writes text to path encoded as UTF-8, closing the file even if
    the write fails.
    """
    # Binary mode: the bytes produced by encode() are written as-is.
    with open(path, "wb") as f:
        f.write(text.encode("utf-8"))


def main():
    """
    Generates the static site: index.html with the top-ranked
    articles, a copy of the style sheet, and one page per listed
    article under the pages/ subdirectory of OUTPUTDIR.
    """
    # Create the output dir if it does not exist.
    if not os.path.isdir(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)
    # Create the output dir for the article pages, as well
    artdir = OUTPUTDIR + "/pages/"
    if not os.path.isdir(artdir):
        os.mkdir(artdir)

    # Get the template for index.html
    env = Environment(loader=PackageLoader('html_generator', 'templates'))
    tindex = env.get_template('index.html')

    # Create a lookup table for the titles of the sources.
    names = dict((get_top_level(x[0]), x[2]) for x in SOURCE_URLS)

    # Obtain the articles from the database, then age, normalize and
    # sort them so we have a more or less coherent rank.
    articles = get_articles()
    normfactors = get_normalization_factor(articles, PENALIZE_N)
    for article in articles:
        agefactor = get_age_modifier(article[4])
        domain = get_top_level(article[1])
        article[3] = article[3] * agefactor / normfactors[domain]
    articles.sort(key=lambda x: x[3], reverse=True)
    articles = articles[:ARTICLES_FRONT]

    # Normalize the score for presentation, as a percentage of the
    # best score. An empty article list or a top score of zero must
    # not abort the generation, so those cases are shown as 0.0 %.
    max_score = max([x[3] for x in articles]) if articles else 0
    for article in articles:
        percent = article[3] / max_score * 100 if max_score else 0.0
        article[3] = "%.1f %%" % percent

    links = [{"url": x[1], "title": x[2],
              "sourcename": get_name(names, x[1]),
              "score": x[3], "qtitle": x[2].replace(' ', '+'),
              # [1:] drops the leading slash of the page path.
              "singlepage": get_pagefile_from_title(x[2])[1:]}
             for x in articles]
    template_values = {"links": links, "DISQUS": DISQUS,
                       "MAINURL": MAINURL}

    # Save to output dir
    _write_utf8(OUTPUTDIR + "/index.html", tindex.render(template_values))

    # Copy the CSS file
    copyfile("styles/style.css", OUTPUTDIR + "/style.css")

    # Create individual pages.
    tpage = env.get_template('single_page.html')
    for entry in articles:
        single_template_values = {"url": entry[1],
                                  "title": entry[2],
                                  "sourcename": get_name(names, entry[1]),
                                  "score": entry[3],
                                  "qtitle": entry[2].replace(' ', '+'),
                                  "DISQUS": DISQUS,
                                  "DISQUSID": str(DISQUS) + str(entry[0])}
        # get_pagefile_from_title() returns a path starting with
        # "/pages/", so plain concatenation yields the final location.
        _write_utf8(OUTPUTDIR + get_pagefile_from_title(entry[2]),
                    tpage.render(single_template_values))


if __name__ == "__main__":
    main()