This repository has been archived by the owner on Feb 25, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
extract-albums.py
executable file
·65 lines (53 loc) · 1.84 KB
/
extract-albums.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
from bs4 import BeautifulSoup
import sys
import os
from datetime import datetime
import unicodedata
import re
from review import Review
from pymongo import Connection
def arff_escape(val):
    """Return *val* as ASCII-only text safe to embed in a quoted ARFF value.

    val -- a text (unicode) string.

    The string is NFKD-normalized so accented characters decompose into a
    base letter plus combining marks; every character that is still not
    ASCII is then dropped.  Single quotes are backslash-escaped and
    newlines are replaced with spaces.
    """
    # Decompose first so that e.g. U+00E9 keeps its base letter "e" when the
    # non-ASCII combining mark is discarded by the 'ignore' error handler.
    ascii_bytes = unicodedata.normalize('NFKD', val).encode('ascii', 'ignore')
    # Decode back to text: on Python 3 the encode() result is bytes, and
    # calling bytes.replace() with str arguments raises TypeError.
    text = ascii_bytes.decode('ascii')
    return text.replace("'", "\\'").replace('\n', ' ')
# --- Command-line handling and database setup -------------------------------
# Two positional arguments are required: the first and last page number.
_USAGE = "Usage: %s <start_page> <end_page>\n" % sys.argv[0]

if len(sys.argv) < 3:
    # A bare "Usage:" string literal is only an expression statement and
    # prints nothing, so write an actual usage line before exiting.
    sys.stderr.write(_USAGE)
    sys.exit(1)

try:
    # `end` is exclusive: the page loop iterates over range(start, end).
    start = int(sys.argv[1])
    end = int(sys.argv[2])
except ValueError:
    # Non-integer bounds get the usage message instead of a raw traceback.
    sys.stderr.write(_USAGE)
    sys.exit(1)

# Disabled ARFF export, kept for reference:
#out = open('data/%d-%d-albums.arff' % (start, end) , 'w')
#out.write(Review().arff_header())

# Connection() is called without arguments, i.e. with the driver's defaults.
# NOTE(review): pymongo.Connection was removed in PyMongo 3.0 in favour of
# MongoClient -- confirm which driver version this script is pinned to.
connection = Connection()
db = connection.pitchfork

# Running total of reviews successfully inserted.
count = 0
# --- Main extraction loop ---------------------------------------------------
# For every page directory data/<n>/ with n in [start, end), parse each saved
# review file and insert the extracted fields into the `reviews` collection.
# Single-argument print(...) is used throughout because it behaves the same
# as a Python 2 print statement and as the Python 3 print function.
for cur in range(start, end):
    print('page: ' + str(cur))
    data_dir = 'data/' + str(cur) + '/'
    # Ignore entries whose name contains 'index.html' or '.arff' and the
    # entry named "urls"; everything else is treated as a review file.
    paths = [relpath for relpath in os.listdir(data_dir)
             if 'index.html' not in relpath
             and relpath != "urls"
             and '.arff' not in relpath]
    for relpath in paths:
        path = data_dir + relpath
        print(path)
        # Close the file as soon as it has been parsed; passing a bare
        # open(path) to BeautifulSoup leaked one file handle per review.
        with open(path) as review_file:
            soup = BeautifulSoup(review_file)
        try:
            meta = soup.find('ul', {"class": "review-meta"})
            rev = Review()
            # The review id is the first run of digits in the file name.
            rev.id = int(re.search(r'\d+', relpath).group(0))
            rev.artist = meta.find('h1').find('a').get_text()
            rev.album = meta.find('h2').get_text()
            str_date = meta.find('span', {"class": "pub-date"}).get_text()
            # Dates are parsed as "<full month name> <day>, <year>".
            rev.date = datetime.strptime(str_date, '%B %d, %Y')
            rev.score = float(meta.find('span', {"class": "score"}).get_text())
            rev.text = soup.find('div', {"class": "editorial"}).get_text()
            # Store the review's instance attributes as one document.
            db.reviews.insert(rev.__dict__)
            count += 1
            print(count)
            #out.write(rev.arff_row())
            #print str(rev)
        except Exception as e:
            # Deliberate best-effort: a missing element (find() returning
            # None), an unparsable date/score or a failed insert is reported
            # and the file is skipped so the rest of the batch still runs.
            print(e)
            print('failed to parse ' + path)
#out.close()