-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_updates.py
97 lines (86 loc) · 3.46 KB
/
process_updates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pickle
import uuid
import re
class Processor(object):
"""
A utility for processing the raw text data pulled from the HCCPD site.
"""
def __init__(self, update_file):
self.update_file = update_file
self.raw_updates = self._load(self.update_file)
self.people = {}
@staticmethod
def _load(infile):
"""
Load pickled data from the provided file.
"""
return pickle.load(open(infile, 'rb'))
def _add_new_person(self, bio, segments):
person = {
'bio': bio,
'segments': segments
}
self.people[str(uuid.uuid4())] = person
def segment(self):
"""
Process the raw update data into Person objects.
"""
for page in self.raw_updates.itervalues():
segments = dict()
# Exact weekday varies from year to year, so extract when updates
# were due for the current page.
breaks = re.findall(r'(?:Jan|Feb|Mar|Apr|May) \d\d?', page)
if len(breaks) == 0:
continue # Skip non-update pages
bio = page[:page.find(breaks[0])]
for i, brk in enumerate(breaks):
# Grab and save from the current break to the next break.
try:
segments[brk] = self.seg_helper(page[page.find(breaks[i]):
page.find(breaks[i+1])])
except IndexError:
# Last segment is from current break to end of string
segments[brk] = self.seg_helper(page[page.find(breaks[i]):])
self._add_new_person(bio, segments)
def seg_helper(self, seg):
one = seg.find('What I did:')
two = seg.find('What I did to develop my professional awareness and/or my human network:')
three = seg.find('How long I spent:')
four = seg.find('What I plan to do next week:')
try:
split = {
'did': seg[one + 10:two],
'pro': seg[two + 71:three],
'hrs': seg[three:four],
'fut': seg[four:]
}
hrsexpr = r'How long I spent:\n?(.*?)(?:Recall that you .+\))?(.*)'
matches = re.findall(hrsexpr, split['hrs'])
if not matches:
split['hrs'] = -1
elif matches:
for match in matches[0]:
discard_expr = r'@|http|Don\'t|Update'
if not re.findall(discard_expr, match):
clean_expr = r'(\d{1,2}(?:\.5)?) ?(?:hrs?|hours?)'
stripped = re.findall(clean_expr, match, re.IGNORECASE)
if stripped:
split['hrs'] = float(stripped[0])
else: # Entries without a time unit?
try:
tim = float(match)
if tim < 15:
split['hrs'] = tim
else: # Probably minutes: toss out
split['hrs'] = -1
except ValueError:
split['hrs'] = -1
except TypeError:
split = {}
return split
if __name__ == "__main__":
import json
pro = Processor('scraped_updates.p')
pro.segment()
with open('people.json', 'w') as outfile:
json.dump(pro.people, outfile)