/
bills.rb
executable file
·392 lines (315 loc) · 11.9 KB
/
bills.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
class Bills
# options:
# congress: The congress to update.
# bill_id: The particular bill to update. Useful for development.
# limit: A limit on the number of processed bills. Useful for development.
# Syncs bills for one Congress from the unitedstates/congress JSON files on
# disk into Bill documents, then files reports about the outcome.
#
# options (all optional):
#   :congress - Congress number to sync; defaults to Utils.current_congress.
#   :bill_id  - sync only this bill ID instead of scanning the data directory.
#   :limit    - cap on the number of bills processed when scanning.
#   :debug    - print a line for every bill that saves successfully.
#
# Files a Report failure and returns early when the bills directory for the
# requested Congress does not exist. Otherwise ends with a Report success,
# after filing a warning for unmatched legislators and a failure if any bill
# could not be saved.
def self.run(options = {})
  congress = options[:congress] ? options[:congress].to_i : Utils.current_congress

  count = 0
  missing_legislators = []
  missing_committees = []
  bad_bills = []

  # we track when new summaries are added, for analytical interest
  new_summaries = []

  # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2
  unless File.exist?("data/unitedstates/congress/#{congress}/bills")
    Report.failure self, "Data not available on disk for the requested Congress."
    return
  end

  # lookup caches shared across every bill handled in this run
  legislators = {}
  committee_cache = {}

  if options[:bill_id]
    bill_ids = [options[:bill_id]]
  else
    paths = Dir.glob("data/unitedstates/congress/#{congress}/bills/*/*")
    bill_ids = paths.map {|path| "#{File.basename path}-#{congress}"}
    if options[:limit]
      bill_ids = bill_ids.first options[:limit].to_i
    end
  end

  bill_ids.each do |bill_id|
    bill = Bill.find_or_initialize_by bill_id: bill_id

    # NOTE: this reassigns the method-level `congress`, so the final success
    # report names the Congress of the last bill processed.
    type, number, congress, _chamber = Utils.bill_fields_from bill_id

    path = "data/unitedstates/congress/#{congress}/bills/#{type}/#{type}#{number}/data.json"
    # File.read closes the file itself and never interprets the path as a
    # command, unlike a bare Kernel#open
    doc = Oj.load File.read(path)

    introduced_on = doc['introduced_at'] # must get done before summary check

    if doc['sponsor']
      sponsor = sponsor_for doc['sponsor'], legislators
      missing_legislators << [bill_id, doc['sponsor']] if sponsor.nil?
    else
      sponsor = nil # occurs at least in hjres45-111, debt ceiling bill
    end

    cosponsors, withdrawn, missing = cosponsors_for doc['cosponsors'], legislators
    missing_legislators += missing.map {|m| [bill_id, m]} if missing.any?

    actions = actions_for doc['actions'], committee_cache

    summary = summary_for doc['summary']
    summary_short = short_summary_for summary
    summary_date = summary_date_for doc['summary']

    # if a summary is here, and it wasn't before, record this
    if bill['summary'].blank? && summary.present?
      new_summaries << {bill_id: bill_id, introduced_on: introduced_on, new_record: bill.new_record?}
    end

    committees, missing = committees_for doc['committees'], committee_cache
    missing_committees += missing.map {|m| [bill_id, m]} if missing.any?

    # todo: when amendments are supported,
    # pass on a full related_bills field with the original fields.
    # a document without a related_bills key yields an empty list
    related_bill_ids = (doc['related_bills'] || []).map {|details| details['bill_id']}.compact

    votes = votes_for actions

    # in rare cases, there are no actions. in those cases:
    # * make last_action null
    # * set last_action_at to introduced_on, so there's always something to sort on
    last_action = actions.any? ? actions.last : nil
    last_action_at = actions.any? ? actions.last['acted_at'] : introduced_on

    bill.attributes = {
      bill_type: type,
      number: number,
      congress: congress,
      chamber: {'h' => 'house', 's' => 'senate'}[type.first.downcase],

      short_title: doc['short_title'],
      official_title: doc['official_title'],
      popular_title: doc['popular_title'],
      titles: doc['titles'],

      keywords: doc['subjects'],

      summary: summary,
      summary_short: summary_short,
      summary_date: summary_date,

      sponsor: sponsor,
      sponsor_id: (sponsor ? sponsor['bioguide_id'] : nil),
      cosponsors: cosponsors,
      cosponsor_ids: cosponsors.map {|c| c['legislator']['bioguide_id']},
      cosponsors_count: cosponsors.size,
      withdrawn_cosponsors: withdrawn,
      withdrawn_cosponsor_ids: withdrawn.map {|c| c['legislator']['bioguide_id']},
      withdrawn_cosponsors_count: withdrawn.size,

      introduced_on: introduced_on,
      history: history_for(doc['history']),
      enacted_as: enacted_as_for(doc),

      actions: actions,
      last_action: last_action,
      last_action_at: last_action_at,

      votes: votes,
      last_vote_at: votes.last ? votes.last['acted_at'] : nil,

      committees: committees,
      committee_ids: committees.map {|c| c['committee']['committee_id']},

      related_bill_ids: related_bill_ids,

      urls: urls_for(bill_id)
    }

    if bill.save
      # work-around - last_action_at and last_vote_at can both be dates or times,
      # and Mongo does not order these correctly together when times are turned into
      # Mongo native time objects. So, we serialize them to a string before saving it.
      ['last_action_at', 'last_vote_at'].each do |field|
        if bill[field]
          bill[field] = bill[field].xmlschema unless bill[field].is_a?(String)
          bill.set(field, bill[field])
        end
      end

      count += 1
      puts "[#{bill_id}] Saved successfully" if options[:debug]
    else
      bad_bills << {attributes: bill.attributes, error_messages: bill.errors.full_messages}
      puts "[#{bill_id}] Error saving, will file report"
    end
  end

  if missing_legislators.any?
    missing_legislators = missing_legislators.uniq
    Report.warning self, "Found #{missing_legislators.size} unmatchable legislators.", {missing_legislators: missing_legislators}
  end

  if missing_committees.any?
    missing_committees = missing_committees.uniq
    # Report.warning self, "Found #{missing_committees.size} missing committee IDs or subcommittee names.", {missing_committees: missing_committees}
  end

  # only the last failing bill is attached to the report
  if bad_bills.any?
    Report.failure self, "Failed to save #{bad_bills.size} bills.", bill: bad_bills.last
  end

  if new_summaries.any?
    Event.new_summaries! new_summaries
    # Report.warning self, "Summaries added for #{new_summaries.size} bills, data attached", new_summaries: new_summaries
  end

  Report.success self, "Synced #{count} bills for congress ##{congress} from THOMAS.gov."
end
# Resolves a sponsor hash to a legislator, using `legislators` as a cache
# keyed by THOMAS ID. A cache miss falls back to legislator_for and stores a
# successful lookup. Returns nil when no legislator matches, so the caller can
# report it.
def self.sponsor_for(sponsor, legislators)
  thomas_id = sponsor['thomas_id']

  cached = legislators[thomas_id]
  return cached if cached

  found = legislator_for(thomas_id)
  return nil unless found

  # remember the match for later bills
  legislators[thomas_id] = found
  found
end
# just make sure all the dates are in UTC
# Returns a copy of `history` in which every value under a key ending in
# "_at" that contains a ":" (i.e. carries a time of day) is replaced by
# Utils.utc_parse of that value. All other entries are copied unchanged, and
# the input hash is not modified.
#
# Returns nil when `history` is nil. Values that are not Strings (nil,
# booleans, ...) are left alone rather than raising.
def self.history_for(history)
  return nil if history.nil?

  converted = history.dup
  history.each do |key, value|
    next unless key.to_s.end_with?('_at')
    # date-only strings have no ":" and are kept as they are
    next unless value.is_a?(String) && value.include?(':')

    converted[key] = Utils.utc_parse(value)
  end
  converted
end
# Splits a bill's cosponsor entries into current and withdrawn cosponsorships.
#
# cosponsors  - array of cosponsor hashes (reads 'thomas_id', 'sponsored_at',
#               'withdrawn_at'); nil is treated as an empty list.
# legislators - cache of legislators keyed by THOMAS ID; successful lookups
#               through legislator_for are added to it.
#
# Returns [current, withdrawn, missing]. Entries in the first two are hashes
# with 'sponsored_on', 'legislator' and, for withdrawn ones, 'withdrawn_on'.
# `missing` holds the original cosponsor hashes that matched no legislator.
def self.cosponsors_for(cosponsors, legislators)
  current = []
  withdrawn = []
  missing = []

  (cosponsors || []).each do |cosponsor|
    thomas_id = cosponsor['thomas_id']

    person = legislators[thomas_id]
    unless person
      person = legislator_for(thomas_id)
      # cache it for next time
      legislators[thomas_id] = person if person
    end

    unless person
      missing << cosponsor
      next
    end

    cosponsorship = {'sponsored_on' => cosponsor['sponsored_at']}
    if cosponsor['withdrawn_at']
      cosponsorship['withdrawn_on'] = cosponsor['withdrawn_at']
      withdrawn << cosponsorship.merge('legislator' => person)
    else
      current << cosponsorship.merge('legislator' => person)
    end
  end

  [current, withdrawn, missing]
end
# clean up on some fields in actions
# Normalizes a bill's raw action list.
#
# For every action this:
# * drops it when its 'acted_at' lies in the future,
# * runs 'acted_at' through Utils.utc_parse when it is a String containing a
#   time of day (a ":"),
# * replaces 'where' with 'chamber' and, when present, 'roll' with 'roll_id'
#   ("<where><roll>-<year of acted_at>"),
# * replaces the 'committees' ID list with [{'committee_id', 'name'}] for the
#   IDs that committee_match can resolve (key omitted when none resolve),
# * removes 'committee', 'subcommittee', 'in_committee' and 'status'.
#
# Works on shallow copies, so the hashes passed in are not modified. A nil
# `actions` is treated as an empty list. Returns the array of cleaned actions.
def self.actions_for(actions, committee_cache)
  now = Time.now

  (actions || []).map do |raw_action|
    action = raw_action.dup
    acted_at = action['acted_at']
    time = acted_at.is_a?(String) ? Time.parse(acted_at) : acted_at

    # discard future 'actions', that's not what this is about
    next if time > now

    # only Strings can carry an unparsed time of day; the type check also
    # avoids calling =~ on objects that no longer define it (Ruby 3.2+)
    if acted_at.is_a?(String) && acted_at.include?(':')
      action['acted_at'] = Utils.utc_parse acted_at
    end

    if (where = action.delete('where'))
      action['chamber'] = {'h' => 'house', 's' => 'senate'}[where]

      # can only do this if 'where' is present (which it should be)
      if (roll = action.delete('roll'))
        action['roll_id'] = "#{where}#{roll}-#{time.year}"
      end
    end

    committees = []
    if (committee_ids = action.delete('committees'))
      committee_ids.each do |committee_id|
        match = committee_match(committee_id, committee_cache)
        next unless match

        committees << {
          'committee_id' => committee_id,
          'name' => match['name']
        }
      end
    end
    action['committees'] = committees if committees.any?

    # 'committee', 'subcommittee' and 'in_committee' are old, unsupported forms
    # of attaching committees to actions; 'status' is simply not used
    %w[committee subcommittee in_committee status].each { |key| action.delete key }

    action
  end.compact
end
# Returns the actions whose 'type' contains "vote", in their original order.
def self.votes_for(actions)
  vote_pattern = /vote/
  actions.select { |entry| vote_pattern =~ entry['type'] }
end
# Matches a bill's committee entries against known committees.
#
# Each entry is looked up through committee_match, using its 'committee_id'
# with the 'subcommittee_id' appended when one is present. Entries that carry
# only a 'subcommittee' name and no ID are skipped with a printed notice.
#
# Returns [committees, missing]: `committees` holds
# {'activity' => ..., 'committee' => match} hashes, `missing` holds the
# lookup IDs that matched nothing.
def self.committees_for(elements, committee_cache)
  matched = []
  unmatched = []

  elements.each do |element|
    parent_id = element['committee_id']
    has_subcommittee_id = element['subcommittee_id'].present?

    # a subcommittee name without an ID cannot be looked up
    if !has_subcommittee_id && element['subcommittee']
      puts "unitedstates layer failed to normalize subcommittee (#{element['subcommittee']}) for committee ID #{parent_id} -- skipping"
      next
    end

    lookup_id = has_subcommittee_id ? parent_id + element['subcommittee_id'] : parent_id

    found = committee_match(lookup_id, committee_cache)
    if found
      matched << {'activity' => element['activity'], 'committee' => found}
    else
      unmatched << lookup_id
    end
  end

  [matched, unmatched]
end
# Looks up the first Legislator with the given THOMAS ID and returns it passed
# through Utils.legislator_for, or nil when there is no such legislator.
def self.legislator_for(thomas_id)
  record = Legislator.where(thomas_id: thomas_id).first
  return nil unless record

  Utils.legislator_for(record)
end
# Returns the committee data for `id`, preferring the entry already stored in
# `committee_cache`. On a cache miss the first Committee with that
# committee_id is passed through Utils.committee_for and the result is stored
# in the cache. Returns nil when nothing matches. A nil cache is allowed; a
# throwaway hash is used in its place.
def self.committee_match(id, committee_cache)
  cache = committee_cache || {}
  return cache[id] if cache[id]

  record = Committee.where(committee_id: id).first
  cache[id] = Utils.committee_for(record) if record
  cache[id]
end
# Returns the 'text' of a summary hash, or nil when there is no summary.
def self.summary_for(summary)
  return nil unless summary

  summary['text']
end
# Returns the 'date' of a summary hash, or nil when there is no summary.
def self.summary_date_for(summary)
  return nil unless summary

  summary['date']
end
# Returns `summary` shortened for display.
#
# summary - the full summary text, or nil.
# max     - maximum number of characters kept from the summary (default 1000).
#
# Returns nil for a nil summary, the summary itself when it is at most `max`
# characters long, and otherwise its first `max` characters followed by "...".
def self.short_summary_for(summary, max = 1000)
  return nil unless summary
  return summary if summary.size <= max

  # exclusive range: keep exactly `max` characters, not max + 1
  summary[0...max] + "..."
end
# Builds the hash of external URLs for a bill, keyed by :congress, :govtrack
# and :opencongress. The bill's type, number and congress come from
# Utils.bill_fields_from.
def self.urls_for(bill_id)
  type, number, congress = Utils.bill_fields_from bill_id

  links = {}
  links[:congress] = congress_gov_url(congress, type, number)
  links[:govtrack] = govtrack_url(congress, type, number)
  links[:opencongress] = opencongress_url(bill_id)
  links
end
# Returns the OpenCongress page URL for a bill ID.
def self.opencongress_url(bill_id)
  slug = bill_id.to_s
  "http://www.opencongress.org/bill/#{slug}"
end
# Returns the GovTrack page URL for a bill, built from its congress, type and
# number.
def self.govtrack_url(congress, type, number)
  bill_slug = "#{type}#{number}"
  "https://www.govtrack.us/congress/bills/#{congress}/#{bill_slug}"
end
# Returns the congress.gov URL for a bill. The congress number is written as
# an English ordinal ("93rd", "101st", "112th", "113th"), so the path is also
# correct for congresses where "th" is not the right suffix.
def self.congress_gov_url(congress, type, number)
  ordinal = "#{congress}#{congress_ordinal_suffix congress}"
  "http://beta.congress.gov/bill/#{ordinal}/#{congress_gov_type type}/#{number}"
end

# Helper for congress_gov_url: the English ordinal suffix ("st", "nd", "rd" or
# "th") for a congress number given as an Integer or a numeric String.
def self.congress_ordinal_suffix(congress)
  value = congress.to_i.abs
  # 11, 12 and 13 (and 111, 112, ...) take "th" despite their last digit
  return 'th' if (11..13).cover?(value % 100)

  case value % 10
  when 1 then 'st'
  when 2 then 'nd'
  when 3 then 'rd'
  else 'th'
  end
end
# Maps a bill type code ("hr", "sjres", ...) to the path segment used in
# congress.gov bill URLs. Returns nil for an unrecognized type.
def self.congress_gov_type(bill_type)
  case bill_type
  when "hr" then "house-bill"
  when "hres" then "house-resolution"
  when "hconres" then "house-concurrent-resolution"
  when "hjres" then "house-joint-resolution"
  when "s" then "senate-bill"
  when "sres" then "senate-resolution"
  when "sconres" then "senate-concurrent-resolution"
  when "sjres" then "senate-joint-resolution"
  end
end
# Returns a copy of the document's 'enacted_as' hash with 'congress' and
# 'number' converted to integers, or nil when the document has no
# 'enacted_as'. The document itself is not modified.
def self.enacted_as_for(doc)
  law = doc['enacted_as']
  return nil unless law

  law.merge(
    'congress' => law['congress'].to_i,
    'number' => law['number'].to_i
  )
end
end