#functions_hybrid_model.py
import json
from collections import defaultdict
import string
import random
import itertools
import os
MAX_TOKEN_MATCH_POSTS = 5 #maximum number of token-matching posts to add to graph when inferring params for a post by an unseen user with all new tokens
MIN_EDGE_WEIGHT = 0.1 #minimum token weight threshold, try to keep edge explosion to a minimum
#filepaths of input files
subreddits_filepath = "model_files/subreddits.pkl" #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl" #processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt" #text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt" #edgelist of post graph for this subreddit
DISPLAY = False #toggle debug print statements
POST_PREFIX = "t3_"
COMMENT_PREFIX = "t1_"
#creates submission json file
#domain - simulation domain, must be one of 'crypto', 'cyber', or 'CVE'
#events - list of simulation events, each a dictionary with the following fields:
'''
"rootID": id of root post for cascade
"actionType": 'post' or 'comment'
"parentID": id of parent (comment or post), null/self if this object is post
"nodeTime": timestamp of event
"nodeUserID": id of user posting this event
"nodeID": id of post/comment
"communityID": id of subreddit ???
'''
#outfile - location of output json file
#generates file of the following form:
"""
{"team" : 'usc',
"scenario" : 2,
"domain" : domain,
"platform" : reddit,
"data":[
JSON_datapoint,
JSON_datapoint,
:
:
:
JSON_datapoint,
JSON_datapoint]
}
"""
def reddit_sim_to_json(domain, events, outfile):
print("Saving results to", outfile + "...")
#header values
team_name = 'usc'
scenario = 2
platform = 'reddit'
if domain == 'cve':
domain = "CVE"
#crypto, cyber, or CVE domain
    if domain not in ('crypto', 'cyber', 'CVE'):
print("Invalid domain")
return False
#write to json
submission = {'team' : team_name,
'scenario' : scenario,
'domain' : domain,
'platform' : platform,
'data' : events}
with open(outfile,'w') as f:
json.dump(submission, f, indent=3, sort_keys=False)
print("Done")
return
#end reddit_sim_to_json
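#illustrative call (a sketch - the event values here are hypothetical):
# events = [{"rootID": "t3_abc123", "actionType": "post", "parentID": "t3_abc123",
#            "nodeTime": "1421194091", "nodeUserID": "user0", "nodeID": "t3_abc123",
#            "communityID": "t5_2s3qj"}]
# reddit_sim_to_json('crypto', events, "submission.json")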
#given a filename of cascade seed test data, load all seed posts
#lucky us, the input data is multiple json objects in the same file - not a list of them
#at least they're one per line!
def load_reddit_seeds(filename):
data = []
with open(filename,'r') as f:
for line in f:
d = json.loads(line)
data.append(d)
print("Read", len(data), "events")
return data
#end load_reddit_seeds
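#each line is a standalone JSON object; downstream code expects (at least) the fields
#'id_h', 'author_h', 'created_utc', 'subreddit_id', and 'title_m' - e.g. a line might
#look like this (hypothetical values):
# {"id_h": "abc123", "author_h": "user0", "created_utc": 1421194091, "subreddit_id": "t5_2s3qj", "title_m": "Example post title"}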
#given a sim_tree return (tree structure of comment times), and original seed post, convert to desired dictionary structure
#{"rootID": "t3_-dt8ErhaKuULHekBf_ke3A", "communityID": "t5_2s3qj", "actionType": "post", "parentID": "t3_-dt8ErhaKuULHekBf_ke3A", "nodeTime": "1421194091", "nodeUserID": "XHa80wD0LQJNlMcrUD32pQ", "nodeID": "t3_-dt8ErhaKuULHekBf_ke3A"}
'''
"rootID": id of root post for cascade
"actionType": 'post' or 'comment'
"parentID": id of parent (comment or post), null/self if this object is post
"nodeTime": timestamp of event
"nodeUserID": id of user posting this event
"nodeID": id of post/comment
"communityID": id of subreddit ???
'''
def build_cascade_events(root, post, user_ids, subreddit):
#get start of event structure by converting sim tree to events list
events = tree_to_events(root, post['id_h'], post['created_utc'], subreddit)
#set the rest of the fields, the same for all events in this cascade
for event in events:
event['rootID'] = POST_PREFIX + post['id_h']
event['communityID'] = post['subreddit_id']
#assign user to each event - random from known users for now
event['nodeUserID'] = post['author_h'] if event['nodeID'] == POST_PREFIX + post['id_h'] else random.choice(user_ids)
if DISPLAY:
print(post['id_h'], post['author_h'], post['created_utc'])
print("")
for event in events:
print(event)
print("")
return events
#end build_cascade_events
NEXT_ID = 0 #global counter of next id to assign to simulated comments
#given root of sim tree, return as list of dictionary events, with correct parent pointers
def tree_to_events(root, seedID, seedTime, subreddit):
global NEXT_ID
visited = set() #set of visited nodes
stack = [(root, None)] #node processing stack
events = [] #list of events, where each event is a dictionary
while len(stack) != 0:
curr, parent = stack.pop() #get last node added to stack
#create event dictionary for this comment, add to events list
new_event = {'nodeTime': str(int(curr['time'] * 60) + seedTime)} #convert offset to seconds, add to seed time, convert entire time to string
#set nodeid if comment
        if parent is not None:
            new_event['nodeID'] = COMMENT_PREFIX + subreddit + "_comment" + str(NEXT_ID) #include comment prefix
            NEXT_ID += 1
        #nodeID = seed post ID if at root
        else:
            new_event['nodeID'] = POST_PREFIX + seedID
        #set parent of this event
        new_event['parentID'] = POST_PREFIX + seedID if parent is None else parent
        #set actionType of this event
        new_event['actionType'] = 'post' if parent is None else 'comment'
#add new event to list
events.append(new_event)
visited.add(curr['id']) #add this node to visited list
#append children in reverse time order so final output is sorted (not that it matters)
stack.extend([(child, new_event['nodeID']) for child in curr['children']][::-1])
return events
#end tree_to_events
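#the sim tree consumed above is assumed to be nested dicts of this shape (inferred
#from the field accesses, not a spec), with 'time' in minutes offset from the seed post:
# {'id': 0, 'time': 0.0, 'children': [{'id': 1, 'time': 5.2, 'children': []}]}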
#given a post, extract words by tokenizing and normalizing (no lemmatization for now)
#removes all leading/trailing punctuation
#converts list to set to remove duplicates (if we want that?)
#(duplicate of method in functions_prebuild_model.py, but copied here for convenience)
def extract_tokens(post):
punctuations = list(string.punctuation) #get list of punctuation characters
punctuations.append('—') #kill these too
title = post['title_m'] #grab post title as new string
tokens = [word.lower() for word in title.split()] #tokenize and normalize (to lower)
tokens = [word.strip("".join(punctuations)) for word in tokens] #strip trailing and leading punctuation
    tokens = [word for word in tokens if word != '' and word not in punctuations] #remove punctuation-only tokens and empty strings
return set(tokens) #convert to set before returning
#end extract_tokens
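#worked example (hypothetical title):
# extract_tokens({'title_m': "Breaking: Bitcoin hits $10k!"}) -> {'breaking', 'bitcoin', 'hits', '10k'}
#(lowercased, leading/trailing punctuation stripped, duplicates collapsed by the set)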
#given token sets from two posts, compute weight of edge between them
#(duplicate from functions_prebuild_model.py, copied for convenience)
def compute_edge_weight(tokens1, tokens2):
#get count of words shared by these posts
intersect_size = len(tokens1.intersection(tokens2))
#if intersection is non-empty, compute edge weight and add edge
if intersect_size > 0:
#weight = # tokens in common / # of tokens in shorter post
weight = intersect_size / (len(tokens1) if len(tokens1) < len(tokens2) else len(tokens2))
return weight
#no intersection, return False indicating no edge
return False
#end compute_edge_weight
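#worked example: tokens1 = {'bitcoin', 'price', 'crash'}, tokens2 = {'bitcoin', 'crash'}
# -> intersection size 2, shorter post has 2 tokens, so weight = 2/2 = 1.0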
#given set of posts and fitted params, build two helpful dictionaries
# user->set of posts
# tokens->set of posts
#use original post ids, translate to numeric in build_graph
#(duplicate of same method in functions_prebuild_model.py, but copied here for convenience)
def graph_dictionaries(posts):
tokens = defaultdict(set) #dictionary of tokens -> set of posts
users = defaultdict(set) #user->set of post-ids
for post_id, post in posts.items():
post_words = post['tokens'] #fetch tokens for this post
users[post['user']].add(post_id) #add this post to user set
for word in post_words:
tokens[word].add(post_id) #add post to each token set
return tokens, users
#end graph_dictionaries
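#e.g. posts = {'p1': {'tokens': {'a', 'b'}, 'user': 'u1'}, 'p2': {'tokens': {'b'}, 'user': 'u1'}}
# -> tokens = {'a': {'p1'}, 'b': {'p1', 'p2'}} and users = {'u1': {'p1', 'p2'}}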
TOKENS = None #global dictionaries for post graph, used in add_post_edges
USERS = None
GRAPH_SUBREDDIT = None #track subreddit used for current dictionaries (they must be rebuilt when the subreddit changes)
def add_post_edges(graph, isolated_nodes, graph_posts, new_post, new_post_numeric_id, subreddit, all_posts, estimate_initial_params=False, fitted_params=False, fitted_quality=False):
global TOKENS, USERS, GRAPH_SUBREDDIT
#if don't have token/user dictionaries yet, get them now
    if TOKENS is None or USERS is None or subreddit != GRAPH_SUBREDDIT:
TOKENS, USERS = graph_dictionaries(graph_posts)
GRAPH_SUBREDDIT = subreddit
#set isolated flag to make sure this node gets connected - otherwise need to handle as isolated
isolated = True
#tokenize this post, grab post user
new_tokens = extract_tokens(new_post)
new_user = new_post['author_h']
#compute average of neighbor params if estimating initial parameters option specified
if estimate_initial_params:
neighbor_count = 0
neighbor_sum = [0,0,0,0,0,0]
#add connecting edges for this post to the graph, token edges first
#add edges with scaled weight between posts with same tokens as this one
for token in new_tokens:
for other_post in TOKENS[token]:
#already an edge (and therefore a token edge) in the graph? skip
if (new_post_numeric_id, graph_posts[other_post]['id']) in graph or (graph_posts[other_post]['id'], new_post_numeric_id) in graph:
continue
#compute edge weight based on post token sets
weight = compute_edge_weight(graph_posts[other_post]['tokens'], new_tokens)
#if valid weight, add edge
if weight != False:
graph[(new_post_numeric_id, graph_posts[other_post]['id'])] = weight #add edge
isolated = False
if estimate_initial_params and all_posts[other_post]['id'] in fitted_params:
for i in range(6):
neighbor_sum[i] += (1-fitted_quality[all_posts[other_post]['id']]) * fitted_params[all_posts[other_post]['id']][i]
neighbor_count += (1-fitted_quality[all_posts[other_post]['id']])
#add edges of weight=1 between posts also by this user
for prev_post in USERS[new_user]:
        #already an edge (token edge) between these posts? just increase the weight
#want impact of both common user and common tokens
#check both edge orientations to be sure
if (new_post_numeric_id, graph_posts[prev_post]['id']) in graph:
graph[(new_post_numeric_id, graph_posts[prev_post]['id'])] += 1.0
elif (graph_posts[prev_post]['id'], new_post_numeric_id) in graph:
graph[(graph_posts[prev_post]['id'], new_post_numeric_id)] += 1.0
#new edge, just set weight
else:
graph[(new_post_numeric_id, graph_posts[prev_post]['id'])] = 1.0
isolated = False
if estimate_initial_params and all_posts[prev_post]['id'] in fitted_params:
for i in range(6):
neighbor_sum[i] += (1-fitted_quality[all_posts[prev_post]['id']]) * fitted_params[all_posts[prev_post]['id']][i]
neighbor_count += (1-fitted_quality[all_posts[prev_post]['id']])
#cve only: edge of weight 1 between posts in same subreddit
if 'sub' in new_post:
        for prev_post_id, prev_post in graph_posts.items():
#same subreddit, add edge
if prev_post['sub'] == new_post['sub']:
#already an edge between these posts? just increase the weight
#check both edge orientations to be sure
if (new_post_numeric_id, prev_post['id']) in graph:
graph[(new_post_numeric_id, prev_post['id'])] += 1.0
elif (prev_post['id'], new_post_numeric_id) in graph:
graph[(prev_post['id'], new_post_numeric_id)] += 1.0
#new edge, just set weight
else:
graph[(new_post_numeric_id, prev_post['id'])] = 1.0
isolated = False
                if estimate_initial_params and all_posts[prev_post_id]['id'] in fitted_params:
                    for i in range(6):
                        neighbor_sum[i] += (1-fitted_quality[all_posts[prev_post_id]['id']]) * fitted_params[all_posts[prev_post_id]['id']][i]
                    neighbor_count += (1-fitted_quality[all_posts[prev_post_id]['id']])
#if node is isolated (no edges added), include in isolated_nodes list
if isolated:
isolated_nodes.append(new_post_numeric_id)
#print("Isolated node")
#add this post to graph tracking, so we can connect it to other seed posts
graph_posts[new_post['id_h']] = {'tokens': new_tokens, 'id': new_post_numeric_id}
USERS[new_user].add(new_post['id_h'])
for token in new_tokens:
TOKENS[token].add(new_post['id_h'])
#for initial param estimate, finish average and return result
if estimate_initial_params and neighbor_count != 0:
for i in range(6):
neighbor_sum[i] /= neighbor_count
return graph, isolated_nodes, graph_posts, neighbor_sum
#no initial param estimate, return None
return graph, isolated_nodes, graph_posts, None
#end add_post_edges
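#typical call (a sketch - argument values are hypothetical):
# graph, isolated_nodes, graph_posts, estimate = add_post_edges(
#     graph, isolated_nodes, graph_posts, seed_post, 42, 'crypto', all_posts,
#     estimate_initial_params=True, fitted_params=params, fitted_quality=quality)
#when estimate_initial_params is True, estimate holds the quality-weighted average of
#neighbor params (or None if the new post connected to no posts with fitted params)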
#load fitted/inferred params from file
def load_params(filename, posts, inferred=False, quality=False):
#read all lines of file
with open(filename, 'r') as f:
lines = f.readlines()
#if reading inferred file, skip first line
if inferred:
lines.pop(0)
all_params = {}
all_quality = {}
#process each line, extract params
for line in lines:
values = line.split()
        post_id = int(values[0]) #get numeric id for this post
params = []
for i in range(1, 7):
params.append(float(values[i]))
        if not inferred:
all_quality[post_id] = float(values[7])
all_params[post_id] = params
print("Loaded", len(all_params), "fitted params from", filename)
if quality:
return all_params, all_quality
return all_params
#end load_params
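#e.g. a fitted-params line (hypothetical values): "1234 0.5 1.2 0.3 2.0 1.1 0.25 0.85"
# -> all_params[1234] = [0.5, 1.2, 0.3, 2.0, 1.1, 0.25] and all_quality[1234] = 0.85
#(inferred files carry an extra first line, which is skipped, and no quality column)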
#given sampled graph posts, build sampled params file
def get_sampled_params(posts, in_filename, out_filename):
#build list of post ids included in graph
    included_ids = {value['id'] for value in posts.values()} #set for fast membership tests
print("Filtering params to", len(included_ids), "sampled posts")
#only save the params we actually need
post_params = {}
    #read file, one line at a time
with open(in_filename, "r") as ins:
for line in ins:
values = line.split()
            post_id = int(values[0]) #get numeric id for this post
if post_id in included_ids:
params = []
for i in range(1, 8):
params.append(float(values[i]))
post_params[post_id] = params
#dump filtered params to file
with open(out_filename, "w") as f:
for post_id, params in post_params.items():
f.write(str(post_id) + " ") #write numeric post id
for i in range(len(params)):
f.write((' ' if i > 0 else '') + str(params[i]))
f.write("\n")
print("Saved sampled params to", out_filename)
#end get_sampled_params
#given graph posts, and set of fitted params, build sampled params file
#leave out params for exclude_post
#varies from get_sampled_params above because it assumes fitted params and qualities are already loaded (instead of reading from file)
#if estimate_initial_params argument is not False, include those in output file as well
def get_graph_params(posts, exclude_post, post_params, quality, out_filename, param_estimate=False):
#build list of post ids included in graph
    included_ids = {value['id'] for value in posts.values()} #set for fast membership tests
included_ids.remove(exclude_post) #don't include params for exclude_post
print("Filtering params to", len(included_ids), "graph posts")
#dump filtered params to file
with open(out_filename, "w") as f:
for post_id, params in post_params.items():
if post_id in included_ids:
f.write(str(post_id) + " ") #write numeric post id
for i in range(len(params)):
f.write(str(params[i]) + ' ')
f.write(str(quality[post_id]) + "\n")
        if param_estimate: #may be None or False if no estimate was computed
            f.write("%d %f %f %f %f %f %f\n" % (exclude_post, param_estimate[0], param_estimate[1], param_estimate[2], param_estimate[3], param_estimate[4], param_estimate[5]))
print("Saved graph params to", out_filename)
#end get_graph_params
#given complete set of posts/params for current subreddit, and seed posts,
#sample down to reach the target graph size (hopefully feasible for inference)
def user_sample_graph(raw_sub_posts, seeds, max_nodes, subreddit, min_node_quality, fitted_quality):
#is this a cve run? if so, set flag
cve = False
if subreddit == "cve":
cve = True
#if have minimum node quality threshold, throw out any posts with too low a quality
if min_node_quality != -1:
filtered_sub_posts = {key: value for key, value in raw_sub_posts.items() if value['id'] in fitted_quality and fitted_quality[value['id']] > min_node_quality}
print(" Filtered to", len(filtered_sub_posts), "based on minimum node quality of", min_node_quality)
else:
filtered_sub_posts = raw_sub_posts
#set of authors of seed
authors = set([post['author_h'] for post in seeds])
print(" ", len(authors), "authors in seed posts")
    #cve: get subreddits of seed posts, keep posts with same subreddit or user
    if cve:
        subs = set([post['subreddit'] for post in seeds])
        sub_posts = {key: value for key, value in filtered_sub_posts.items() if value['user'] in authors or value['sub'] in subs}
        graph_subs = set([post['sub'] for post_id, post in sub_posts.items()])
    #otherwise filter the posts: only those by users in the author pool
    else:
        sub_posts = {key: value for key, value in filtered_sub_posts.items() if value['user'] in authors}
    graph_authors = set([post['user'] for post_id, post in sub_posts.items()])
print(" Filtered to", len(sub_posts), "posts by", len(graph_authors), "authors")
    #no more than max_nodes posts, or this will never finish
if len(sub_posts) > max_nodes:
print(" Sampling down...")
#keep at least one post by each author that we have posts for (pick a random one)
#and keep any seed posts that we have
keep = set([seed['id_h'] for seed in seeds if seed['id_h'] in sub_posts])
for author in graph_authors:
keep.add(random.choice([key for key, value in sub_posts.items() if value['user'] == author]))
#if cve, keep at least one per sub
if cve:
for sub in graph_subs:
keep.add(random.choice([key for key, value in sub_posts.items() if value['sub'] == sub]))
#sample down (as far as we can, while maintaining one post per author (and sub if cve))
if max_nodes - len(keep) > 0:
keep.update(random.sample([key for key in sub_posts.keys() if key not in keep], max_nodes-len(keep)))
sub_posts = {key: sub_posts[key] for key in keep}
#too few? draw more
if len(sub_posts) < max_nodes:
print(" Drawing more posts...")
#add more
        draw_keys = [key for key in raw_sub_posts.keys() if key not in sub_posts]
num_draw = (max_nodes - len(sub_posts)) if (max_nodes - len(sub_posts)) < len(draw_keys) else len(draw_keys)
more = random.sample(draw_keys, num_draw)
sub_posts.update({key: raw_sub_posts[key] for key in more})
graph_authors = set([post['user'] for post_id, post in sub_posts.items()]) #update graph authors list
print(" Sampled to", len(sub_posts), "posts by", len(set(graph_authors)), "authors for inference (" + str(len([author for author in authors if author in graph_authors])), "seed authors)")
if cve:
graph_subs = set([post['sub'] for post_id, post in sub_posts.items()])
print(" ", len(graph_subs), "subreddits in graph (" + str(len([sub for sub in subs if sub in graph_subs])), "seed subs)")
#can we connect all the seed posts to this sampled version of the graph? check authors and tokens
#build list of tokens represented in graph (extra work, yes, repeated in graph build)
graph_tokens = set()
for post_id, post in sub_posts.items():
graph_tokens.update(post['tokens'])
print(" ", len(graph_tokens), "tokens in graph")
#check to see if all seed posts can be connected to this graph
for post in seeds:
#post author in graph, no need for token match
if post['author_h'] in graph_authors:
continue
#if cve and post sub in graph, no need for token match
if cve and post['subreddit'] in graph_subs:
continue
post_tokens = extract_tokens(post) #get tokens from this post
#if no token connection, try to draw some more posts to allow for connection
if len(graph_tokens.intersection(post_tokens)) == 0:
#can we find a post in our library with some of these tokens? to get the node connected
matching_posts = {key:value for key, value in raw_sub_posts.items() if len(post_tokens.intersection(value['tokens'])) != 0}
#if too many matching token posts, sample down
if len(matching_posts) > MAX_TOKEN_MATCH_POSTS:
                keep = random.sample(list(matching_posts.keys()), MAX_TOKEN_MATCH_POSTS) #list() so random.sample gets a sequence
matching_posts = {key: matching_posts[key] for key in keep}
#if no matching token/user posts - params will basically be a guess
if len(matching_posts) == 0:
print(" Cannot connect seed post to graph - parameter inference compromised")
#add the matching posts to the graph
else:
print(" Adding", len(matching_posts), "token-matching posts to graph")
sub_posts.update(matching_posts)
#return sampled posts and params
return sub_posts
#end user_sample_graph
#given a set of processed posts, "build" the post parameter graph
#but don't actually build it, just get an edgelist
#save edgelist to specified file
#no return, because graph is periodically dumped and not all stored in memory at once
#(near-duplicate of functions_prebuild_model.py for convenience)
def build_graph(posts, filename):
print("Building param graph for", len(posts), "posts")
#build the multi-graph
# one node for each post
# edge of weight=1 connecting posts by the same user
# edge of weight=(# shared)/(# in shortest title) between posts with common words
#(but really only one edge, with sum of both weights)
#store graph as edge-list dictionary, where (node, node) edge -> weight
graph = {}
nodes = set()
edge_count = 0
#loop all post-pairs and determine weight of edge, if any, between them
for post_pair in itertools.combinations(posts, 2): #pair contains ids of two posts
#fetch numeric ids for these posts
node1 = posts[post_pair[0]]['id']
node2 = posts[post_pair[1]]['id']
#compute edge weight based on post token sets
weight = compute_edge_weight(posts[post_pair[0]]['tokens'], posts[post_pair[1]]['tokens'])
        if weight is False or weight <= MIN_EDGE_WEIGHT: #no token overlap, or below minimum weight threshold - keep edge explosion to a minimum
            weight = 0
#cve only: if posts have same subreddit, add 1 to weight
if 'sub' in posts[post_pair[0]] and posts[post_pair[0]]['sub'] == posts[post_pair[1]]['sub']:
weight += 1
#if posts have same author, add 1 to weight
if posts[post_pair[0]]['user'] == posts[post_pair[1]]['user']:
weight += 1
#if edge weight is nonzero, add edge to graph
if weight != 0:
graph[(node1, node2)] = weight #add edge
nodes.add(node1) #track edges in graph so we can find isolated nodes later
nodes.add(node2)
edge_count += 1 #keep count of all edges
#if added edge, and current edgelist has reached dump level, dump and clear before continuing
            if len(graph) >= 25000000:
save_graph(graph, filename)
print(" Saved", edge_count, "edges")
graph = {}
    #handle isolated/missing nodes - collect a list of them, encoded into the edgelist during output
isolated_nodes = [value['id'] for key, value in posts.items() if value['id'] not in nodes]
print("Finished graph has", len(nodes) + len(isolated_nodes), "nodes (" + str(len(isolated_nodes)), "isolated) and", edge_count, "edges")
#dump any remaining edges/isolated nodes to edgelist file (final save)
save_graph(graph, filename, isolated_nodes)
print("Saved post-graph to", filename)
#end build_graph
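#weight example: two posts sharing 2 of min(4, 5) title tokens, by the same user, in the
#same subreddit (cve only) get a single edge of weight 2/4 + 1 + 1 = 2.5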
#given a set of processed posts, "build" the post parameter graph
#but don't actually build it, just get an edgelist
#save edgelist to specified file
#no return, because graph is periodically dumped and not all stored in memory at once
#performs same function as build_graph above, but also estimates initial params for specified node as average of neighbors
#post_id must be the NUMERIC id of the post, not the original id
def build_graph_estimate_node_params(posts, fitted_params, fitted_quality, post_id, filename):
print("Building param graph for", len(posts), "posts and estimating params for post", post_id)
#build the multi-graph
# one node for each post
# edge of weight=1 connecting posts by the same user
# edge of weight=(# shared)/(# in shortest title) between posts with common words
#(but really only one edge, with sum of both weights)
#store graph as edge-list dictionary, where (node, node) edge -> weight
graph = {}
nodes = set()
edge_count = 0
#for estimating params of post with post_id
neighbor_count = 0
neighbor_sum = [0,0,0,0,0,0]
#loop all post-pairs and determine weight of edge, if any, between them
for post_pair in itertools.combinations(posts, 2): #pair contains ids of two posts
#fetch numeric ids for these posts
node1 = posts[post_pair[0]]['id']
node2 = posts[post_pair[1]]['id']
#compute edge weight based on post token sets
weight = compute_edge_weight(posts[post_pair[0]]['tokens'], posts[post_pair[1]]['tokens'])
        if weight is False or weight <= MIN_EDGE_WEIGHT: #no token overlap, or below minimum weight threshold - keep edge explosion to a minimum
            weight = 0
#cve only: if posts have same subreddit, add 1 to weight
if 'sub' in posts[post_pair[0]] and posts[post_pair[0]]['sub'] == posts[post_pair[1]]['sub']:
weight += 1
#if posts have same author, add 1 to weight
if posts[post_pair[0]]['user'] == posts[post_pair[1]]['user']:
weight += 1
#if edge weight is nonzero, add edge to graph
if weight != 0:
graph[(node1, node2)] = weight #add edge
nodes.add(node1) #track edges in graph so we can find isolated nodes later
nodes.add(node2)
edge_count += 1 #keep count of all edges
#if added edge, and current edgelist has reached dump level, dump and clear before continuing
            if len(graph) >= 25000000:
save_graph(graph, filename)
print(" Saved", edge_count, "edges")
graph = {}
#if this edge involves specified post, update param estimate
if node1 == post_id or node2 == post_id:
other_id = node1 if node1 != post_id else node2 #numeric id of other post
for i in range(6):
neighbor_sum[i] += (1-fitted_quality[other_id]) * fitted_params[other_id][i]
neighbor_count += (1-fitted_quality[other_id])
#for initial param estimate, finish average and return result
if neighbor_count != 0:
for i in range(6):
neighbor_sum[i] /= neighbor_count
    #handle isolated/missing nodes - collect a list of them, encoded into the edgelist during output
isolated_nodes = [value['id'] for key, value in posts.items() if value['id'] not in nodes]
print("Finished graph has", len(nodes) + len(isolated_nodes), "nodes (" + str(len(isolated_nodes)), "isolated) and", edge_count, "edges")
#dump any remaining edges/isolated nodes to edgelist file (final save)
save_graph(graph, filename, isolated_nodes)
print("Saved post-graph to", filename)
return neighbor_sum #return estimated params
#end build_graph_estimate_node_params
#save graph to txt file
def save_graph(edgelist, filename, isolated_nodes = []):
#and save graph to file
with open(filename, "w") as f:
for edge, weight in edgelist.items():
f.write("%d %d %f\n" % (edge[0], edge[1], weight))
for node in isolated_nodes:
f.write("%d\n" % node)
#end save_graph
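#resulting edgelist file: one "node1 node2 weight" line per edge, then one bare node id
#line per isolated node, e.g. (hypothetical):
# 0 1 1.500000
# 0 2 0.250000
# 3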