-
Notifications
You must be signed in to change notification settings - Fork 1
/
assess_sim.py
138 lines (110 loc) · 5.07 KB
/
assess_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#stripped-down version of hybrid_model, only simulates cascades for which we have fitted parameters and does not save output
#used to examine simulated cascade against actual cascade
import file_utils
import sim_tree
import fit_cascade
from functions_hybrid_model import *
import cascade_manip
from shutil import copyfile
import sys
import subprocess
#filepaths of input files; each template takes the subreddit name via %-formatting

#processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
posts_filepath = "model_files/posts/%s_posts.pkl"

#text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
params_filepath = "model_files/params/%s_params.txt"

#display flag; NOTE(review): nothing in the visible script reads it - confirm whether it is still needed
DISPLAY = False
#verify command line args: exactly two are expected, <seed filename> and <domain>
if len(sys.argv) != 3:
    print("Incorrect command line arguments\nUsage: python3 assess_sim.py <seed filename> <domain>")
    #bad usage is a failure, so hand a non-zero status back to the calling shell
    #(sys.exit is used because the bare exit() helper is only provided by the site module)
    sys.exit(1)

#extract arguments
infile = sys.argv[1]    #seed filename, passed to load_reddit_seeds
domain = sys.argv[2]    #passed to cascade_manip.load_filtered_cascades
#echo the input filename, so piped/saved output records which seed file was used
print("Input", infile)

#read the seed posts from the input file
raw_post_seeds = load_reddit_seeds(infile)

#group the seeds into a dictionary of subreddit->list of post objects
post_seeds = defaultdict(list)
for post in raw_post_seeds:
    subreddit_seeds = post_seeds[post['subreddit']]
    subreddit_seeds.append(post)

#show how many seed posts each subreddit received
print({name: len(group) for name, group in post_seeds.items()})

#counter of posts to simulate, across all subreddits
post_counter = 1
#number of comment trees simulated for each seed post
SIMS_PER_POST = 5


def _print_level_counts(level_counts):
    """Print the node count of every level on one line, space-separated, then end the line."""
    for count in level_counts.values():
        print(count, end=" ")
    print("")


#process each subreddit: load its data, then fit and simulate every seed post
for subreddit, seeds in post_seeds.items():

    #TESTING ONLY!!!! - every subreddit other than Lisk is skipped
    if subreddit != "Lisk":
        continue

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #load preprocessed posts for this subreddit
    subreddit_posts_file = posts_filepath % subreddit
    if file_utils.verify_file(subreddit_posts_file):
        posts = file_utils.load_pickle(subreddit_posts_file)
        print("Loaded", len(posts), "processed posts from", subreddit_posts_file)
    else:
        print("Cannot simulate for subreddit", subreddit, "without processed posts file", subreddit_posts_file)
        #missing input is a failure, so exit with a non-zero status
        sys.exit(1)

    #correct key format of seed posts, if necessary
    for seed_post in seeds:
        #if post id contains the t3_ prefix, strip it off so we don't have to change everything
        if seed_post['id_h'].startswith(POST_PREFIX):
            #slice by the prefix length rather than a hard-coded 3
            seed_post['id_h'] = seed_post['id_h'][len(POST_PREFIX):]

    #loading of fitted params for this subreddit is disabled; params are fitted per post below
    #fitted_params, quality = load_params(params_filepath % subreddit, posts, quality=True)

    cascades_cache = "%s_test_cascades.pkl" % subreddit
    comments_cache = "%s_test_comments.pkl" % subreddit

    #if have cached cascades AND comments for this set of posts, load them
    #(both files are checked, since both are loaded)
    if file_utils.verify_file(cascades_cache) and file_utils.verify_file(comments_cache):
        cascades = file_utils.load_pickle(cascades_cache)
        comments = file_utils.load_pickle(comments_cache)
        print("Loaded", len(cascades), "filtered test cascades")
    #otherwise, load filtered cascades for this subreddit, and build filtered list
    else:
        all_cascades, all_comments = cascade_manip.load_filtered_cascades(domain, subreddit)
        #set gives constant-time membership tests while filtering the cascades
        seed_ids = {post['id_h'] for post in seeds}
        cascades = {post_id: post for post_id, post in all_cascades.items() if post_id in seed_ids}
        file_utils.save_pickle(cascades, cascades_cache)
        cascades, comments = cascade_manip.filter_comments_by_posts(cascades, all_comments)
        file_utils.save_pickle(comments, comments_cache)

    #data loaded, on to the simulation!
    #for each post, fit parameters and simulate
    print("Simulating comment trees...")
    for seed_post in seeds:

        #grab real post for this cascade; a seed missing from the filtered cascades cannot be fitted
        seed_id = seed_post['id_h']
        if seed_id not in cascades:
            print("No filtered cascade for seed post", seed_id, "- skipping")
            continue
        test_post = cascades[seed_id]

        #disabled alternative: use previously fitted params, skipping posts that have none
        #if test_post['id'] in fitted_params:
        #    post_params = fitted_params[test_post['id']]
        #    post_quality = quality[test_post['id']]
        #else:
        #    print("No fitted params for this post - skipping")
        #    continue

        #fit params instead, because reasons
        res = fit_cascade.fit_cascade_model(test_post, comments)
        post_params = res[:6]    #first six values are the cascade params
        post_quality = res[6]    #seventh value is the quality

        #print some stuff
        print("\nTitle:", test_post['title_m'])
        print("Params:", post_params)
        print("Quality:", post_quality)

        #some data on actual cascade
        actual_root_replies = fit_cascade.get_root_comment_times(test_post, comments)
        actual_all_replies = sorted(actual_root_replies + fit_cascade.get_other_comment_times(test_post, comments))
        print("Actual cascade has", len(actual_root_replies), "replies and", len(actual_all_replies), "total comments")
        actual_levels = fit_cascade.count_nodes_per_level(test_post, comments)
        _print_level_counts(actual_levels)

        #simulate a comment tree!
        #actually, simulate a few, printing the per-level counts of each for comparison
        for _ in range(SIMS_PER_POST):
            #NOTE(review): display is hard-coded to True; the module-level DISPLAY flag is not consulted
            sim_root, _all_times = sim_tree.simulate_comment_tree(post_params, display=True)
            sim_levels = sim_tree.sim_count_nodes_per_level(sim_root)
            _print_level_counts(sim_levels)

#finished all posts across all subreddits
print("Finished all simulations")