prebuild_model.py

#create and save all the necessary files for the new c++/python hybrid model
#save everything to model_files for easy server loading later
import cascade_analysis
import fit_cascade
import file_utils
from functions_prebuild_model import *
from itertools import count
import sys
#optional command line argument to limit to a single subreddit
if len(sys.argv) == 2:
    subreddit_filter = sys.argv[1]
else:
    subreddit_filter = None    #no filter given; process all subreddits
#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl" #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl" #processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt" #text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt" #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt" #list of users seen in posts/comments, one file per subreddit
#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
print("Loading subreddit list from", subreddits_filepath)
subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json("results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json("results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json("results/cyber_post_subreddit_dist.json")
    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([set(crypto_subreddit_dist.keys()), set(cve_subreddit_dist.keys()), set(cyber_subreddit_dist.keys())], ["crypto", "cve", "cyber"])
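    #build_domain_dict comes from functions_prebuild_model (star import above);
    #a minimal sketch of the assumed behavior, for reference only (the real
    #helper may differ):
    #
    #    def build_domain_dict(subreddit_sets, codes):
    #        #map each subreddit to every domain code whose set contains it
    #        domain_dict = {}
    #        for subreddit_set, code in zip(subreddit_sets, codes):
    #            for sub in subreddit_set:
    #                domain_dict.setdefault(sub, []).append(code)
    #        return domain_dict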
    #now, kill all the duplicates! crypto and cyber scraped entire subreddits,
    #so any overlap is redundant and can be thrown away
    #(yes, there are neater ways to do this, but I don't care!)
    for item in subreddit_dict.keys():
        if len(subreddit_dict[item]) > 1:
            #crypto and cyber drown out cve, so remove it
            if ("crypto" in subreddit_dict[item] or "cyber" in subreddit_dict[item]) and "cve" in subreddit_dict[item]:
                subreddit_dict[item].remove("cve")
        #collapse the list to a single domain code (first entry wins)
        subreddit_dict[item] = subreddit_dict[item][0]
    #save as pickle for later
    print("Saving subreddit->domain mapping to", subreddits_filepath)
    file_utils.save_pickle(subreddit_dict, subreddits_filepath)
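#example of the resulting mapping (hypothetical subreddit names):
#    subreddit_dict = {'Bitcoin': 'crypto', 'netsec': 'cyber', 'CVEs': 'cve'}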
#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")
#loop all subreddits
for subreddit, domain in subreddit_dict.items():
    #if a subreddit filter was given on the command line, skip all others
    if subreddit_filter is not None and subreddit != subreddit_filter:
        continue
    #if domain != "crypto":
    #    continue
print("\nProcessing", subreddit, "in", domain, "domain")
#load filtered cascades for this subreddit
#cascades, comments = load_subreddit_cascades(subreddit, domain)
#delay loading until we actually need them
cascades = None
comments = None
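    #load_cascades (from functions_prebuild_model) is assumed to lazy-load on
    #first use and pass through afterwards; a hypothetical sketch of that
    #contract (the real helper may differ):
    #
    #    def load_cascades(subreddit, domain, cascades, comments):
    #        #only hit the disk if nothing is loaded yet
    #        if cascades is None:
    #            cascades, comments = load_subreddit_cascades(subreddit, domain)
    #        return cascades, comments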
    #load processed version of posts
    if file_utils.verify_file(posts_filepath % subreddit):
        posts = file_utils.load_pickle(posts_filepath % subreddit)
        print("Loaded", len(posts), "processed posts from", posts_filepath % subreddit)
    #doesn't exist, build processed posts file
    else:
        #load cascades if not yet loaded
        cascades, comments = load_cascades(subreddit, domain, cascades, comments)
        #assign a numeric id to each post for the node2vec input files,
        #extract the token set, and keep the posting user
        c = count()
        posts = {key: {'user': value['author_h'], 'tokens': extract_tokens(value), 'id': next(c)} for key, value in cascades.items()}
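        #example of one resulting entry (hypothetical ids and tokens):
        #    posts['t3_abc123'] = {'user': 'u_9f2c', 'tokens': {'bitcoin', 'mining'}, 'id': 0}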
        #save this to file
        file_utils.save_pickle(posts, posts_filepath % subreddit)
        print("Saved", len(posts), "processed posts to", posts_filepath % subreddit)
    #build list of users active in this subreddit - a list, not a set, so more
    #active users are more likely to be drawn in the simulation
    if file_utils.verify_file(users_filepath % subreddit):
        print("Active users exist in", users_filepath % subreddit)
    else:
        #load cascades if not yet loaded
        cascades, comments = load_cascades(subreddit, domain, cascades, comments)
        #build active users list: one entry per post and per comment
        active_users = []
        for post_id, post in cascades.items():
            active_users.append(post['author_h'])
        for comment_id, comment in comments.items():
            active_users.append(comment['author_h'])
        file_utils.save_pickle(active_users, users_filepath % subreddit)
        print("Saved", len(active_users), "active users to", users_filepath % subreddit)
    #fit params to all of the cascades, if no params file exists yet
    #(no need to load existing params - they won't be used again here)
    if file_utils.verify_file(params_filepath % subreddit):
        print("Params exist in", params_filepath % subreddit)
    else:
        #load cascades if not yet loaded
        cascades, comments = load_cascades(subreddit, domain, cascades, comments)
        #fit params to all cascades
        all_params = cascade_analysis.fit_all_cascades(domain, cascades, comments, False, subreddit)
        #save to text file now
        with open(params_filepath % subreddit, "w") as f:
            #one line per cascade: numeric post id, then the fitted params
            for post_id, params in all_params.items():
                f.write(str(posts[post_id]['id']) + " " + " ".join(str(p) for p in params) + "\n")
print("Saved text-readable params to", params_filepath % subreddit)
    #check for graph
    if file_utils.verify_file(graph_filepath % subreddit):
        print("Post-graph exists in", graph_filepath % subreddit)
    else:
        #now we have to build the graph... ugh
        build_graph(posts, graph_filepath % subreddit)
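        #build_graph also comes from functions_prebuild_model; it is assumed
        #to write one "<id> <id> <weight>" edgelist line per pair of posts
        #that share tokens (hypothetical sketch; the real edge criteria may
        #differ):
        #
        #    def build_graph(posts, filepath):
        #        items = list(posts.values())
        #        with open(filepath, "w") as f:
        #            for i in range(len(items)):
        #                for j in range(i + 1, len(items)):
        #                    #weight = number of shared tokens between the posts
        #                    shared = items[i]['tokens'] & items[j]['tokens']
        #                    if shared:
        #                        f.write("%d %d %d\n" % (items[i]['id'], items[j]['id'], len(shared)))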