forked from Parent5446/facebook-research
-
Notifications
You must be signed in to change notification settings - Fork 0
/
facebook.py
439 lines (376 loc) · 18.3 KB
/
facebook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
#!/usr/bin/env python
#
# This Facebook SDK is adapted from the official Facebook Graph API Python
# SDK. All original code from that SDK is licensed under the Apache License
# Version 2.0, a copy of which can be found at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# All changes, additions, etc. are dually licensed under the Apache License
# Version 2.0 and the GNU General Public License Version 3.0 as indicated below:
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Python client library for the Facebook Platform. This client library is designed to support the
Graph API. Read more about the Graph API at http://developers.facebook.com/docs/api.
"""
import urllib
import datetime
import random
import operator
# Find a JSON parser
def find_json(logger, parser=True):
    """
    Locate an available JSON implementation and return a converter from it.

    The standard library C{json} module is tried first, then C{simplejson},
    then C{django.utils.simplejson} (for Google AppEngine).

    @param logger: Logger that receives the debug and critical messages
    @param parser: True to get a decoder (C{loads}), False to get an
        encoder (C{dumps})
    @type parser: C{bool}
    @return: A callable that takes the value to decode or encode
    @rtype: C{callable}
    @raise ImportError: If none of the JSON modules can be imported
    """
    logger.debug("Searching for JSON parser...")
    try:
        import json as json_module
    except ImportError:
        try:
            import simplejson as json_module
        except ImportError:
            try:
                # For Google AppEngine
                from django.utils import simplejson as json_module
            except ImportError:
                logger.critical("JSON parser not found.")
                raise

    # The result is returned here rather than from a ``finally`` clause: a
    # ``return`` inside ``finally`` discards the ImportError raised above and
    # then fails on the never-assigned result variable instead.
    if parser:
        return json_module.loads
    return json_module.dumps
class GraphAPI(object):
    """A client for the Facebook Graph API.

    The Graph API is made up of the objects in Facebook (e.g., people, pages,
    events, photos) and the connections between them (e.g., friends,
    photo tags, and event RSVPs).

    You can see a list of all of the objects and connections supported
    by the API at http://developers.facebook.com/docs/reference/api/.

    You can obtain an access token via OAuth. See
    http://developers.facebook.com/docs/authentication/ for details.
    """

    # Total number of attempts request() makes before giving up.
    MAX_ATTEMPTS = 3

    # Text types accepted as a single object ID. ``basestring`` (str and
    # unicode) only exists on Python 2, so fall back to ``str`` without it.
    try:
        _STRING_TYPES = (basestring,)
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, logger, json, access_token=None):
        """
        Store the logger, the JSON decoder, and the access token.

        @param logger: Logger used for debug and error messages
        @param json: Callable that decodes a JSON document into Python objects
        @type json: C{callable}
        @param access_token: The OAuth access token from Facebook
        @type access_token: C{str}
        """
        self.logger = logger
        self._parse_json = json
        self.access_token = access_token

    def get_object(self, ids, **args):
        """
        Fetch the given object, or several objects at once, from the graph.

        @param ids: An ID, or a collection of IDs, to get
        @type ids: C{str} or C{list}/C{set}/C{tuple} of C{str}
        @return: The decoded response for the object(s)
        @rtype: mixed
        @raise TypeError: If C{ids} is neither a string nor a collection
        """
        if isinstance(ids, (list, set, tuple, frozenset)):
            # Several objects are requested from the root path with an
            # ``ids`` query parameter; the collection itself is not a path.
            args["ids"] = ",".join(ids)
            return self.request("", args)
        if not isinstance(ids, self._STRING_TYPES):
            self.logger.error("Invalid object ID type passed to graph API.")
            raise TypeError("Invalid id type {0}.".format(type(ids)))
        return self.request(ids, args)

    def get_connection(self, conn_id, connection_name, **args):
        """
        Fetch a connection of the given object.

        Extra keyword arguments are sent as query parameters; pass C{limit}
        to set how many connections to get.

        @param conn_id: The ID of the parent object
        @type conn_id: C{str}
        @param connection_name: The name of the connection to get
        @type connection_name: C{str}
        @return: The decoded response for the connection
        @rtype: mixed
        """
        return self.request(conn_id + "/" + connection_name, args)

    def request(self, path, args=None):
        """
        Fetch the given path in the Graph API with a GET request.

        C{args} is translated to a query string and the access token, when
        one is set, is added to it. The request is attempted up to
        C{MAX_ATTEMPTS} times when the connection fails or Facebook answers
        with an error object.

        @param path: The path to the object to retrieve from the graph
        @type path: C{str}
        @param args: GET arguments to append to the request
        @type args: C{dict}
        @return: The requested object or connection
        @rtype: mixed
        @raise IOError: If the last attempt fails at the connection level
        @raise Exception: With the error type and message if Facebook still
            reports an error on the last attempt
        """
        # Work on a copy so the caller's dictionary never gains the token.
        args = dict(args) if args else {}
        if self.access_token:
            args["access_token"] = self.access_token

        url = "https://graph.facebook.com/" + path + "?" + urllib.urlencode(args)
        self.logger.debug("Requesting {0} from Facebook.".format(path))
        # NOTE(review): this writes the access token to the debug log.
        self.logger.debug("URL: " + url)

        response = None
        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                handle = urllib.urlopen(url)
                try:
                    response = self._parse_json(handle.read())
                finally:
                    handle.close()
            except IOError:
                # Connection failures count as attempts too; otherwise an
                # unreachable host would be retried forever.
                if attempt == self.MAX_ATTEMPTS:
                    self.logger.error("Failed to retrieve {0} from Facebook.".format(path))
                    raise
                continue
            # Only a dictionary can carry an error object; any other decoded
            # value is handed back as a successful answer.
            if not (isinstance(response, dict) and response.get("error")):
                return response

        error = response["error"]
        self.logger.debug("Error received from Facebook: {0}".format(error["message"]))
        self.logger.error("Failed to retrieve {0} from Facebook.".format(path))
        raise Exception(error["type"], error["message"])
class User(object):
    """
    A class for a Facebook user.

    Stores a list of the user's wall posts, a list of friends (and IDs), and
    the user's likes, and derives training data from them.
    """

    # The keys that should be kept in wall posts.
    import_fields = ('comments', 'created_time', 'from', 'likes', 'message', 'id')

    # Timestamp used as the "last interaction" of two users who have none.
    _NO_INTERACTION = datetime.datetime(1970, 1, 1, 0, 0, 0)

    def __init__(self, graph, logger, user_id, friend_data=1):
        """
        Get all information about the user and process it.

        Get the user object, the user's friends, wall, and likes, remove
        unnecessary properties, and process the wall posts.

        @param graph: A GraphAPI object
        @type graph: L{GraphAPI}
        @param logger: Logger used for progress messages
        @param user_id: ID of the user
        @type user_id: C{str}
        @param friend_data: 0 to ignore friends, 1 to get the friend list
            (plain dictionaries), and 2 to build a L{User} for every friend
        @type friend_data: C{int}
        """
        self.logger = logger
        self.graph = graph
        self.logger.info("Retrieving data about user {0}.".format(user_id))

        # Get the user
        self.me = graph.get_object(user_id)

        if friend_data == 2:
            # Each friend becomes a User of its own (with its wall and
            # likes) but does not load friends in turn.
            self.logger.info("Retrieving friend data from user {0}.".format(user_id))
            friend_list = graph.get_connection(user_id, 'friends', limit=500).get('data', [])
            self.friends = [User(graph, logger, friend['id'], 0) for friend in friend_list]
        elif friend_data == 1:
            self.logger.debug("Getting friend list from user {0}.".format(user_id))
            self.friends = graph.get_connection(user_id, 'friends', limit=500).get('data', [])
        else:
            self.friends = []

        self.logger.debug("Getting wall data from user {0}.".format(user_id))
        feed = graph.get_connection(user_id, 'feed', limit=500).get('data', [])

        # Only the IDs of the likes are kept.
        self.logger.debug("Getting likes and activities from user {0}.".format(user_id))
        self.likes = [like['id'] for like in graph.get_connection(user_id, 'likes').get('data', [])]

        self.logger.debug("Processing wall posts from user {0}.".format(user_id))
        self.wall = [self._prepare_post(post, user_id) for post in feed]
        self.identity = {'name': self.me['name'], 'id': self.me['id']}

    def _prepare_post(self, raw_post, wall_owner_id):
        """Reduce a raw feed entry to C{import_fields} and normalise it."""
        post = dict((key, value) for key, value in raw_post.items()
                    if key in self.import_fields)
        # The last five characters are cut off before parsing; presumably
        # they are the "+0000" UTC offset, which strptime's format lacks.
        post['created_time'] = datetime.datetime.strptime(
            post['created_time'][:-5], "%Y-%m-%dT%H:%M:%S")
        post['to'] = {'name': self.me['name'], 'id': wall_owner_id}
        # Posts without likes or comments get empty containers so that
        # later code can always read post['likes']['data'].
        post['likes'] = post.get('likes', {'data': []})
        post['comments'] = post.get('comments', {'data': []})
        return post

    @staticmethod
    def _unique_posts(posts):
        """
        Return the posts in order with repeated IDs removed.

        @raise Exception: If a post has no C{id}
        """
        seen = set()
        result = []
        for post in posts:
            if not post.get('id', False):
                raise Exception(post)
            if post['id'] not in seen:
                seen.add(post['id'])
                result.append(post)
        return result

    def _resolve_user(self, user_id):
        """
        Return the L{User} for an ID, reusing already loaded objects.

        The user itself and friends loaded as L{User} objects are reused;
        only unknown IDs are fetched from the graph (without friends).
        """
        if user_id == self.identity['id']:
            return self
        for friend in self.friends:
            if isinstance(friend, User) and friend.identity['id'] == user_id:
                return friend
        return User(self.graph, self.logger, user_id, 0)

    def intersect(self, friend):
        """
        Determine which likes the user has in common with a friend.

        @param friend: The friend to compare to
        @type friend: L{User}
        @return: A list of common like IDs
        @rtype: C{list}
        """
        self.logger.debug("Creating likes intersect with user {0} and {1}.".format(
            self.identity['id'], friend.identity['id']))
        return list(set(self.likes) & set(friend.likes))

    def wall_sample(self, n):
        """
        Generate a random sample of up to n posts from the friends' walls.

        The friends must have been loaded as L{User} objects
        (C{friend_data=2}), because their C{wall} attribute is read. When
        fewer than n posts are available, all of them are returned.

        @param n: The number of posts to retrieve
        @type n: C{int}
        @return: A list of posts
        @rtype: C{list}
        """
        self.logger.debug("Generating {0} post wall sample for user {1}.".format(
            n, self.identity['id']))
        posts = []
        for friend in self.friends:
            posts.extend(friend.wall)
        # random.sample raises ValueError when asked for more items than
        # exist, so the sample size is capped at the number of posts.
        return random.sample(posts, min(n, len(posts)))

    def wall_filter(self, time_start=False, time_end=False, author=False, liked_by=False, commented_by=False, intersect=True):
        """
        Filter the wall posts with various filters.

        Filter the wall posts by a time interval, author, who liked the
        post, who commented on the post, or any combination of those. All
        filters are off by default; passing a value turns a filter on. The
        time interval always applies. The three user filters are combined
        according to C{intersect}.

        @param time_start: Only show posts after this time
        @type time_start: datetime.datetime
        @param time_end: Only show posts before this time
        @type time_end: datetime.datetime
        @param author: Only show posts made by this user
        @type author: L{User}
        @param liked_by: Only show posts liked by this user
        @type liked_by: L{User}
        @param commented_by: Only show posts commented on by this user
        @type commented_by: L{User}
        @param intersect: True to keep posts matching all user filters,
            False to keep posts matching any of them (no user filter then
            yields an empty list)
        @type intersect: C{bool}
        @return: List of matching posts
        @rtype: C{list}
        @raise Exception: If a post selected by a user filter has no C{id}
        """
        # Make user-readable log entry representing this filter.
        description = ["Filter wall posts from user {0} for posts with ".format(self.identity['id']),
                       "all:" if intersect else "any:"]
        if time_start:
            description.append(" after {0};".format(time_start))
        if time_end:
            description.append(" before {0};".format(time_end))
        if isinstance(author, User):
            description.append(" posted by {0};".format(author.identity['id']))
        if isinstance(liked_by, User):
            description.append(" liked by {0};".format(liked_by.identity['id']))
        if isinstance(commented_by, User):
            description.append(" commented by {0};".format(commented_by.identity['id']))
        self.logger.debug("".join(description))

        # The time window is applied first and restricts every other filter.
        posts = self.wall
        if isinstance(time_start, datetime.datetime):
            posts = [post for post in posts if post['created_time'] > time_start]
        if isinstance(time_end, datetime.datetime):
            posts = [post for post in posts if post['created_time'] < time_end]

        # One result list per active user filter.
        matches = []
        if isinstance(author, User):
            matches.append(self._unique_posts(
                post for post in posts if post.get('from') == author.identity))
        if isinstance(liked_by, User):
            matches.append(self._unique_posts(
                post for post in posts
                if liked_by.identity in post.get('likes', {}).get('data', [])))
        if isinstance(commented_by, User):
            matches.append(self._unique_posts(
                post for post in posts
                if any(comment.get('from') == commented_by.identity
                       for comment in post.get('comments', {}).get('data', []))))

        if intersect:
            # Keep the posts found by every active filter. Membership is
            # tested on post IDs, which _unique_posts guarantees to exist.
            id_sets = [set(post['id'] for post in found) for found in matches]
            return [post for post in posts
                    if all(post.get('id') in ids for ids in id_sets)]
        return self._unique_posts(post for found in matches for post in found)

    def make_training_data(self):
        """
        Creates a set of training data for the support vector machine.

        Takes a sample of up to 1000 posts from the friends' walls and
        gathers the data of each post: the length of the post, the time
        since the author and user last communicated, the number of author
        posts the user liked or commented on and vice-versa, and the number
        of likes the user and author have in common.

        @return: A list of tuples with an importance indicator and a tuple of data
        @rtype: C{list} of C{tuple} with C{int} and C{tuple}
        """
        return [self.__fitness_internal(post) for post in self.wall_sample(1000)]

    def __fitness_internal(self, post):
        """
        Takes an individual post and gathers the necessary data to
        give to the support vector machine.

        This is an internal and private function. The collected values are
        the number of words in the post, the seconds since the author and
        user last communicated, the number of author posts the user liked
        or commented on in the three days before the post, the same in the
        other direction, and the number of likes they have in common.

        @param post: A processed wall post to evaluate
        @type post: C{dict}
        @return: Whether the post is important (1 or 0) and a tuple of parameters
        @rtype: C{tuple} of C{int} and a C{tuple}
        """
        # If the user is the author, if the user liked it, or if the user
        # commented, it is important.
        important = (
            post['from'] == self.identity
            or self.identity in post['likes'].get('data', [])
            or any(comment.get('from') == self.identity
                   for comment in post['comments'].get('data', []))
        )

        # Get the author and number of words
        author = self._resolve_user(post['from']['id'])
        size = len(post.get('message', '').split())
        posted_at = post['created_time']

        # Find out how long since the two users last interacted.
        if author.identity != self.identity:
            # On each wall, the posts the other person wrote or commented
            # on before this post.
            on_my_wall = self.wall_filter(
                time_end=posted_at, author=author, commented_by=author, intersect=False)
            on_their_wall = author.wall_filter(
                time_end=posted_at, author=self, commented_by=self, intersect=False)
            # The most recent of all those posts is the last interaction;
            # with none at all the epoch stands in for "never".
            stamps = [item['created_time'] for item in on_my_wall + on_their_wall]
            last_interaction = max(stamps) if stamps else self._NO_INTERACTION
            time_diff = posted_at - last_interaction
        else:
            # The author is the user, thus the last interaction time is 0.
            # A timedelta is needed because total_seconds() is called below.
            time_diff = datetime.timedelta(0)

        three_days_ago = posted_at - datetime.timedelta(3)

        def interactions(owner, other):
            # Posts written by `owner` on `owner`'s wall within the window
            # that `other` liked, plus those `other` commented on.
            liked = owner.wall_filter(
                time_start=three_days_ago, time_end=posted_at, author=owner, liked_by=other)
            commented = owner.wall_filter(
                time_start=three_days_ago, time_end=posted_at, author=owner, commented_by=other)
            return len(liked) + len(commented)

        # How many of the author's posts the user liked or commented on,
        # and how many of the user's posts the author liked or commented on.
        interact_me2you = interactions(author, self)
        interact_you2me = interactions(self, author)

        # Check which likes the user and author have in common
        common_likes = len(self.intersect(author))

        return int(important), (size, time_diff.total_seconds(), interact_me2you, interact_you2me, common_likes)