/
fetch.py
79 lines (68 loc) · 2.08 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Twitter timeline archiver: incrementally fetches a user's tweets via the
# (historical) v1 REST API with HTTP Basic auth and caches them as JSON.
import warnings
# httplib2 triggers DeprecationWarnings on the Pythons this targeted; silence them.
warnings.simplefilter('ignore', DeprecationWarning)
import httplib2, urllib, time
try:
    import json
except ImportError:
    # Pre-2.6 Pythons have no stdlib json; fall back to the simplejson package.
    import simplejson as json
# Credentials live in a local, uncommitted config module.
from config import USERNAME, PASSWORD
# JSON endpoint for a user's own timeline (legacy Basic-auth Twitter API).
USER_TIMELINE = "http://twitter.com/statuses/user_timeline.json"
# Local cache of all tweets fetched so far, newest first.
FILE = "my_tweets.json"
# Single shared HTTP client; Basic-auth credentials are scoped to twitter.com.
h = httplib2.Http()
h.add_credentials(USERNAME, PASSWORD, 'twitter.com')
def load_all():
    """Load the cached tweets from FILE.

    Returns:
        list: previously saved tweet dicts, or [] when the cache file
        does not exist yet (i.e. on first run).
    """
    try:
        # 'with' guarantees the handle is closed even if json.load raises
        # (the original leaked the file object returned by open()).
        with open(FILE) as f:
            return json.load(f)
    except IOError:
        # No cache file yet -- start from an empty archive.
        return []
def fetch_and_save_new_tweets():
    """Fetch tweets newer than the newest cached one and merge them into FILE.

    Reads the existing archive, asks the API only for tweets past the highest
    known id (since_id), dedupes, strips the bulky 'user' sub-object, and
    rewrites the cache newest-first. Prints how many new tweets were saved.
    """
    tweets = load_all()
    old_tweet_ids = set(t['id'] for t in tweets)
    # since_id lets the API skip everything we already have; None on first run.
    if old_tweet_ids:
        since_id = max(old_tweet_ids)
    else:
        since_id = None
    new_tweets = fetch_all(since_id)
    num_new_saved = 0
    for tweet in new_tweets:
        if tweet['id'] not in old_tweet_ids:
            tweets.append(tweet)
            num_new_saved += 1
    # Keep the archive newest-first (ids increase over time).
    tweets.sort(key=lambda t: t['id'], reverse=True)
    # Drop the large embedded 'user' object before persisting -- it is
    # redundant for a single-user archive.
    for t in tweets:
        if 'user' in t:
            del t['user']
    # Save back to disk. 'with' ensures the handle is flushed and closed
    # (the original opened for write without ever closing).
    with open(FILE, 'w') as f:
        json.dump(tweets, f, indent=2)
    # print(...) with a single pre-formatted argument works identically on
    # Python 2 (parenthesised expression) and Python 3 (function call).
    print("Saved %s new tweets" % num_new_saved)
def fetch_all(since_id=None):
    """Page through the user timeline and return every available tweet.

    Args:
        since_id: if given, request only tweets with id greater than this.

    Returns:
        list: tweet dicts sorted newest-first, deduplicated by id.

    Raises:
        ValueError: if the API responds with an error payload.
    """
    all_tweets = []
    seen_ids = set()
    page = 0
    args = {'count': 200}  # max page size for this endpoint
    if since_id is not None:
        args['since_id'] = since_id
    while True:
        args['page'] = page
        headers, body = h.request(
            USER_TIMELINE + '?' + urllib.urlencode(args), method='GET'
        )
        page += 1
        tweets = json.loads(body)
        if 'error' in tweets:
            # Py2/py3-compatible raise; the original used the Python-2-only
            # "raise ValueError, tweets" comma form.
            raise ValueError(tweets)
        if not tweets:
            # An empty page means we have paged past the end of the timeline.
            break
        for tweet in tweets:
            # Pages can shift while we fetch, so the same tweet may appear
            # on consecutive pages -- dedupe by id.
            if tweet['id'] not in seen_ids:
                seen_ids.add(tweet['id'])
                all_tweets.append(tweet)
        time.sleep(2)  # stay friendly to the rate limiter
    all_tweets.sort(key=lambda t: t['id'], reverse=True)
    return all_tweets
if __name__ == '__main__':
    # Run the incremental fetch only when executed as a script, not on import.
    fetch_and_save_new_tweets()