In [1]:
import tweepy

In [None]:
# Name of the local OAuth token configuration file
oauth_token_file_name = 'twitterOAuth.cfg'

# Read tokens from the local file. The file consists of four lines, one
# credential per line, in the order: consumer key, consumer secret,
# access token, access token secret.
# The `with` block guarantees the handle is closed, and strip() removes the
# line ending without chopping a real character when the final line has no
# trailing newline (which the previous `[:-1]` slice did).
with open(oauth_token_file_name, 'r') as oauth_token_file:
    consumer_key = oauth_token_file.readline().strip()
    consumer_secret = oauth_token_file.readline().strip()
    access_token = oauth_token_file.readline().strip()
    access_token_secret = oauth_token_file.readline().strip()

# Fail early with a clear message instead of sending empty credentials
if not all((consumer_key, consumer_secret, access_token, access_token_secret)):
    raise ValueError('%s must contain four non-empty lines of OAuth credentials'
                     % oauth_token_file_name)

In [None]:
# Configure and initialize the OAuth handler with the application credentials,
# then attach the user-level access token
from tweepy import OAuthHandler

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Initialize Twitter API access with both rate-limit options enabled
rate_limit_options = dict(
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)
api = tweepy.API(auth, **rate_limit_options)

In [None]:
# Look up the profile of the account we are interested in
target_user_name = 'RealDonaldTrump'
target_user = api.get_user(target_user_name)

# Print a short summary: handle, separator, numeric id and follower count
summary_lines = (
    '@%s' % target_user_name,
    '----------------------',
    'ID : %s' % target_user.id,
    'Followers: %s' % target_user.followers_count,
)
for summary_line in summary_lines:
    print(summary_line)

## Fetch the Latest 3,200 Tweets
----

In [None]:
# Open the tab-separated tweet database file for writing.
# The encoding is pinned to UTF-8 so that tweet text with non-ASCII characters
# can be written regardless of the platform's default encoding, and so the
# file matches what pandas assumes when reading it back.
tweet_database_file_name = 'donald_dump_fresh.dat'
tweet_database_file = open(tweet_database_file_name, 'w', encoding='utf-8')
# Write the header; the column order must match the order used by store_tweet
tweet_database_file.write('id\tsource\tcreated_at\ttext\tfavorite_count\tretweet_count\tcoords\n')

# Define a function which will store the necessary details of the tweets
def store_tweet(file, status, delim = '\t'):
    """Write one tweet as a single delimited row to ``file``.

    The row holds, in order: id, source, created_at, text, favorite_count,
    retweet_count and coordinates. Source, text and coordinates are wrapped
    in double quotes; a missing coordinate is written as ``NA``.

    Parameters
    ----------
    file : writable text file-like object
        Receives the row through a single ``write`` call.
    status : object
        Must expose ``id``, ``source``, ``created_at`` (a datetime), ``text``,
        ``favorite_count``, ``retweet_count`` and ``coordinates``.
    delim : str
        Field separator, tab by default.
    """
    # Keep the tweet on one physical line: flatten every kind of line break
    text = status.text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    # Double embedded quotes so a quote inside the tweet cannot terminate the
    # quoted field early when the file is parsed as CSV
    text = text.replace('"', '""')
    # Is there a coordinate ?
    coord = 'NA' if status.coordinates is None else '"%s"' % status.coordinates
    # NOTE(review): %z renders as an empty string for a naive datetime, which
    # leaves two spaces before the year -- confirm whether created_at is
    # timezone-aware here.
    fields = [
        '%d' % status.id,
        '"%s"' % status.source,
        status.created_at.strftime('%a %b %d %H:%M:%S %z %Y'),
        '"%s"' % text,
        '%d' % status.favorite_count,
        '%d' % status.retweet_count,
        coord,
    ]
    # Format everything first and write once, so a formatting error cannot
    # leave a half-written row in the file
    file.write(delim.join(fields) + '\n')

# Process and write tweets
# NOTE(review): the section heading mentions the latest 3,200 tweets, but only
# 100 are requested here -- confirm the intended limit.
maximum_tweets = 100

# Close the file when done (even if fetching fails part-way) so buffered rows
# are flushed to disk before a later cell reads the file back.
try:
    for status in tweepy.Cursor(api.user_timeline, id = target_user.id).items(maximum_tweets):
        store_tweet(tweet_database_file,status)
finally:
    tweet_database_file.close()

## Recover Older Trump Tweets
----
Because of access restrictions, we cannot get to tweets older than the latest 3,200 via the Twitter API. Thanks to some hard work by other users, we can access a less frequently updated but further-reaching database of Trump tweets.

This dataset is updated irregularly, so there may be several days between updates.

In [3]:
import pandas as pd
old_donald_dump_url = 'https://raw.githubusercontent.com/bpb27/political_twitter_archive/master/realdonaldtrump/realdonaldtrump.csv'

# Retrieve the archive and the list of its tweet ids
old_tweets = pd.read_csv(old_donald_dump_url)
old_tweet_ids = old_tweets['id_str']

# Create a DataFrame for the fresh tweets we already have on disk
tweet_database_file_name = 'donald_dump_fresh.dat'
new_tweets = pd.read_table(tweet_database_file_name, delimiter='\t', header=0)

# Find all tweets older than the last one we viewed.
# Assumes the last row of the fresh file is its oldest tweet -- TODO confirm.
oldest_fresh_tweet_id = new_tweets['id'].iloc[-1]

# Keep only the archive rows with an id below the oldest fresh tweet id.
# The comparison is vectorised, and .copy() gives an independent frame so
# later cells can add columns to it without a SettingWithCopyWarning.
old_tweets_to_keep = old_tweet_ids < oldest_fresh_tweet_id
old_tweets = old_tweets.loc[old_tweets_to_keep].copy()



The problem is that Twitter limits us to 15 API calls every 15 minutes, or on average 1 API call per minute. Considering the very large number of tweets Trump has made since 2009, this means that it is impractical to request each of these tweets individually.

Instead, let's concatenate the fields that we can into the existing dataset.

In [4]:
# Modify the columns of the old tweets to match the fresh tweets
from numpy import nan
n = len(old_tweets)
# Build a new frame instead of mutating the filtered one in place: this avoids
# chained-assignment warnings and keeps the cell safe to re-run.
# 'coords' is filled with NaN for every row, 'id' mirrors 'id_str', and the
# rows are ordered by 'id_str' descending.
old_tweets = (
    old_tweets
    .assign(coords=nan, id=old_tweets['id_str'])
    .sort_values('id_str', ascending=False)
)

In [15]:
# Merge: stack the fresh tweets on top of the archived ones, keeping only the
# columns both sources share.
# pd.concat and .loc replace DataFrame.append and .ix, which are no longer
# available in current pandas. Note that .loc raises KeyError if a chosen
# column is missing from either frame.
column_choice = ['id','created_at','text','retweet_count','favorite_count','source']
all_tweets = pd.concat(
    [new_tweets.loc[:, column_choice], old_tweets.loc[:, column_choice]],
    ignore_index=True,
)

# Ensure integer fields
count_columns = ['retweet_count','favorite_count']
all_tweets[count_columns] = all_tweets[count_columns].apply(pd.to_numeric,downcast='integer')
# Ensure strings when needed
all_tweets['id'] = all_tweets['id'].astype(str)

In [20]:
# Persist the merged tweets to disk as a tab-separated file
import csv
final_database_file = 'donald_dump.dat'
# Header row included, index omitted, every non-numeric field quoted,
# floats written with one decimal place
export_options = dict(
    sep='\t',
    header=True,
    quoting=csv.QUOTE_NONNUMERIC,
    float_format='%.1f',
    index=False,
)
all_tweets.to_csv(final_database_file, **export_options)

In [None]:
# Inspect the Python type of one stored tweet id (row label 3000)
sample_tweet_id = all_tweets['id'][3000]
type(sample_tweet_id)

In [6]:
import numpy as np
# Scratch frame: 5x5 samples drawn from the standard normal distribution
random_values = np.random.randn(5, 5)
a = pd.DataFrame(random_values)

In [26]:
import math
# Format an 18-digit value held as a float with one decimal place; the cell
# displays the resulting string
large_float_value = 123456789123456789.0
'%.1f' % large_float_value

'123456789123456784.0'