In [9]:
# Load all of the necessary libraries
import tweepy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests
import json
import collections
import warnings

# Notebook-wide display configuration.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Have plots use seaborn styling.
sns.set()
# Suppress warnings for the rest of the session. NOTE(review): this filter is
# global -- it hides warnings from every library (pandas, tweepy, ...), not only
# seaborn. Narrow it if deprecation notices from other packages matter.
warnings.filterwarnings('ignore')

In [None]:
# Start by retrieving the image-predictions.tsv file.
# The download is performed and validated *before* the output file is opened, so
# a failed request (connection error, timeout, or non-2xx HTTP status) raises an
# exception instead of leaving an empty file or an error page saved under the
# name image-predictions.tsv.
image_pred_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
image_pred_gathered = requests.get(image_pred_url, timeout=30)
image_pred_gathered.raise_for_status()  # stop here on 4xx/5xx responses

# Write the raw response bytes unchanged.
with open('image-predictions.tsv', 'wb') as image_pred_file:
    image_pred_file.write(image_pred_gathered.content)

In [2]:
# Code for retrieving tweet data for each of the tweet ids contained in the WeRateDogs twitter
# archive. This code can be run, but since the file containing the api keys isn't included in
# the project submission for security reasons, it is presented here for review purposes only.
# A reviewer is free to try this code using his/her own api keys. From this point on, assume
# that this code has been run and the file containing the gathered data is present in the
# current working directory.
import credentials as cred

# Create tweepy api object
auth = tweepy.OAuthHandler(cred.consumer_key, cred.consumer_secret)
auth.set_access_token(cred.access_token, cred.access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Get all of the tweet data for each tweet id, writing one JSON object per line.
tweet_ids = pd.read_csv('twitter-archive-enhanced.csv').tweet_id.astype(str).tolist()

# (tweet_id, error message) pairs for every lookup that failed. Failures are
# recorded rather than silently dropped so the gaps in tweet_json.txt are visible.
failed_tweets = []
with open('tweet_json.txt', 'w') as tweet_data_file:
    for tweet_id in tweet_ids:
        try:
            tweet_data = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepError as error:
            # Best effort: skip this id and keep going, but remember why it failed.
            failed_tweets.append((tweet_id, str(error)))
            print('Could not retrieve data for tweet id: {} ({})'.format(tweet_id, error))
        else:
            tweet_data_file.write(json.dumps(tweet_data._json) + '\n')
            print('Retrieved data for tweet id: {}'.format(tweet_id))

# Report what was actually retrieved instead of unconditionally claiming success.
print('Retrieved data for {} of {} tweet ids.'.format(
    len(tweet_ids) - len(failed_tweets), len(tweet_ids)))
if failed_tweets:
    print('Failed tweet ids: {}'.format(', '.join(failed_id for failed_id, _ in failed_tweets)))

In [None]:
# Build a dataframe from the tweet data gathered into tweet_json.txt, which holds
# one JSON object per line.
wanted_fields = ('id', 'retweet_count', 'favorite_count')

tweet_data = []
with open('tweet_json.txt') as data_file:
    for raw_line in data_file:
        parsed_tweet = json.loads(raw_line)

        # Keep only the fields of interest, in a fixed column order.
        tweet_data.append(
            collections.OrderedDict(
                (field, parsed_tweet[field]) for field in wanted_fields
            )
        )

# One row per tweet, one column per selected field.
tweet_data_extra = pd.DataFrame(tweet_data)