In [1]:
# Import libraries
import json
import time
import threading
from datetime import datetime
from twython import Twython

# Imports the keys from the python file
from twitter_key import api_key, api_secret

# Import some additional libraries that will allow us to plot and interact with the operating system
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Assigns the keys to the variables
APP_KEY = api_key
APP_SECRET = api_secret

# Create a Twython object called twython_setup
# Set this up using your Twitter Keys
# Say we are going to use OAuth 2
# (this object is only used below to obtain the access token)
twython_setup = Twython(APP_KEY, APP_SECRET, oauth_version=2)

# Get an OAuth2 access token and save it as a variable so we can create
# the API client below
OAUTH2_ACCESS_TOKEN = twython_setup.obtain_access_token()

# Create a Twython Object we will use for our access to the API
# (get_tweets_by_location calls my_twython.search)
my_twython = Twython(APP_KEY, access_token=OAUTH2_ACCESS_TOKEN)

In [3]:
# Functions for Twitter scraper defined here

def continuous_scrape(duration, max_failures=5, retry_wait=60):
    """Gets maximum allowed number of tweets from Twitter API multiple times.
    Operates in 15 minute cycles.
    Saves the results of each cycle as a json file in the 'data' folder.

    Takes the total running time in minutes. Optionally takes the number of
    consecutive failed cycles tolerated before giving up (max_failures) and
    the number of seconds to wait before retrying a failed cycle (retry_wait).
    Reads the module-level variables search_term and latlong.
    Returns nothing.
    """
    # Local import: the notebook only imports `listdir` from os at the top.
    import os

    print ("Initiating continuous scraping for " + str(duration) + " minutes.")

    # Without the output folder every cycle would scrape for 15 minutes and
    # then fail to save, so make sure it exists before we start.
    if not os.path.isdir('data'):
        os.makedirs('data')

    failures = 0
    # Each successful cycle consumes 15 minutes of the requested duration.
    while duration > 0:
        try:
            t = get_max_tweets(duration, latlong)
            # Names file by time.
            timestr = time.strftime("%Y%m%d-%H%M%S")
            path = 'data/' + '%stweets.json' %(timestr + "_" + search_term)
            # Report the path that is really written.
            print ("Writing file " + path)
            # We write a new JSON into the target path
            with open(path, 'w') as f:
                f.write(json.dumps(t))
        except Exception as err:
            # Only ordinary errors are caught, so interrupting the kernel
            # still stops the scraper. The failed cycle is retried after a
            # pause instead of being hammered in a tight, silent loop.
            failures += 1
            print ("Cycle failed (%d of %d): %r" % (failures, max_failures, err))
            if failures >= max_failures:
                print ("Giving up after repeated failures.")
                return
            time.sleep(retry_wait)
            continue
        failures = 0
        duration -= 15
    print ("All done.")

    
def _tweet_properties(tweet):
    """Flattens one raw status from the search results into the record we save."""
    geo = tweet['coordinates']
    if geo is not None:
        # Twitter reports points in GeoJSON order: [longitude, latitude]
        lon = geo['coordinates'][0]
        lat = geo['coordinates'][1]
    else:
        lat = None
        lon = None
    return {
        'lat': lat,
        'lon': lon,
        'location': tweet['user']['location'], # location associated with the profile
        'tweet_id': tweet['id'],
        'content': tweet['text'],
        'user': tweet['user']['id'],
        'raw_source': tweet,
        'data_point': 'none',
        'time': tweet['created_at'],
        'language': tweet['lang'],
    }


def get_max_tweets(duration, latlong=0):
    """Gets tweets at a fixed rate of 30 tweets per request, one request a minute
    (450 tweets per 15 minutes).
    Takes the remaining scraping time in minutes and the search location.
    Will run for up to 15 minutes, and never longer than the time requested.
    Returns dictionary of results keyed by tweet id; duplicates are kept once.
    """

    # Collected tweets, keyed by tweet id
    all_tweets = {}

    # Set length of time function will run, with a max time of 15 minutes
    total_time = min(duration, 15) * 60
    print ("Remaining time = " + str(duration) + " minutes.")

    # Adjust the number_of_tweets to determine frequency of API requests
    number_of_tweets = 30

    # Waiting interval is set to produce API request rate of 450 tweets/15 mins
    interval = number_of_tweets*2

    remaining_seconds = total_time
    while remaining_seconds > 0:
        added = 0
        # Hit the Twitter API using our function
        new_tweets = get_tweets_by_location(number_of_tweets, latlong)
        # Parse the resulting JSON, and save the rest of the raw content
        for tweet in new_tweets:
            tid = tweet['id']
            if tid not in all_tweets:
                all_tweets[tid] = _tweet_properties(tweet)
                added += 1
        print("At %d seconds, added %d new tweets, for a total of %d" % ( total_time - remaining_seconds, added, len( all_tweets )))
        # Never sleep past the end of the requested time
        wait = min(interval, remaining_seconds)
        time.sleep(wait)
        remaining_seconds -= wait
    print(str(len(all_tweets)) + ' Tweets retrieved.')
    # We return the final dictionary to work with in Python
    return all_tweets


def get_tweets_by_location(number_of_tweets, latlong=None):
    """Gets specified number of tweets with a given query at a given lat-long.
    Takes the number of tweets to request and an optional [latitude, longitude].
    When latlong is empty (None, 0 or an empty list) the search is not
    restricted to an area.
    Reads the module-level variables search_term, distance and type_of_result.
    Returns results as a list.
    """

    params = {'q': search_term, 'result_type': type_of_result, 'count': number_of_tweets}
    # Only send a geocode when a location was supplied; indexing the default
    # value would raise a TypeError before the request is made
    if latlong:
        params['geocode'] = str(latlong[0])+','+str(latlong[1])+','+ distance

    # Uses the search function to hit the APIs endpoints and look for recent tweets within the area
    results = my_twython.search(**params)
    # Returns only the statuses from the resulting JSON
    return results['statuses']

In [4]:
# Initialize search variables and scraper duration here
# The scraper functions read search_term, latlong, distance and
# type_of_result directly as module-level variables

# Sets search term, can be blank for all tweets
search_term='FBI' 

# Sets the centre of the search as [latitude, longitude]
latlong=[42.3601,-71.0942] # Set to MIT campus, Cambridge, MA, USA

# Sets search radius around latlong
distance='20mi'

# Sets result type: 'recent', 'popular', or 'mixed'
type_of_result='mixed'

# Sets length of time in minutes that Twitter scraper will run
duration = 30





In [5]:
# Runs Twitter scraper
# The scraper sleeps between API requests, so this cell keeps running for
# roughly `duration` minutes; each cycle is saved as a JSON file under data/

continuous_scrape (duration)

Initiating continuous scraping for 30 minutes.
Remaining time = 30 minutes.
At 0 seconds, added 1 new tweets, for a total of 1
At 60 seconds, added 0 new tweets, for a total of 1
At 120 seconds, added 1 new tweets, for a total of 2
At 180 seconds, added 0 new tweets, for a total of 2
At 240 seconds, added 0 new tweets, for a total of 2
At 300 seconds, added 0 new tweets, for a total of 2
At 360 seconds, added 0 new tweets, for a total of 2
At 420 seconds, added 0 new tweets, for a total of 2
At 480 seconds, added 0 new tweets, for a total of 2
At 540 seconds, added 0 new tweets, for a total of 2
At 600 seconds, added 0 new tweets, for a total of 2
At 660 seconds, added 0 new tweets, for a total of 2
At 720 seconds, added 0 new tweets, for a total of 2
At 780 seconds, added 0 new tweets, for a total of 2
At 840 seconds, added 0 new tweets, for a total of 2
2 Tweets retrieved.
Writing file 20170306-101304_FBI_tweets.json
Remaining time = 15 minutes.
At 0 seconds, added 2 new tweets, for 

In [None]:
def json_to_df (folder):
    """Reads json files from given directory into new pandas dataframe.
    Takes name of folder where files are saved.
    Hidden files and json files whose top level is not a dictionary are skipped.
    Returns dataframe with one row per tweet, indexed by the keys of the
    json files; a key found in several files keeps the values read last."""

    # columns for each property saved by the scraper
    columns = ['tweet_id', 'lat', 'lon', 'content','language','location','user','raw_source','data_point','time']

    # gets only the files we have saved
    onlyfiles = [ f for f in listdir(folder) if isfile(join(folder,f)) and not f.startswith('.')]

    # Collect every tweet first and build the dataframe once at the end.
    # Adding rows one at a time is slow, and indexing columns with a
    # dictionary is not supported by current pandas.
    records = {}
    for filename in onlyfiles:
        with open(join(folder, filename)) as json_data:
            tweets = json.load(json_data)
        if not isinstance(tweets, dict):
            continue
        records.update(tweets)

    if not records:
        return pd.DataFrame(columns = columns)

    df = pd.DataFrame(list(records.values()), index = list(records.keys()))
    # Known properties come first; any other saved property is kept after them
    extra = [ c for c in df.columns if c not in columns ]
    return df.reindex(columns = columns + extra)
   
    
def count_attribute (df, attribute):
    """Creates data frame counting any one variable of a dataframe.
    Takes name of dataframe, attribute name.
    Returns new dataframe with a single column 'Count', indexed by the
    values of the attribute and sorted by that index. Rows where the
    attribute is missing are not counted."""

    count = df.groupby(attribute)[attribute].count()
    df_count = count.to_frame()
    df_count.columns = ['Count']
    df_count.index.names = [attribute]
    # sort_index returns a new dataframe, so its result has to be the one returned
    return df_count.sort_index()

    
def make_piechart (df):
    """Makes a pie chart of dataframe data.
    Takes dataframe with column 'Count'; the index values are used as the
    labels of the slices.
    Shows the chart and returns nothing."""

    # Create a list of colors (from iWantHue)
    colors = ["#697dc6","#5faf4c","#7969de","#b5b246",
              "#cc54bc","#4bad89","#d84577","#4eacd7",
              "#cf4e33","#894ea8","#cf8c42","#d58cc9",
              "#737632","#9f4b75","#c36960"]

    # Create a pie chart
    # (Index.get_values() is no longer available in pandas 1.0+; .values gives the same labels)
    plt.pie( df['Count'], labels=df.index.values, shadow=False, colors=colors)

    # Equal axes so the pie is drawn as a circle
    plt.axis('equal')

    # View the plot
    plt.tight_layout()
    plt.show()

    
def make_scatterplot (df):
    """Makes a scatter plot of the tweets that have coordinates.
    Takes dataframe with columns 'lat' and 'lon'.
    Shows the plot and returns nothing."""

    # Keep only the rows of the dataframe passed in that have values for both lat and lon
    df_tweets_with_location = df[df.lon.notnull() & df.lat.notnull()]

    # Use a scatter plot to make a quick visualization of the data points
    # NOTE: WHEN I DID THIS, I ONLY HAD SIX OUT OF ABOUT 100 TWEETS!
    plt.scatter(df_tweets_with_location['lon'],df_tweets_with_location['lat'], s=25)
    plt.show()
    


In [None]:
# Initialize variables for data analysis 
# folder: directory whose saved json files are loaded by json_to_df
# attribute: dataframe column whose values are counted by count_attribute
folder = "data"
attribute = 'location'


In [None]:
# Perform data analysis

# Load every saved json file into a single dataframe
df_tweets = json_to_df (folder)
# TODO: automagically clean data - remove duplicates
df_attribute_count = count_attribute (df_tweets, attribute)

# TODO: df_tweets_clean = manually clean data

make_piechart (df_attribute_count)
# The function has to be called with the dataframe; naming it alone draws nothing
make_scatterplot (df_tweets)

# TODO: export df_tweets_clean data to CSV