In [1]:
from __future__ import unicode_literals
import json 
import requests
import pandas 
import os
import youtube_dl
import numpy as np
import time

In [2]:
# Function to load json file
def loadKeys(key_file:str):
    with open(key_file) as f:
        key_dict = json.load(f)
    return key_dict['api_key'], key_dict['api_secret'], key_dict['bearer_token'], key_dict['token'], key_dict['token_secret']

In [3]:
# Creating a url for the api to search from
def createFirstSearchURL(wordSearch:str):
    query = '(%23{}) -is:retweet has:media -has:videos lang:en  -is:retweet'.format(wordSearch)
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    tweet_fields = "media.fields=type,url"
    expansions = "expansions=attachments.media_keys"
    max_results = "max_results=100"
    url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}".format(
        query, tweet_fields, expansions, max_results
    )
    return url


In [4]:
# Creating a url for the api to search from
def createSearchURL(wordSearch:str, token:str):
    query = '(%23{}) -is:retweet has:media -has:videos lang:en  -is:retweet'.format(wordSearch)
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    nextToken = "next_token={}".format(token)
    tweet_fields = "media.fields=type,url"
    expansions = "expansions=attachments.media_keys"
    max_results = "max_results=100"
    url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
        query, tweet_fields, expansions, max_results, nextToken
    )
    return url

In [5]:
# Creating a url for the api to search from
def getSearchRequestCount(headers:str):
    url = "https://api.twitter.com/1.1/application/rate_limit_status.json?"
    data = connect_to_endpoint(url, headers)
    return int(data['resources']['search']['/search/tweets']['remaining'])


In [6]:
# Creating authorization 
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [7]:
# Connectign to the url api with the token headers
def connect_to_endpoint(url, headers):
    response = requests.request("GET", url, headers=headers)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [8]:
# Creating json file from a response file and filename
def create_json(filename, json_response):
    with open(filename, "w") as write_file:
        json.dump(json_response, write_file, indent=4)

In [9]:
# loading up data of a json file
def load_json(filename):
    file = open(filename)
    data = json.load(file)
    file.close()
    return data

In [10]:
# Downloading gif file with given url 
def downloadGIF(url, filename):
    try:
        ydl_opts = {
            'outtmpl': 'gifs/{}.mp4'.format(filename)
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return True
    except:
        print("Could Not Download File")
        return False

In [11]:
# Class TweetData holds the mediaKey, TweetURL, Text, TweetID for one data object in a json file
class TweetData:
    def __init__(self, jsonData):
        # Media Key Of Tweet
        mediaKey = jsonData['attachments']['media_keys'][0]
        # Splitting the text into the text and url
        tempTextArray = jsonData['text'].rpartition("https://")
        # Creating URL
        tweetURL = tempTextArray[1] + tempTextArray[2]
        # Creating tweet text
        text = tempTextArray[0]
        # Setting mediaKey, URL, and text
        self.mediaKey = mediaKey
        self.tweetURL = tweetURL
        self.text = text
        self.tweetID = jsonData['id']

In [36]:
# Create CSVandGIFS 
def createDFandGIFS(data:dict, finalTweetDF:pandas.DataFrame):
    # Headers for the data
    headers = ["Tweet ID", "Media Key", "Tweet URL", "Tweet Text", "GIF Title"]
    # Initializing 2d array that will contain all of the data
    tweetData = np.array([headers])   
    # Iterate over all of the collected tweets
    for temp in data['data']:
        
        try:# Creating a tweet obejct that holds the data
            tempTweet = TweetData(temp)
            # Check if the current media tweet contains a gif
            if "16_" in tempTweet.mediaKey: 
                # Create File Name
                curGifTitle =  "{}".format(tempTweet.mediaKey)
                # Try to download new file, if an error occurs and cant download gif, the file is not added
                if downloadGIF(tempTweet.tweetURL, curGifTitle):
                    # If Downloading causes no error we add this twitter obeject to our array 
                    # Create a numpy row to add to data
                    curRow = np.array([[
                        tempTweet.tweetID,
                        tempTweet.mediaKey,
                        tempTweet.tweetURL,
                        tempTweet.text,
                        curGifTitle + ".mp4"
                    ]])
                    # Adding indivisual data to whole data
                    tweetData = np.append(tweetData, curRow, axis=0)   
        except:
            print("Error with tweet")
    #Create a pandas file of the 
    tweetDF = pandas.DataFrame(data = tweetData[1:,:],  columns=tweetData[0])
    finalTweetDF = finalTweetDF.append(tweetDF, ignore_index=True)

    return finalTweetDF

In [41]:
# Main Scraper function 
def scraper(query:str , bearer_token:str , csvFileName:str, dataFileName:str):
    # Loading current csv file into a pandas data frame
    finalTwitterDF = pandas.read_csv(csvFileName)
    # Headers for the endpoint connection
    headers = create_headers(bearer_token)
    # Instantiating capCounter
    capCounter = 450000
    # Gets the total ammount of searches allowed with in the current 15 minute time frame. default = 450
    searchLimit = getSearchRequestCount(headers)
    firstSearch = True
    # Looping through twitter api and ensuring rate limits have not been exceeded
    while True:
        # Creating twitter api url on first and subsequent tries
        if firstSearch:
            url = createFirstSearchURL(query)
            firstSearch = False 
        else:
            if "next_token" in data["meta"].keys(): 
                nextToken = data["meta"]["next_token"] 
            else:
                return finalTwitterDF
            url = createSearchURL(query, nextToken) #<------ edit
        # Data collected from twitter api
        data = connect_to_endpoint(url, headers)
        # Saving previously used data as json file 
        create_json(dataFileName, data)
        # Updates df with newly  collected data
        finalTwitterDF = createDFandGIFS(data, finalTwitterDF)
        # Subtract one from the seachres
        searchLimit -= 1
        capCounter -= 100
        finalTwitterDF.to_csv(CSV_FILE_NAME, index=False)
        if capCounter < 100:
            return finalTwitterDF
        if searchLimit ==  0:
            time.sleep(60 * 15) # wait for 15 minutes so that we can reset the rate limits
        


In [42]:
# Creating keys for the twitter api
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# CSV File Name
CSV_FILE_NAME = "twitterData.csv"
DATA_FILE_NAME = "data_file.json"
QUERY = "sad"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

wnload] gifs\16_1370353476258164739.mp4 has already been downloaded
[download] 100% of 185.61KiB
[generic] 2RzqfGq4NC: Requesting header
[generic] 2RzqfGq4NC: Downloading webpage
[generic] 2RzqfGq4NC: Extracting information
[redirect] Following redirect to https://twitter.com/TsunaYosh_Kun/status/1370281423215599617/photo/1
[twitter] 1370281423215599617: Downloading guest token
[twitter] 1370281423215599617: Downloading JSON metadata
[download] gifs\16_1370281413426081792.mp4 has already been downloaded
[download] 100% of 14.88KiB
[generic] rEaljML642: Requesting header
[generic] rEaljML642: Downloading webpage
[generic] rEaljML642: Extracting information
[redirect] Following redirect to https://twitter.com/usniffglue/status/1370253387296608256/photo/1
[twitter] 1370253387296608256: Downloading guest token
[twitter] 1370253387296608256: Downloading JSON metadata
[download] gifs\16_1370253382036901895.mp4 has already been downloaded
[download] 100% of 10.06KiB
[generic] UlxwawhoD2: Requ

In [43]:
df

Unnamed: 0,Tweet ID,Media Key,Tweet URL,Tweet Text,GIF Title
0,1371972970961371139,16_1371972960265908229,https://t.co/n1oU3XlnxI,Now mitch trubisky doesn't suck? Love all thes...,16_1371972960265908229.mp4
1,1371925808734945280,16_1371925802229637129,https://t.co/4t4mBvLE5I,I'll be a happy husband for that deserving wif...,16_1371925802229637129.mp4
2,1371909563490967557,16_1371909557786701826,https://t.co/EqHXb6qeYS,@CamHeyward @Bud_Dupree The @steelers are show...,16_1371909557786701826.mp4
3,1371899134446342145,16_1371899128003842059,https://t.co/ebKdZmlSkz,Telling. The previous GIF is actually from #AB...,16_1371899128003842059.mp4
4,1371874398517391362,16_1371874388929220611,https://t.co/QgdmNH2dxw,Stache season isn’t the same without spring br...,16_1371874388929220611.mp4
...,...,...,...,...,...
179,1369714491152166912,16_1369714485624070146,https://t.co/7g1Ck79RNp,@deadliquorstore @RepLizCheney @All435Reps Rig...,16_1369714485624070146.mp4
180,1369713015524843521,16_1369713009891885056,https://t.co/Ap4jVQs0Tg,@SenMikeLee realizes that even Satan has given...,16_1369713009891885056.mp4
181,1369684684121210887,16_1369684678517678083,https://t.co/Qrkjh4h6KS,"@LouGarza86 We are America, were you can actu...",16_1369684678517678083.mp4
182,1369542455952080899,16_1369542450495299586,https://t.co/KRemLWJdG5,@HE4BARBIE @clydesblair @jansARTPOP @CaseForTh...,16_1369542450495299586.mp4


In [33]:
nextToken = data["meta"]["next_token"] 

NameError: name 'data' is not defined