In [1]:
from __future__ import unicode_literals
import json 
import requests
import pandas 
import os
import youtube_dl
import numpy as np
import time

In [2]:
# Load the Twitter API credentials from a JSON key file
def loadKeys(key_file:str):
    """Read the five Twitter credentials stored in a JSON file.

    Args:
        key_file: Path of a JSON file whose top-level object holds the keys
            ``api_key``, ``api_secret``, ``bearer_token``, ``token`` and
            ``token_secret``.

    Returns:
        The tuple ``(api_key, api_secret, bearer_token, token, token_secret)``.

    Raises:
        KeyError: if one of the five keys is missing from the file.
    """
    with open(key_file) as key_handle:
        credentials = json.load(key_handle)
    # Order matters: callers unpack the result positionally.
    wanted = ('api_key', 'api_secret', 'bearer_token', 'token', 'token_secret')
    return tuple(credentials[name] for name in wanted)

In [3]:
# Creating the url of the first (un-paginated) recent-search request
def createFirstSearchURL(wordSearch:str):
    """Build the recent-search URL for the first page of results.

    The search phrase is combined with the fixed filters
    ``-is:retweet has:media -has:videos lang:en`` and then percent-encoded,
    so characters such as ``&``, ``#`` or ``+`` inside ``wordSearch`` stay
    part of the query instead of being parsed as URL syntax.

    Args:
        wordSearch: Search expression placed in front of the fixed filters.

    Returns:
        The full request URL as a string (100 results per page, with the
        ``type`` and ``url`` of every attached media object expanded).
    """
    from urllib.parse import quote

    query = '{} -is:retweet has:media -has:videos lang:en  -is:retweet'.format(wordSearch)
    # Only media fields are requested here; the media objects are returned
    # through the attachments.media_keys expansion.
    media_fields = "media.fields=type,url"
    expansions = "expansions=attachments.media_keys"
    max_results = "max_results=100"
    url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}".format(
        quote(query, safe=""), media_fields, expansions, max_results
    )
    return url


In [4]:
# Creating the url of a follow-up (paginated) recent-search request
def createSearchURL(wordSearch:str, token:str):
    """Build the recent-search URL for a later page of results.

    Identical to the first-page URL except that a ``next_token`` parameter
    is appended. Both the query and the token are percent-encoded so that
    reserved characters (``&``, ``#``, ``+`` ...) cannot break the query
    string.

    Args:
        wordSearch: Search expression placed in front of the fixed filters.
        token: Pagination token taken from the previous response.

    Returns:
        The full request URL as a string.
    """
    from urllib.parse import quote

    query = '{} -is:retweet has:media -has:videos lang:en  -is:retweet'.format(wordSearch)
    nextToken = "next_token={}".format(quote(str(token), safe=""))
    # Only media fields are requested here; the media objects are returned
    # through the attachments.media_keys expansion.
    media_fields = "media.fields=type,url"
    expansions = "expansions=attachments.media_keys"
    max_results = "max_results=100"
    url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
        quote(query, safe=""), media_fields, expansions, max_results, nextToken
    )
    return url

In [5]:
# Asking the api how many search requests are left in the current window
def getSearchRequestCount(headers:dict) -> int:
    """Return the remaining request count reported for ``/search/tweets``.

    Args:
        headers: Request headers carrying the authorization, as produced by
            ``create_headers``.

    Returns:
        The ``remaining`` value of ``resources.search./search/tweets`` in the
        rate-limit status response, converted to ``int``.

    Raises:
        Exception: propagated from ``connect_to_endpoint`` on a non-200 reply.
        KeyError: if the response does not have the expected structure.
    """
    # NOTE(review): this reads the v1.1 `/search/tweets` bucket, while the URL
    # builders in this notebook call `/2/tweets/search/recent` — confirm that
    # this bucket reflects the limit that actually applies to those calls.
    url = "https://api.twitter.com/1.1/application/rate_limit_status.json?"
    status = connect_to_endpoint(url, headers)
    search_bucket = status['resources']['search']['/search/tweets']
    return int(search_bucket['remaining'])


In [6]:
# Building the authorization header for the api
def create_headers(bearer_token):
    """Return the HTTP header dict that authenticates with a bearer token.

    Args:
        bearer_token: Token inserted after the ``Bearer `` prefix.

    Returns:
        A one-entry dict mapping ``Authorization`` to ``Bearer <token>``.
    """
    return {"Authorization": "Bearer {}".format(bearer_token)}

In [7]:
# Connecting to the api url with the token headers
def connect_to_endpoint(url, headers, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Fully built request URL.
        headers: Request headers (carrying the bearer authorization).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: with ``(status_code, response_text)`` when the status is
            not 200.
        requests.exceptions.Timeout: when the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.request("GET", url, headers=headers, timeout=timeout)
    # Status code is echoed so progress is visible in the cell output.
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [8]:
# Writing a response object to disk as a json file
def create_json(filename, json_response):
    """Serialize ``json_response`` to ``filename`` as JSON with 4-space indent.

    Args:
        filename: Path of the file to create or overwrite.
        json_response: JSON-serializable object to store.
    """
    with open(filename, "w") as out_file:
        json.dump(obj=json_response, fp=out_file, indent=4)

In [9]:
# Loading the data of a json file
def load_json(filename):
    """Read ``filename`` and return its parsed JSON content.

    The file is opened in a ``with`` block so the handle is closed even when
    parsing fails.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with open(filename) as json_file:
        return json.load(json_file)

In [10]:
# Downloading gif file with given url
def downloadGIF(url, filename, failureWait=60 * 5):
    """Download the media behind ``url`` to ``gifs/<filename>.mp4``.

    Args:
        url: Address handed to youtube-dl.
        filename: Base name (without extension) of the output file.
        failureWait: Seconds to pause after a failed download before
            returning; defaults to five minutes.

    Returns:
        True when the download finished without raising, False otherwise.
    """
    try:
        ydl_opts = {
            'outtmpl': 'gifs/{}.mp4'.format(filename)
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return True
    # Catch Exception rather than everything: a bare except also trapped
    # KeyboardInterrupt, so interrupting the kernel mid-download led into the
    # long sleep below instead of stopping the run.
    except Exception:
        print("Could Not Download File")
        time.sleep(failureWait)
        return False

In [11]:
# TweetData bundles the media key, tweet URL, text and tweet id of one tweet object from a json response
class TweetData:
    """Flattened view of a single tweet dictionary.

    Attributes set by the constructor:
        mediaKey: first entry of ``attachments.media_keys``.
        tweetURL: the last ``https://`` link found in the tweet text
            (everything from the final ``https://`` to the end).
        text: the tweet text in front of that last link.
        tweetID: the tweet's ``id`` value.
    """
    def __init__(self, jsonData):
        # Only the first attached media key is kept
        self.mediaKey = jsonData['attachments']['media_keys'][0]
        # Split at the LAST "https://" so earlier links stay in the text
        leading, scheme, remainder = jsonData['text'].rpartition("https://")
        self.tweetURL = scheme + remainder
        self.text = leading
        self.tweetID = jsonData['id']

In [12]:
# Create the DataFrame rows and download the GIFs of one search response
def createDFandGIFS(data:dict, finalTweetDF:pandas.DataFrame):
    """Download every GIF in ``data`` and add one row per success to the frame.

    A tweet counts as a GIF when its first media key contains ``"16_"``.
    Tweets that cannot be parsed, or whose download fails, are skipped.

    Args:
        data: One decoded search response; tweets are read from its
            ``'data'`` list. A response without that entry is treated as an
            empty page instead of raising ``KeyError``.
        finalTweetDF: Frame collected so far; it is not modified.

    Returns:
        A new DataFrame holding the rows of ``finalTweetDF`` followed by the
        new rows, with a fresh 0..n-1 index.
    """
    # Headers for the data
    headers = ["Tweet ID", "Media Key", "Tweet URL", "Tweet Text", "GIF Title"]
    # Rows are collected in a plain list and turned into a frame once,
    # instead of re-allocating a numpy array for every tweet.
    rows = []
    for temp in data.get('data', []):
        try:
            # Creating a tweet object that holds the data
            tempTweet = TweetData(temp)
            # Check if the current media tweet contains a gif
            if "16_" in tempTweet.mediaKey:
                # The media key doubles as the file name
                curGifTitle = "{}".format(tempTweet.mediaKey)
                # Only tweets whose file was downloaded are recorded
                if downloadGIF(tempTweet.tweetURL, curGifTitle):
                    rows.append([
                        tempTweet.tweetID,
                        tempTweet.mediaKey,
                        tempTweet.tweetURL,
                        tempTweet.text,
                        curGifTitle + ".mp4"
                    ])
        # Best effort per tweet, but let KeyboardInterrupt/SystemExit through
        except Exception:
            print("Error with tweet")
    if not rows:
        # Nothing new: still hand back a fresh, re-indexed frame
        return finalTweetDF.reset_index(drop=True)
    tweetDF = pandas.DataFrame(rows, columns=headers)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pandas.concat([finalTweetDF, tweetDF], ignore_index=True)

In [13]:
# Main Scraper function
def scraper(query:str , bearer_token:str , csvFileName:str, dataFileName:str, maxTweets:int=10000):
    """Page through recent-search results, download GIFs and grow the CSV.

    Each loop iteration requests one page of up to 100 tweets, stores the raw
    response in ``dataFileName``, downloads the GIFs of that page and rewrites
    ``csvFileName`` with the accumulated rows.

    Args:
        query: Search expression passed to the URL builders.
        bearer_token: Token used for the authorization header.
        csvFileName: Existing CSV that is loaded first and overwritten after
            every page.
        dataFileName: JSON file overwritten with the latest raw response.
        maxTweets: Upper bound on tweets requested (100 are counted per
            page); defaults to the previously hard-coded 10000.

    Returns:
        The accumulated DataFrame, once the cap is reached or the API reports
        no further page.
    """
    # Loading current csv file into a pandas data frame
    finalTwitterDF = pandas.read_csv(csvFileName)
    # Headers for the endpoint connection
    headers = create_headers(bearer_token)
    capCounter = maxTweets
    # Requests still allowed in the current rate-limit window
    searchLimit = getSearchRequestCount(headers)
    # None means "first page"; afterwards it holds the pagination token
    nextToken = None
    while True:
        # Budget used up (or already empty at start): wait out the 15 minute
        # window and re-read the budget. Previously the counter was never
        # refreshed, so it ran negative and this guard fired at most once.
        if searchLimit <= 0:
            time.sleep(60 * 15)
            searchLimit = getSearchRequestCount(headers)
        # Creating twitter api url on first and subsequent tries
        if nextToken is None:
            url = createFirstSearchURL(query)
        else:
            url = createSearchURL(query, nextToken)
        # Data collected from twitter api
        data = connect_to_endpoint(url, headers)
        # Saving the latest raw response as json file
        create_json(dataFileName, data)
        # A page without a 'data' entry has nothing to process
        if "data" in data:
            finalTwitterDF = createDFandGIFS(data, finalTwitterDF)
        searchLimit -= 1
        capCounter -= 100
        # Persist after every page so an interrupted run keeps its progress
        finalTwitterDF.to_csv(csvFileName, index=False)
        if capCounter < 100:
            return finalTwitterDF
        nextToken = data.get("meta", {}).get("next_token")
        if nextToken is None:
            # No further page available
            return finalTwitterDF
        


In [17]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# One query matching any weekday name
QUERY = "(monday OR tuesday OR wednesday OR thursday OR friday OR saturday OR sunday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

200
200
[generic] 1dEzSBjxi9: Requesting header
[generic] 1dEzSBjxi9: Downloading webpage
[generic] 1dEzSBjxi9: Extracting information
[redirect] Following redirect to https://twitter.com/Mnqwazam/status/1385502189716660224/photo/1
[twitter] 1385502189716660224: Downloading guest token
ERROR: Unable to download JSON metadata: HTTP Error 429: Too Many Requests (caused by <HTTPError 429: 'Too Many Requests'>); please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see  https://yt-dl.org/update  on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.
Could Not Download File


In [14]:
# Load the credentials and print how many search requests the rate-limit status endpoint reports as remaining
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
searchLimit = getSearchRequestCount(create_headers(bearer_token))
print(searchLimit)

200
450


In [16]:
# Subject-based scrape. NOTE: this cell does not load the keys itself; it relies on
# `bear_token` having been defined by a previously executed cell.
CSV_FILE_NAME = "twitterSubjectData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file1.json"
QUERY = "(movie OR vacation OR hangout OR party OR study OR sing OR watch)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

In [17]:
# NOTE(review): `x` is not assigned anywhere in the visible notebook, so this cell
# presumably relied on a cell that has since been removed — confirm where it comes from.
x

'the=407614\ni=401053\nto=350737\na=298042\nand=190859\nis=186112\nin=170708\nit=169459\nyou=168292\nof=167362\ntinyurl.com=160107\nfor=158808\non=134841\nmy=133454\n‘s=127925\nthat=110142\nat=82029\nwith=81213\nme=78441\ndo=74404\nhave=73171\njust=72089\nthis=68154\nbe=65201\nn’t=58643\nso=58605\nare=56793\n‘m=56576\nnot=56418\nwas=54965\nbut=54594\nout=52032\nup=50213\nwhat=48950\nnow=48291\nnew=46258\nfrom=46047\nyour=45972\nlike=45609\ngood=44233\nno=44183\nget=44019\nall=43107\nabout=41218\nwe=39158\nif=37093\ntime=34183\nas=33925\nday=33820\nwill=33571\none=33547\ntwitter=33526\nhow=32999\ncan=32994\nsome=31082\nan=30930\nam=30658\nby=30632\ngoing=29784\nthey=29659\ngo=29536\nor=28701\nhas=28408\nrt=28218\nknow=27901\ntoday=27859\nthere=27713\nlove=27483\nmore=27072\nwork=26924\n==26623\ntoo=25932\ngot=25743\nhe=25222\n2=24748\nback=24616\nthink=24520\ndid=24384\nlol=24305\nwhen=23783\nsee=22579\nreally=22264\nhad=21180\ngreat=21132\noff=20792\nwould=20530\nneed=20450\nhere=19983

In [24]:
# Keep only the word from each "word=count" line of x.
# rsplit on the LAST '=' so that a token which is itself '=' (the line '==26623')
# is kept as '=' instead of collapsing to an empty string.
x = [entry.rsplit('=', 1)[0] for entry in x.split("\n")]

In [17]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(monday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

n extractor.
[generic] K6CEsYGTsw: Downloading webpage
[generic] K6CEsYGTsw: Extracting information
[redirect] Following redirect to https://twitter.com/CassieJAllDay/status/1384938447245303808/photo/1
[twitter] 1384938447245303808: Downloading guest token
ERROR: Unable to download JSON metadata: HTTP Error 429: Too Many Requests (caused by <HTTPError 429: 'Too Many Requests'>); please report this issue on https://yt-dl.org/bug . Make sure you are using the latest version; see  https://yt-dl.org/update  on how to update. Be sure to call youtube-dl with the --verbose flag and include its complete output.
Could Not Download File
[generic] 0bthCdl5z6: Requesting header
[generic] 0bthCdl5z6: Downloading webpage
[generic] 0bthCdl5z6: Extracting information
[redirect] Following redirect to https://twitter.com/rhshornetsathl/status/1384935518492839939
[twitter] 1384935518492839939: Downloading guest token
ERROR: Unable to download JSON metadata: HTTP Error 429: Too Many Requests (caused by <H

In [15]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(tuesday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

 544.64KiB in 00:00                          
[generic] a6KaIHCQmZ: Requesting header
[generic] a6KaIHCQmZ: Downloading webpage
[generic] a6KaIHCQmZ: Extracting information
[redirect] Following redirect to https://twitter.com/Arcticwolff/status/1384821398691065857/photo/1
[twitter] 1384821398691065857: Downloading guest token
[twitter] 1384821398691065857: Downloading JSON metadata
[download] Destination: gifs\16_1384821391791427585.mp4
[download] 100% of 17.63KiB in 00:00                  
[generic] iCh69BgYHZ: Requesting header
[generic] iCh69BgYHZ: Downloading webpage
[generic] iCh69BgYHZ: Extracting information
[redirect] Following redirect to https://twitter.com/CactusRedDragon/status/1384821178385281026/photo/1
[twitter] 1384821178385281026: Downloading guest token
[twitter] 1384821178385281026: Downloading JSON metadata
[download] Destination: gifs\16_1384821171330355201.mp4
[download] 100% of 147.22KiB in 00:00                          
[generic] yzfiJZixmD: Requesting header
[

In [16]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(wednesday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

on extractor.
[generic] xs49bMwCHY: Downloading webpage
[generic] xs49bMwCHY: Extracting information
[redirect] Following redirect to https://twitter.com/GMGIRL63/status/1385019868592939011/photo/1
[twitter] 1385019868592939011: Downloading guest token
[twitter] 1385019868592939011: Downloading JSON metadata
[download] Destination: gifs\16_1385019863354257408.mp4
[download] 100% of 64.99KiB in 00:00                  
[generic] uO5ytlYkx4: Requesting header
[generic] uO5ytlYkx4: Downloading webpage
[generic] uO5ytlYkx4: Extracting information
[redirect] Following redirect to https://twitter.com/Matthew669691/status/1385019685566046210/photo/1
[twitter] 1385019685566046210: Downloading guest token
[twitter] 1385019685566046210: Downloading JSON metadata
[download] Destination: gifs\16_1385019677575946240.mp4
[download] 100% of 27.73KiB in 00:00                  
[generic] Hb5IreuI81: Requesting header
[generic] Hb5IreuI81: Downloading webpage
[generic] Hb5IreuI81: Extracting information


In [17]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(thursday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

load] 100% of 54.85KiB in 00:00                   
[generic] sR7JroMCaa: Requesting header
[generic] sR7JroMCaa: Downloading webpage
[generic] sR7JroMCaa: Extracting information
[redirect] Following redirect to https://twitter.com/13e_BTS/status/1385302847344898048/photo/1
[twitter] 1385302847344898048: Downloading guest token
[twitter] 1385302847344898048: Downloading JSON metadata
[download] Destination: gifs\16_1385302840852180993.mp4
[download] 100% of 53.19KiB in 00:00                  
[generic] XLBrAygAZn: Requesting header
[generic] XLBrAygAZn: Downloading webpage
[generic] XLBrAygAZn: Extracting information
[redirect] Following redirect to https://twitter.com/sunrinder13/status/1385302773927981057/photo/1
[twitter] 1385302773927981057: Downloading guest token
[twitter] 1385302773927981057: Downloading JSON metadata
[download] Destination: gifs\16_1385302765812043778.mp4
[download] 100% of 15.81KiB in 00:00                   
[generic] wrc2PfVCkH: Requesting header
[generic] wr

In [None]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(friday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

In [14]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(saturday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

a
[download] Destination: gifs\16_1385390514028044289.mp4
[download] 100% of 62.91KiB in 00:00                  
[generic] JAeC7Swpq7: Requesting header
[generic] JAeC7Swpq7: Downloading webpage
[generic] JAeC7Swpq7: Extracting information
[redirect] Following redirect to https://twitter.com/Biostockbeginn1/status/1385389745786740749/photo/1
[twitter] 1385389745786740749: Downloading guest token
[twitter] 1385389745786740749: Downloading JSON metadata
[download] gifs\16_1385389739398909952.mp4 has already been downloaded
[download] 100% of 37.47KiB
[generic] f2zrRV0g8L: Requesting header
[generic] f2zrRV0g8L: Downloading webpage
[generic] f2zrRV0g8L: Extracting information
[redirect] Following redirect to https://twitter.com/StopTCensorship/status/1385389434405855234/photo/1
[twitter] 1385389434405855234: Downloading guest token
[twitter] 1385389434405855234: Downloading JSON metadata
[download] Destination: gifs\16_1385389421210517505.mp4
[download] 100% of 23.46KiB in 00:00          

In [15]:
# Load the five Twitter credentials from keys.json (only the bearer token is used below)
api_key, api_secret, bear_token, token, token_secret = loadKeys("keys.json")
bearer_token = bear_token
# Output files: scraper() loads the CSV with pandas.read_csv, so it must already exist
CSV_FILE_NAME = "twitterWeekdayData.csv" # To start a fresh dataset, delete every row of this CSV except the header line
DATA_FILE_NAME = "data_file.json"
# Single-weekday query; results are added to the shared weekday CSV
QUERY = "(sunday)"
df = scraper(QUERY, bear_token, CSV_FILE_NAME, DATA_FILE_NAME)

to/1
[twitter] 1385459419274641408: Downloading guest token
[twitter] 1385459419274641408: Downloading JSON metadata
[download] gifs\16_1385459413910114305.mp4 has already been downloaded
[download] 100% of 62.97KiB
[generic] oqHZontYtU: Requesting header
[generic] oqHZontYtU: Downloading webpage
[generic] oqHZontYtU: Extracting information
[redirect] Following redirect to https://twitter.com/highmotorracing/status/1385459323090898944/photo/1
[twitter] 1385459323090898944: Downloading guest token
[twitter] 1385459323090898944: Downloading JSON metadata
[download] gifs\16_1385459315390226436.mp4 has already been downloaded
[download] 100% of 131.20KiB
[generic] tyv8x1b47S: Requesting header
[generic] tyv8x1b47S: Downloading webpage
[generic] tyv8x1b47S: Extracting information
[redirect] Following redirect to https://twitter.com/rainbowscaandy/status/1385457704471171073/photo/1
[twitter] 1385457704471171073: Downloading guest token
[twitter] 1385457704471171073: Downloading JSON metadata