In [2]:
# Importing packages/libraries
import datetime
import tweepy
import pandas as pd

### Testing Twitter Search API using Tweepy


In [45]:
# Setting the bearer_token for authentification

# Essential Access : 
#bearer_token = 'AAAAAAAAAAAAAAAAAAAAAISGbQEAAAAAiydrn91VCyHyuB1ASiIQofGIGV4%3D8nlmDY77NJPRZQ0uOiGHOllLazV3BXpRfjpunqEdWiXUughu5P'

# Research Access :
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAGjtbgEAAAAAyx9oZIai1hXZ9OyDhUUFMZQivpc%3DUunewUXR9hw3nyKQhjqdmfg7zSAoa1nPv6WKLLSPB7OwKwYBP3'

# Initializing the client to request the Twitter API
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

query = 'from:Luisamneubauer'
#extinctionr #extinctionrebellion

#expansions = ['attachments.poll_ids', 'attachments.media_keys', 'author_id', 'entities.mentions.username', 'geo.place_id', 'in_reply_to_user_id', 'referenced_tweets.id', 'referenced_tweets.id.author_id']
expansions = ['attachments.media_keys','author_id','geo.place_id','referenced_tweets.id','referenced_tweets.id.author_id']

max_results = 500

media_fields = ["duration_ms", "height", "media_key", "preview_image_url", "type", "url", "width", "public_metrics", "non_public_metrics", "organic_metrics", "promoted_metrics", "alt_text"]

#next_token = ''#(str | None) – This parameter is used to get the next ‘page’ of results. The value used with the parameter is pulled directly from the response provided by the API, and should not be modified. You can learn more by visiting our page on pagination.

place_fields = ['country', 'country_code', 'full_name', 'geo', 'id', 'name', 'place_type']

#poll_fields #(list[str] | str | None) – poll_fields

#since_id #(int | str | None) – Returns results with a Tweet ID greater than (for example, more recent than) the specified ID. The ID specified is exclusive and responses will not include it. If included with the same request as a start_time parameter, only since_id will be used.

sort_order = 'recency'

start_time = '2017-01-01T00:00:00Z' #(datetime.datetime | str | None) – YYYY-MM-DDTHH:mm:ssZ (ISO 8601/RFC 3339). The oldest UTC timestamp from which the Tweets will be provided. Timestamp is in second granularity and is inclusive (for example, 12:00:01 includes the first second of the minute). By default, a request will return Tweets from up to 30 days ago if you do not include this parameter.
#end_time = '2019-07-15T23:00:00Z'

#tweet_fields = ["attachments", "author_id", "context_annotations", "conversation_id", "created_at", "entities", "geo", "id", "in_reply_to_user_id", "lang", "non_public_metrics", "public_metrics", "organic_metrics", "promoted_metrics", "possibly_sensitive", "referenced_tweets", "reply_settings", "source", "text", "withheld"]
tweet_fields = ["attachments","author_id","conversation_id","created_at","referenced_tweets","geo","public_metrics"]

#until_id #(int | str | None) – Returns results with a Tweet ID less than (that is, older than) the specified ID. Used with since_id. The ID specified is exclusive and responses will not include it.

# user_fields can be extracted from the .includes attribute of the tweepy response
user_fields = ["created_at", "description", "location", "name", "pinned_tweet_id", "profile_image_url", "protected", "public_metrics", "url", "username", "verified", "withheld"]

In [4]:
def search_recent_tweets():
    response = client.search_recent_tweets(
                               query=query
                              ,expansions=expansions
                              ,max_results=100
                              ,media_fields=media_fields
                              ,place_fields=place_fields
                              ,sort_order=sort_order
                              #,start_time=start_time
                              ,tweet_fields=tweet_fields
                              ,user_fields=user_fields
                            )
    return response

In [5]:
def search_recent_tweets_with_next_token(next_token):
    response = client.search_recent_tweets(
                               query=query
                              ,expansions=expansions
                              ,max_results=100
                              ,media_fields=media_fields
                              ,place_fields=place_fields
                              ,sort_order=sort_order
                              #,start_time=start_time
                              ,tweet_fields=tweet_fields
                              ,user_fields=user_fields
                              ,next_token=next_token
                            )
    return response

In [6]:
def search_all_tweets(query):
    response = client.search_all_tweets(
                               query=query
                              ,expansions=expansions
                              ,max_results=max_results
                              ,media_fields=media_fields
                              ,place_fields=place_fields
                              ,sort_order=sort_order
                              ,start_time=start_time
                              #,end_time=end_time
                              ,tweet_fields=tweet_fields
                              ,user_fields=user_fields
                            )
    return response

In [7]:
def search_all_tweets_with_next_token(query, next_token):
    response = client.search_all_tweets(
                               query=query
                              ,expansions=expansions
                              ,max_results=max_results
                              ,media_fields=media_fields
                              ,place_fields=place_fields
                              ,sort_order=sort_order
                              ,start_time=start_time
                              #,end_time=end_time
                              ,tweet_fields=tweet_fields
                              ,user_fields=user_fields
                              ,next_token=next_token
                            )
    return response

In [48]:
def fetch_recent_data(iteration_count, next_token):

    """ Fetches iteration_count * 100 Tweets from Twitter API using the set query params, including the initial next_token
    
    Parameters:
    ------------
    * iteration_count     : Integer indicating the max number of requests 
    * next_token          : String containing the next token for pagination
    
    """
    
    # Initializing the return parameters
    tweets = []
    users = []
    ref_tweets = []
    media = []
    places = []
    
    for i in range(0, iteration_count):

        if(len(next_token) == 0):
            response = search_recent_tweets()
        else:
            response = search_recent_tweets_with_next_token(next_token)

        # Parsing data into DataFrames for return 
        tweets.append(response.data)
        users.append(response.includes['users'])
        ref_tweets.append(response.includes['tweets'])
        if "media" in response:
            media.append(response.includes['media'])
        
        if('places' in response.includes.keys()):
            places.append(response.includes['places'])

        try:
            next_token = response.meta['next_token']
        except:
            print('No more Tweets to fetch')
            break


    # Logging the last next_token for pagination into console --> needed for later requesting of the next tweets
    if('next_token' in response.meta):
        print(response.meta['next_token'])
        
    return tweets, users, ref_tweets, media, places

In [55]:
def fetch_all_data(query, iteration_count, next_token):

    """ Fetches iteration_count * 100 Tweets from Twitter API using the set query params, including the initial next_token
    
    Parameters:
    ------------
    * iteration_count     : Integer indicating the max number of requests 
    * next_token          : String containing the next token for pagination
    
    """
    
    # Initializing the return parameters
    tweets = []
    users = []
    ref_tweets = []
    media = []
    places = []
    
    for i in range(0, iteration_count):

        if(len(next_token) == 0):
            response = search_all_tweets(query)
        else:
            response = search_all_tweets_with_next_token(query, next_token)

        tweets.append(response.data)
        users.append(response.includes['users'])
        ref_tweets.append(response.includes['tweets'])
        if "media" in response.includes.keys():
            media.append(response.includes['media'])
        
        if('places' in response.includes.keys()):
            places.append(response.includes['places'])
            
        try:
            next_token = response.meta['next_token']
        except:
            print('No more Tweets to fetch')
            break


    # Logging the last next_token for pagination into console --> needed for later requesting of the next tweets
    if('next_token' in response.meta):
        print(response.meta['next_token'])
        
    return tweets, users, ref_tweets, media, places

In [56]:
def unpack_dict(unpack_dict):
    unpacked = [v for k, v in unpack_dict.items()]
    return unpacked
def unpack_attachment(attachment_dict):
    return "|".join(attachment_dict["media_keys"]) if "media_keys" in attachment_dict.keys() else np.nan
def unpack_geo(geo_dict):
    return geo_dict["type"], "|".join([str(x) for x in geo_dict["bbox"]])
def unpack_geo_2(geo_dict):
    return geo_dict["place_id"]

In [70]:
def query_tweets(query):    
    tweets, users, ref_tweets, media, places = fetch_all_data(query, 300, '')

    user_df = pd.DataFrame(data=[user for sublist in users for user in sublist])
    tweet_df = pd.DataFrame(data=[tweet for sublist in tweets for tweet in sublist])
    ref_tweet_df = pd.DataFrame(data=[tweet for sublist in ref_tweets for tweet in sublist])
    media_df = pd.DataFrame(data=[item for sublist in media for item in sublist])
    place_df = pd.DataFrame(data=[place for sublist in places for place in sublist])
    
    user_df[["followers_count", "following_count", "tweet_count", "listed_count"]] = user_df.apply(lambda x: unpack_dict(x["public_metrics"]), axis=1, result_type="expand")
    del user_df["public_metrics"]

    tweet_df[["retweet_count", "reply_count", "like_count", "quote_count"]] = tweet_df.apply(lambda x: unpack_dict(x["public_metrics"]), axis=1, result_type="expand")
    tweet_df["media_keys"] = tweet_df.loc[tweet_df["attachments"].notna()].apply(lambda x: unpack_attachment(x["attachments"]), axis=1, result_type="expand")
    del tweet_df["public_metrics"]
    del tweet_df["attachments"]
    del tweet_df["referenced_tweets"]

    ref_tweet_df[['retweet_count', 'reply_count', 'like_count', 'quote_count']] = ref_tweet_df.apply(lambda x: unpack_dict(x["public_metrics"]), axis=1, result_type="expand")
    ref_tweet_df["media_keys"] = ref_tweet_df.loc[ref_tweet_df["attachments"].notna()].apply(lambda x: unpack_attachment(x["attachments"]), axis=1, result_type="expand")
    ref_tweet_df['place_id'] = ref_tweet_df.loc[ref_tweet_df["geo"].notna()].apply(lambda x: unpack_geo_2(x["geo"]), axis=1, result_type="expand")
    del ref_tweet_df["public_metrics"]
    del ref_tweet_df["attachments"]
    del ref_tweet_df["geo"]
    del ref_tweet_df["referenced_tweets"]

    if "geo" in place_df.columns:
        place_df[["type", "coordinates"]] = place_df.loc[place_df["geo"].notna()].apply(lambda x: unpack_geo(x["geo"]), axis=1, result_type="expand")
        del place_df["geo"]
        
    tweet_df.text = tweet_df.text.apply(lambda x: re.sub(r"[^A-Za-z0-9\w\s:@,]", "", x))
    return user_df, tweet_df, ref_tweet_df, place_df, media_df

In [63]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
import tweepy
import dotenv
import os
import yaml
import datetime
import re
from pprint import pprint
dotenv.load_dotenv()

username = os.environ["DB_USERNAME"]
password = os.environ["DB_PASSWORD"]
host = os.environ["DB_HOST"]
port = os.environ["DB_PORT"]
name = os.environ["DB_NAME"]
engine = sa.create_engine("mssql+pymssql://{}:{}@{}/{}".format(username, password, host, name))

True

In [99]:
all_user = all_tweets = all_ref_tweets = all_places = all_media = pd.DataFrame()
for i in ['from:Fridays4future', 'from:FridayForFuture', 'from:GretaThunberg', 'from:Luisamneubauer', 'EndCoal',
    'EndFossilFuels', 
    'PeopleNotProfit', 'NoMoreEmptyPromises', 'UprootTheSystem', 'FridaysForFuture', 
    'ClimateAction', 'ClimateJustice', 'ClimateEmergency', 'ClimateStrike', 'SaveThePlanet', 'climatescam', 
    'climatechangehoax', 'fakeclimate', 'climatehoax', 'globalwarmingisahoax', 'ClimateCrisis', 
    'ClimateChange', 'Climate', 'GlobalWarming']:
    print(i)
    user_df, tweet_df, ref_tweet_df, place_df, media_df = query_tweets(i)
    all_user = pd.concat([all_user, user_df])
    all_tweets = pd.concat([all_user, tweet_df])
    all_ref_tweets = pd.concat([all_user, ref_tweet_df])
    all_places = pd.concat([all_user, place_df])
    all_media = pd.concat([all_user, media_df])
    
    #user_df.created_at = user_df.created_at.astype(str)
    #tweet_df.created_at = tweet_df.created_at.astype(str)
    #ref_tweet_df.created_at = ref_tweet_df.created_at.astype(str)
    #for k, v in {"users": user_df, "tweets": tweet_df, "ref_tweets": ref_tweet_df, "places": place_df, "media": media_df}.items():
    #    v.to_sql(k, con=engine, index=False, if_exists="append")

from:Fridays4future
No more Tweets to fetch
from:FridayForFuture
No more Tweets to fetch
from:GretaThunberg
No more Tweets to fetch
from:Luisamneubauer
No more Tweets to fetch
EndCoal


TwitterServerError: 503 Service Unavailable

## Export cleaned Data

In [None]:
# Export to csv files
user_df.to_csv('./data/User_Data_' + query.replace('from:','') + '_' + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv', index=False)
tweet_df.to_csv('./data/Tweet_Data_' + query.replace('from:','') + '_' + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv', index=False)
ref_tweet_df.to_csv('./data/Ref_Tweet_Data_' + query.replace('from:','') + '_' + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv', index=False)
media_df.to_csv('./data/Media_Data_' + query.replace('from:','') + '_' + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv', index=False)
place_df.to_csv('./data/Place_Data_' + query.replace('from:','') + '_' + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.csv', index=False)