# AIT724 Project - pull tweets
# Melissa Cirtain

#### Deliverables:  10-20k tweets

In [28]:
import os
import requests
import json
import pandas as pd
from pandas import json_normalize
from datetime import datetime

# Instructions to run:
1. enter your twitter token value in the cell below
1. enter your topic value (what you want to search for) in the cell below
  - Pull 5000 **"maga"** and 5000 **"statehood"**.  
1. run this noteboook until you have 5000 tweets on your topic.  You may need to take a 15-minute break between runs to not cap out.  References below explain limitations.
1. upload your data files to https://github.com/sareek/NLP_on_Social_Media_Data.  

In [32]:
my_token = "PUT YOUR TOKEN HERE"
my_topic = "maga"



### Helper functions and setup

In [33]:
# Set the token, which you specified above
os.environ['TOKEN'] = my_token 


def auth():
    '''return the Bearer Token (which should be set in the env)'''
    return os.getenv('TOKEN')


def create_headers(bearer_token):
    '''return the authorization header with bearer token'''
    headers = {"Authorization": f"Bearer {bearer_token}"}
    return headers


def create_url(keyword, max_results=10):  
    '''Generate the URL with query baked in and return search results
    There is an API limit of 200(?) tweets, and time limits as well. 
    20 requests per 15 min.
    '''
    search_url = 'https://api.twitter.com/2/tweets/search/recent' # use search recent not search all (no access)
    
    # define query parameters:
    query_params = {
        'query': keyword,
        'max_results': max_results,
        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,' \
                        'created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
        'place.fields':'full_name,id,country,country_code,geo,name,place_type',
        'next_token': {}
    }
    
    return (search_url, query_params)


def connect_to_endpoint(url, headers, params, next_token=None):
    '''given search arguments, connect to API and return response'''
    params['next_token'] = next_token  # params obbject received fromo create_url()
    response = requests.request('GET', url, headers=headers, params=params)
    print(f'Endpiont Response Code: {response.status_code}')
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
        
    return response.json()  # return response as JSON


# Revise connect_to_endpoint so I can iterate over pages
def connect_to_endpoint_2(url, headers, params):
    '''given search arguments, connect to API and return response'''
    #params['next_token'] = next_token  # params object received fromo create_url()
    response = requests.request('GET', url, headers=headers, params=params)
    print(f'Endpiont Response Code: {response.status_code}')
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
        
    return response.json()  # return response as JSON



def collect_tweets_to_df(keyword:str, tweets_count=100):  # TODO: max_results here should be variable, but call to create_url() maxes at 100.
    '''Return a pandas df containing results of a query based on
    provided keyword and a parameters list.  
    '''
    max_results_per_free_request = 100
    headers = create_headers(auth())
    search_url, query_params = create_url(keyword, max_results_per_free_request)
    my_data = []
    next_token = None  # to keep getting more tweets if available
    
    while len(my_data) < tweets_count:
        # keep getting responses until we have needed number of tweets
        my_response = connect_to_endpoint_2(search_url, headers, query_params)
        
        # collect each set of responses until we have them all or meet our requirement
        my_data.extend(my_response['data'])  
        
        # get next page of tweets, if available
        if 'next_token' in my_response['meta'].keys():
            print(f'next_token found: {my_response["meta"]["next_token"]}')
            #return None, my_response
            
            # get next page of tweets
            query_params['next_token'] = my_response['meta']['next_token']

        
    # convert to dataframe keeping the interesting fields
    fields_to_keep = ['text','created_at', 'id', 'conversation_id', 'referenced_tweets','lang',
                      'author_id','public_metrics.retweet_count','public_metrics.reply_count',
                      'public_metrics.like_count','public_metrics.quote_count',
                      'public_metrics.impression_count'
                     ]
    
    #df = json_normalize(my_response['data'])
    df = json_normalize(my_data)
    
    # Keeping all the data for now.  We can reduce later if needed.
    #df = df[fields_to_keep]  # warning - will break if twitter changes their API/fieldnames
    return df, my_response



# Pull the tweets

Just run the following cell and you'll pull some tweets.  You should take a break between runs.  I will add timestamps to the saved files so you don't accidentally overwrite your output.  Run until you have 5000 tweets for both topics, then upload the files to github.

In [39]:
# try it out
#keyword = "memphis lang:en -is:retweet"
#my_df, my_response = collect_tweets_to_df(keyword)

#my_keyword = f'{my_topic} lang:en'  # by default keyword search is case-insensitive; retweets appear truncated

# Test time bounds:
'''Windowing over time ranges using start_time and end_time in query parameters. Type should be ISO 8601 date, i.e. YYYY-MM-DDTHH:mm:ssZ. This sets the newest, most recent UTC timestap to which the Tweets will be provided.'''
#YYYY-MM-DDTHH:mm:ssZ
start = datetime(2023, 3, 6, 0, 0, 0)
end = datetime(2023, 3, 6, 23, 59, 59)
#my_keyword = f'{my_topic} lang:en start_time:{start} end_time:{end}'
my_keyword = f'{my_topic} lang:en start_time:2023-03-06T01:00:00Z end_time:2023-03-06T23:59:59Z'

my_df, my_response = collect_tweets_to_df(my_keyword)
print(f'\n\n\n***** PULLED {my_df.shape[0]} tweets *****\n\n')
display(my_df.head())

# write df to tsv
timestamp = str(datetime.now()).replace(' ', '_').replace(':', '.')
file_topic = my_topic.replace(' ', '_')
output_path = f'{file_topic}_project_tweets.{timestamp}.tsv'

my_df.to_csv(output_path, sep='\t')
print(f'Wrote data to {output_path}')

# write also to json (just in case)
output_path = f'{file_topic}_project_tweets.{timestamp}.json'

my_df.to_json(output_path)
print(f'Wrote json data to {output_path}')


Endpiont Response Code: 400


Exception: (400, '{"errors":[{"parameters":{"query":["maga lang:en start_time:2023-03-06T01:00:00Z end_time:2023-03-06T23:59:59Z"]},"message":"There were errors processing your request: missing EOF at \':\' (at position 41), no viable alternative at input \':\' (at position 38)"}],"title":"Invalid Request","detail":"One or more parameters to your request was invalid.","type":"https://api.twitter.com/2/problems/invalid-request"}')

In [37]:
start

datetime.datetime(2023, 3, 6, 0, 0)

In [23]:
# DELETEME, just testing:

# # write df to tsv
# timestamp = str(datetime.now()).replace(' ', '_').replace(':', '.')
# file_topic = my_topic.replace(' ', '_')
# output_path = f'{file_topic}_project_tweets.{timestamp}.csv'

# my_df.to_csv(output_path, sep='|')
# print(f'Wrote data to {output_path}')

# Write to json?
timestamp = str(datetime.now()).replace(' ', '_').replace(':', '.')
file_topic = my_topic.replace(' ', '_')
output_path = f'{file_topic}_project_tweets.{timestamp}.json'

my_df.to_json(output_path)
print(f'Wrote data to {output_path}')


Wrote data to maga_project_tweets.2023-03-08_21.15.55.808461.json


In [26]:
new_df = pd.read_json('maga_project_tweets.2023-03-08_21.15.55.808461.json')
new_df.head()

other_df = pd.read_csv('maga_project_tweets.2023-03-08_20.58.51.129896.tsv', delimiter='\t')
other_df.head()

Unnamed: 0.1,Unnamed: 0,reply_settings,id,lang,text,conversation_id,author_id,referenced_tweets,edit_history_tweet_ids,created_at,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,public_metrics.impression_count,in_reply_to_user_id,geo.place_id
0,0,everyone,1633648438993883136,en,RT @GabiNga1: @keith0sta @827js @1mir_r @45tf5...,1633648438993883136,1298715879673274370,"[{'type': 'retweeted', 'id': '1633505762881282...",['1633648438993883136'],2023-03-09T01:58:34.000Z,106,0,0,0,0,,
1,1,everyone,1633648437798522881,en,RT @OccupyDemocrats: BREAKING: MAGA Gov. Sarah...,1633648437798522881,615777223,"[{'type': 'retweeted', 'id': '1633546143337680...",['1633648437798522881'],2023-03-09T01:58:33.000Z,2300,0,0,0,0,,
2,2,everyone,1633648433977335808,en,RT @OccupyDemocrats: BREAKING: MAGA Gov. Sarah...,1633648433977335808,1393026118303854594,"[{'type': 'retweeted', 'id': '1633546143337680...",['1633648433977335808'],2023-03-09T01:58:33.000Z,2300,0,0,0,0,,
3,3,everyone,1633648433507573763,en,@ybarrap We demand justice when cops shooting ...,1633564967164100615,739248656,"[{'type': 'replied_to', 'id': '163356496716410...",['1633648433507573763'],2023-03-09T01:58:32.000Z,0,0,0,0,0,17180761.0,
4,4,everyone,1633648424569679872,en,RT @TheRickWilson: They're captives of in the ...,1633648424569679872,899686918719381506,"[{'type': 'retweeted', 'id': '1633530595572842...",['1633648424569679872'],2023-03-09T01:58:30.000Z,151,0,0,0,0,,


# References and Notes

#### References Used
- [Towards Data Science Twitter API v2 Post](https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a)
- [Twitter developer Recent API docs](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent#tab1)
- [Building Queries API 2 docs](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query)
- [Twitter Dev API - enriched tweet objects](https://developer.twitter.com/en/docs/twitter-api/enterprise/data-dictionary/native-enriched-objects/tweet)
- [Recent search params](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent)
- [Stack Overflow, avoiding truncation](https://stackoverflow.com/questions/38717816/twitter-api-text-field-value-is-truncated)

#### URL/Query builder

- [Building Queries API 2 docs](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query)
- [Twitter Dev API](https://developer.twitter.com/en/docs/twitter-api/enterprise/data-dictionary/native-enriched-objects/tweet)
- [Recent search params](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent)
- avoid truncation: https://stackoverflow.com/questions/38717816/twitter-api-text-field-value-is-truncated
- `(400, '{"errors":[{"parameters":{"tweet.truncated":["false"]},"message":"The query parameter [tweet.truncated] is not one of [query,start_time,end_time,since_id,until_id,max_results,next_token,pagination_token,sort_order,expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields]"}],"title":"Invalid Request","detail":"One or more parameters to your request was invalid.","type":"https://api.twitter.com/2/problems/invalid-request"}')`

#### Overcoming API Limitation

- [Official twitter limits documentation](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/timelines/faq#:~:text=900%20requests%2F15%2Dmin%20window,%2Dhour%20window%20(application%20level))

I have found two ways to get more than the 100 tweets allowed in a single request.  

1. `my_response['meta']['next_token']` - Control for pagination, when more than a single page of tweets are available within a response.  This is passed in with the request as one of the params [gist](https://gist.githubusercontent.com/AndrewEdward37/8309d5701af8b5414498f60aafd68a5d/raw/d7fbf33a61d58943348deb3a20d94284c4deffc4/connect.py): 
```
params['next_token'] = next_token
responose = requests.request("GET", url, headers=headers, params=parms)
```

1. Windowing over time ranges using `start_time` and `end_time` in query parameters.  Type should be ISO 8601 date, i.e. `YYYY-MM-DDTHH:mm:ssZ`.  This sets the newest, most recent UTC timestap to which the Tweets will be provided. 