In [1]:
import requests
import json
import time
import random
import os
import pandas as pd

In [2]:
from dotenv import load_dotenv
from datetime import datetime, timedelta
from pytz import timezone

load_dotenv()

BEARERTOKEN = os.getenv('BEARERTOKEN')

Source Code : https://medium.com/data-analytics-at-nesta/all-you-need-to-get-started-with-twitter-api-v2-using-python-6cd4be4d90fe

In [3]:
# Endpoint URL
endpoint_url = "https://api.twitter.com/2/tweets/search/recent"

#Timed settings with ISO 8601 format
local_time = timezone("Asia/Jakarta").localize(datetime(2025, 1, 27, 19, 0, 0))
start_time = local_time.astimezone(timezone("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ")

In [15]:
#YYYY-MM-DDTHH:mm:ssZ
start_time

'2025-01-27T12:00:00Z'

In [4]:
# Query parameters
#query_parameters = {
#    "query": '("makan bergizi gratis" OR "makan siang gratis" OR "MBG") lang:id -is:retweet',
#    "tweet.fields": "id,text,author_id,created_at",
#    "start_time": start_time,
#    "max_results": 10,
#}

In [4]:
def request_headers(bearer_token: str) -> dict:
    """
    Sets up the request headers. 
    Returns a dictionary summarising the bearer token authentication details.
    """
    return {"Authorization": "Bearer {}".format(bearer_token)}

headers = request_headers(BEARERTOKEN)

In [5]:
def connect_to_endpoint(endpoint_url: str, headers: dict, parameters: dict) -> json:
    """
    Connects to the endpoint and requests data.
    Returns a json with Twitter data if a 200 status code is yielded.
    Programme stops if there is a problem with the request and sleeps
    if there is a temporary problem accessing the endpoint.
    """
    response = requests.request(
        "GET", url=endpoint_url, headers=headers, params=parameters
    )
    response_status_code = response.status_code
    if response_status_code != 200:
        if response_status_code >= 400 and response_status_code < 500:
            raise Exception(
                "Cannot get data, the program will stop!\nHTTP {}: {}".format(
                    response_status_code, response.text
                )
            )
        
        sleep_seconds = random.randint(5, 60)
        print(
            "Cannot get data, your program will sleep for {} seconds...\nHTTP {}: {}".format(
                sleep_seconds, response_status_code, response.text
            )
        )
        time.sleep(sleep_seconds)
        return connect_to_endpoint(endpoint_url, headers, parameters)
    return response.json()

#uncomment to execute
#json_response = connect_to_endpoint(endpoint_url, headers, query_parameters)

In [6]:
#set up rules
#rules = [
#    {"value": '("makan bergizi gratis" OR "makan siang gratis" OR "MBG") lang:id -is:retweet', "tag": "mbg"},
#]

In [6]:
def process_twitter_data(
    json_response: json,
    query_tag: str,
    tweets_data: pd.DataFrame,
    users_data: pd.DataFrame,
) -> (pd.DataFrame, pd.DataFrame):
    """
    Adds new tweet/user information to the table of
    tweets/users and saves dataframes as pickle files,
    if data is avaiable.
    
    Returns the tweets and users updated dataframes.
    """
    if "data" in json_response.keys():
        new = pd.DataFrame(json_response["data"])
        tweets_data = pd.concat([tweets_data, new])
        tweets_data.to_pickle("tweets_" + query_tag + ".pkl")

        #json_response["includes"] return error, disabling for now
        #if "users" in json_response["includes"].keys():
        #    new = pd.DataFrame(json_response["includes"]["users"])
        #    users_data = pd.concat([users_data, new])
        #    users_data.drop_duplicates("id", inplace=True)
        #    users_data.to_pickle("users_" + query_tag + ".pkl")

    return tweets_data, users_data

In [7]:
query_parameters = {
    "tweet.fields": "id,text,author_id,created_at",
    "user.fields": "id,name,username,created_at,description,location,verified",
    "start_time": start_time,
    "max_results": 99,
}

In [8]:
rules = [
    {"value": '("makan bergizi gratis" OR "makan siang gratis" OR "MBG") lang:id -is:retweet', "tag": "mbg"},
]

In [9]:
tweets_data = pd.DataFrame()
users_data = pd.DataFrame()

for i in range(len(rules)):
    query_parameters["query"] = rules[i]["value"]
    query_tag = rules[i]["tag"]

    json_response = connect_to_endpoint(endpoint_url, headers, query_parameters)
    tweets_data, users_data = process_twitter_data(
        json_response, query_tag, tweets_data, users_data
    )

    time.sleep(20)

    while "next_token" in json_response["meta"]:
        query_parameters["next_token"] = json_response["meta"]["next_token"]

        json_response = connect_to_endpoint(endpoint_url, headers, query_parameters)
        tweets_data, users_data = process_twitter_data(
            json_response, query_tag, tweets_data, users_data
        )

        time.sleep(20)

KeyError: 'includes'

In [None]:
json_response

In [39]:
## ganti nama file

with open('jsonresponse3.json', 'w') as fp: 
    json.dump(json_response, fp)

In [60]:
resultdf = pd.DataFrame(columns=['id', 'edit_history_tweet_ids', 'created_at', 'text', 'author_id'])

for i in range(len(json_response['data'])):
    #print(json_response['data'][i])
    insertdf = pd.DataFrame(json_response['data'][i])
    resultdf = pd.concat([resultdf,insertdf])

In [None]:
resultdf

In [11]:
json_response

{'data': [{'text': 'ril MBG 😭😭😭 https://t.co/kGHHXWho5c',
   'author_id': '1273934862215335937',
   'id': '1886107824356561286',
   'created_at': '2025-02-02T17:42:02.000Z',
   'edit_history_tweet_ids': ['1886107824356561286']},
  {'text': '@pengardian sekarang udah 36k, makan siang gratis could never',
   'author_id': '1857048752248209408',
   'id': '1886106735632609613',
   'created_at': '2025-02-02T17:37:42.000Z',
   'edit_history_tweet_ids': ['1886106735632609613']},
  {'text': '@abulmuzaffar10 gua yakin, ini mbg buat bikin rusuh di kulit. buat pembiasan hal yg lebih penting.',
   'author_id': '1632409274373791744',
   'id': '1886106107925626938',
   'created_at': '2025-02-02T17:35:12.000Z',
   'edit_history_tweet_ids': ['1886106107925626938']},
  {'text': 'Ternyata gak kaleng² isinya, nasi putih, sepotong ayam bakar, sambal, kriuk selada, timun &amp; daun kol juga masih diberi segelas air mineral, cuma 11.655 rupiah udah dianter @gojekindonesia pula\nLebih enak dari menu makan ber

In [12]:
resultdf = pd.DataFrame(columns=['id', 'edit_history_tweet_ids', 'created_at', 'text', 'author_id'])

for i in range(len(json_response['data'])):
    #print(json_response['data'][i])
    insertdf = pd.DataFrame(json_response['data'][i])
    resultdf = pd.concat([resultdf,insertdf])

In [14]:
resultdf.to_csv(r"dataset/raw/020225_150730sampai174202.csv")

In [16]:
len(resultdf)

99