In [55]:
import requests
import os
import json
from pprint import pprint
import time

# The Twitter API bearer token is read from the BEARER_TOKEN environment variable.
# For a one-off local run it can be set by uncommenting the line below; never
# commit a real token to the notebook.
# os.environ['BEARER_TOKEN'] = '<WRITE YOUR TOKEN HERE!!!!!>'
# os.environ.get returns None when the variable is not set.
bearer_token = os.environ.get('BEARER_TOKEN')

def create_url(_ids):
    '''
    This function will build the tweet lookup URL for the given ids
    params _ids: value inserted verbatim after "ids=" in the query string
    return: the complete request URL as a string
    '''
    # each entry is one "key=value" pair of the query string, in request order
    query_parts = (
        "ids={}".format(_ids),
        "tweet.fields=id,text,author_id,conversation_id,created_at,entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics",
        "expansions=author_id",
        "user.fields=id,name,username,created_at,description,entities,location,url,verified,withheld",
    )
    return "https://api.twitter.com/2/tweets?" + "&".join(query_parts)


def get_tweets(_ids, timeout=30):
    '''
    This function will send one tweet lookup request and return the decoded JSON body
    params _ids: ids passed to create_url (placed verbatim in the "ids=" query parameter)
    params timeout: seconds to wait for the server before requests raises a timeout
                    error; without it a stalled connection would block forever
    return: the parsed JSON response
    '''
    url = create_url(_ids)
    headers = {
      'Authorization': 'Bearer {}'.format(bearer_token),
      # NOTE(review): this hard-coded guest cookie looks like a leftover from a
      # captured request - confirm whether the API call actually needs it.
      'Cookie': 'guest_id=v1%3A164999832374703747; guest_id_ads=v1%3A164999832374703747; guest_id_marketing=v1%3A164999832374703747; personalization_id="v1_+F68isE/iukb7yr8y66bOw=="'
    }

    # a plain GET: the previous empty payload dict sent no request body anyway
    response = requests.get(url, headers=headers, timeout=timeout)

    return response.json()


In [37]:
import pandas as pd

def read_file(file_name):
    '''
    This function will read the file containing tweet ids separated by lines (in groups)
    params file_name: path of a UTF-8 text file with one group of ids per line
    return: a DataFrame with a single column named 0 holding one string per
            non-blank line, so callers can keep using ids[0][group_index]
    '''
    # Read the lines directly: recent pandas versions raise ValueError for
    # sep='\n', and type inference could turn single-id lines into integers,
    # which would break the later str.split on each row.
    with open(file_name, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        groups = [line for line in stripped_lines if line]

    all_ids = pd.DataFrame({0: pd.Series(groups, dtype=object)})
    return all_ids

In [38]:
def separate_ids(ids):
    '''
    This function will break one comma-delimited string of ids into a list of pieces
    params ids: a string of ids joined by ","
    return: list of the substrings found between the commas
    '''
    return ids.split(",")

In [135]:
def get_tweet_groups(groups_ids):
    '''
    This function will get tweets information using Twitter API in groups with indexing
    params groups_ids: all ids in lines (groups) retrieved from the original file
    return: dict mapping group index to the API response of that group; groups
            whose first tweet lookup returns a single-key response are left out
    '''
    # Covers every group by delegating to the ranged variant. That variant
    # already skips the empty trailing chunk when a group holds an exact
    # multiple of 100 ids, which previously triggered a request with no ids.
    return get_tweet_groups_start_end(groups_ids, 0, len(groups_ids))

In [39]:
def get_tweet_groups_start_end(groups_ids, start, end):
    '''
    This function will get tweets information using Twitter API for the groups
    with index in [start, end)
    params groups_ids: all ids in lines (groups) retrieved from the original file
    params start: index of the first group to fetch (inclusive)
    params end: index at which fetching stops (exclusive)
    return: dict mapping each fetched group index to its API response
    '''
    collected = {}
    for group_index in range(start, end):
        id_list = separate_ids(groups_ids[0][group_index])

        # probe the first tweet of the group: a response with exactly one key
        # is taken to be the errors-only payload, and the whole group is skipped
        # NOTE(review): relies on error payloads always having a single key - confirm
        probe = get_tweets(id_list[0])
        if len(probe) == 1:
            continue

        id_count = len(id_list)
        if id_count <= 100:
            # small enough for a single multi-tweet lookup (100 ids maximum)
            merged = get_tweets(','.join(id_list))
        else:
            merged = {}
            # walk the ids in windows of at most 100; stepping by 100 never
            # yields an empty trailing window
            for offset in range(0, id_count, 100):
                chunk = get_tweets(','.join(id_list[offset:offset + 100]))

                # nothing accessible in this window
                if len(chunk) == 1:
                    continue

                if len(merged) == 0:
                    # the first usable window seeds the merged response
                    merged = chunk
                else:
                    # later windows only contribute their tweets and users
                    merged['data'].extend(chunk['data'])
                    merged['includes']['users'].extend(chunk['includes']['users'])

        collected[group_index] = merged
    return collected

In [42]:
def update_tweet_data(ids, start, end, file):
    '''
    This function will fetch the groups in [start, end) and merge them into a JSON file
    params ids: all ids in lines (groups), as passed to get_tweet_groups_start_end
    params start: index of the first group to fetch (inclusive)
    params end: index at which fetching stops (exclusive)
    params file: path of the JSON file that accumulates the fetched groups
    '''
    new_groups = get_tweet_groups_start_end(ids, start, end)

    # the very first batch has nothing to merge with yet
    try:
        with open(file, 'r', encoding='utf-8') as f:
            tweets = json.load(f)
    except FileNotFoundError:
        tweets = {}

    # Keys loaded from JSON are strings while freshly fetched groups use int
    # indices; normalise to str so a re-fetched group replaces its old entry
    # instead of being dumped as a duplicate key.
    tweets.update({str(group_index): group for group_index, group in new_groups.items()})

    with open(file, "w", encoding='utf-8') as outfile:
        json.dump(tweets, outfile)

### Devset

In [112]:
# Load the dev-set id groups (one group of ids per line of the file).
dev_ids = read_file('../../project-data/dev.data.txt')
# dev_ids

In [None]:
# update_tweet_data(dev_ids, 0, 100, "data/dev_data.json")
# update_tweet_data(dev_ids, 100, 200, "data/dev_data.json")
# update_tweet_data(dev_ids, 200, 300, "data/dev_data.json")
# update_tweet_data(dev_ids, 300, 400, "data/dev_data.json")
# update_tweet_data(dev_ids, 400, 500, "data/dev_data.json")
# update_tweet_data(dev_ids, 500, 600, "data/dev_data.json")
# update_tweet_data(dev_ids, 600, 632, "data/dev_data.json")

In [103]:
# Sanity check: reload the saved dev-set tweets from disk.
# NOTE(review): this reads "data/dev_data_all.json" while the commented-out
# update_tweet_data calls write to "data/dev_data.json" - confirm which file is current.
with open("data/dev_data_all.json",'r',encoding='utf-8') as f:
    dev_tweets = json.load(f)
    
# Do not display dev_tweets directly (presumably the output would be huge);
# inspect a few entries instead.

### Trainset

In [40]:
# Load the train-set id groups (one group of ids per line of the file).
train_ids = read_file('../../project-data/train.data.txt')
# train_ids

In [56]:
# update_tweet_data(train_ids, 0, 100, "data/train_data.json")
# update_tweet_data(train_ids, 100, 200, "data/train_data.json")
# update_tweet_data(train_ids, 200, 300, "data/train_data.json")
# update_tweet_data(train_ids, 300, 400, "data/train_data.json")
# update_tweet_data(train_ids, 400, 500, "data/train_data.json")
# update_tweet_data(train_ids, 500, 600, "data/train_data.json")
# update_tweet_data(train_ids, 600, 700, "data/train_data.json")
# update_tweet_data(train_ids, 700, 800, "data/train_data.json")
# update_tweet_data(train_ids, 800, 900, "data/train_data.json")
# update_tweet_data(train_ids, 900, 1000, "data/train_data.json")
# update_tweet_data(train_ids, 1000, 1100, "data/train_data.json")
# update_tweet_data(train_ids, 1100, 1200, "data/train_data.json")
# update_tweet_data(train_ids, 1200, 1300, "data/train_data.json")
# update_tweet_data(train_ids, 1300, 1400, "data/train_data.json")
# update_tweet_data(train_ids, 1400, 1500, "data/train_data.json")
# update_tweet_data(train_ids, 1500, 1600, "data/train_data.json")
# update_tweet_data(train_ids, 1600, 1700, "data/train_data.json")
# update_tweet_data(train_ids, 1700, 1800, "data/train_data.json")
# update_tweet_data(train_ids, 1800, 1895, "data/train_data.json")

In [77]:
# Sanity check: reload the saved train-set tweets from disk.
# NOTE(review): this reads "data/train_data_all.json" while the commented-out
# update_tweet_data calls write to "data/train_data.json" - confirm which file is current.
with open("data/train_data_all.json",'r',encoding='utf-8') as f:
    tweets = json.load(f)
    
# Do not display tweets directly (presumably the output would be huge);
# inspect a few entries instead.

### Covid data

In [175]:
# Load the covid id groups (one group of ids per line of the file).
covid_ids = read_file('../../project-data/covid.data.txt')
# covid_ids

In [180]:
# Download the covid tweets in batches of groups, merging every batch into
# data/covid_data_all.json as soon as it has been fetched.
GROUPS_PER_BATCH = 100
# Pause between batches; presumably chosen to match the Twitter API
# 15-minute rate-limit window - TODO confirm against the current limits.
PAUSE_SECONDS = 900

total_groups = len(covid_ids)
for batch_start in range(0, total_groups, GROUPS_PER_BATCH):
    batch_end = min(batch_start + GROUPS_PER_BATCH, total_groups)
    update_tweet_data(covid_ids, batch_start, batch_end, "data/covid_data_all.json")

    # Sleep only when another batch follows: no wait after the last batch and
    # no empty trailing batch when the group count is a multiple of the batch size.
    if batch_end < total_groups:
        time.sleep(PAUSE_SECONDS)


In [None]:
# Sanity check: reload the saved covid tweets from disk.
# Note that this rebinds the name `tweets` used by the train-set check cell.
with open("data/covid_data_all.json",'r',encoding='utf-8') as f:
    tweets = json.load(f)
    
# Do not display tweets directly (presumably the output would be huge);
# inspect a few entries instead.