In [36]:
import requests
import os
import json
from pprint import pprint

# os.environ['BEARER_TOKEN'] = 'WRITE YOUR TOKEN HERE'
bearer_token = os.environ.get('BEARER_TOKEN')

def create_url(_ids):
    ids = "ids={}".format(_ids)
    tweet_fields = "tweet.fields=id,text,author_id,conversation_id,created_at,entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics"
    expansion = "expansions=author_id"
    user_fields = "user.fields=id,name,username,created_at,description,entities,location,url,verified,withheld"
    
    url = "https://api.twitter.com/2/tweets?{}&{}&{}&{}".format(ids, tweet_fields, expansion, user_fields)
    return url


def get_tweets(_ids):
    url = create_url(_ids)
    payload={}
    headers = {
      'Authorization': 'Bearer {}'.format(bearer_token),
      'Cookie': 'guest_id=v1%3A164999832374703747; guest_id_ads=v1%3A164999832374703747; guest_id_marketing=v1%3A164999832374703747; personalization_id="v1_+F68isE/iukb7yr8y66bOw=="'
    }

    response = requests.request("GET", url, headers=headers, data=payload)

    return response.json()


In [37]:
import pandas as pd

def read_file(file_name):
    '''
    This function will read the file containing tweet ids sperating by lines (in groups)
    '''
    all_ids = pd.read_table(file_name,sep='\n',header=None)
    return all_ids

In [38]:
def separate_ids(ids):
    '''
    This function will split the sperate ids into a list
    '''
    seperated_ids = ids.split(",")
    return seperated_ids

In [51]:
def get_tweet_groups(groups_ids):
    '''
    This function will get tweets information using Twitter API in groups with indexing
    params groups_ids: all ids in lines (groups) retrieved from the orginal file 
    '''
    tweet_groups = {}
    no_access_first_tweets = []
    for group_index in range(len(groups_ids)):
        group_separated_ids = separate_ids(groups_ids[0][group_index])
        group_dict = {}
        
        # first check if the first tweet is valid to be accessed
        # if no access, only 'errors' will be returned
        first_tweet = get_tweets(group_separated_ids[0])
        if len(first_tweet) == 1:
            no_access_first_tweets.append(group_index)
            continue
        else:
            # check if the group contains more than 100 ids 
            # since multiple tweets lookup has 100 maxmimum
            if len(group_separated_ids) > 100:
                index = 0
                partial_group = {}

                # partition groups with more than 100 ids into maximum 100 each group
                for i in range(len(group_separated_ids)//100 + 1):
                    partial_group_ids = ','.join(group_separated_ids[index:index+100])
                    partial_group = get_tweets(partial_group_ids)
                    # initialise the dict with the information from the first partial group
                    if len(group_dict) == 0 and len(partial_group) != 1:
                        group_dict = partial_group
                    # combining partial groups into a whole group
                    else:
                        # there is any tweet valid to be accessed in the partial group
                        if len(partial_group) != 1:
                            group_dict['data'].extend(partial_group['data'])
                            group_dict['includes']['users'].extend(partial_group['includes']['users'])

                    index += 100

            else:
                full_group_ids = ','.join(group_separated_ids)
                group_dict = get_tweets(full_group_ids)

            tweet_groups[group_index] = group_dict
    return tweet_groups, no_access_first_tweets

In [39]:
def get_tweet_groups_start_end(groups_ids, start, end):
    '''
    This function will get tweets information using Twitter API in groups with indexing
    params groups_ids: all ids in lines (groups) retrieved from the orginal file 
    '''
    tweet_groups = {}
    for group_index in range(start, end):
        group_separated_ids = separate_ids(groups_ids[0][group_index])
        group_dict = {}
        
        # first check if the first tweet is valid to be accessed
        # if no access, only 'errors' will be returned
        first_tweet = get_tweets(group_separated_ids[0])
        if len(first_tweet) == 1:
            continue
        else:
            # check if the group contains more than 100 ids 
            # since multiple tweets lookup has 100 maxmimum
            if len(group_separated_ids) > 100:
                index = 0
                partial_group = {}

                # partition groups with more than 100 ids into maximum 100 each group
                for i in range(len(group_separated_ids)//100 + 1):
                    if len(group_separated_ids) % 100 == 0 and i == len(group_separated_ids)//100:
                        continue
                        
                    partial_group_ids = ','.join(group_separated_ids[index:index+100])
                    partial_group = get_tweets(partial_group_ids)

                    # initialise the dict with the information from the first partial group
                    if len(group_dict) == 0 and len(partial_group) != 1:
                        group_dict = partial_group
                    # combining partial groups into a whole group
                    else:
                        # there is any tweet valid to be accessed in the partial group
                        if len(partial_group) != 1:
                            group_dict['data'].extend(partial_group['data'])
                            group_dict['includes']['users'].extend(partial_group['includes']['users'])

                    index += 100

            else:
                full_group_ids = ','.join(group_separated_ids)
                group_dict = get_tweets(full_group_ids)

            tweet_groups[group_index] = group_dict
    return tweet_groups

In [40]:
train_ids = read_file('../../project-data/train.data.txt')
# train_ids

In [41]:
dev_ids = read_file('../../project-data/dev.data.txt')
# dev_ids

In [42]:
def update_tweet_data(ids, start, end, file):
    new_groups = get_tweet_groups_start_end(ids, start, end)
    
    with open(file,'r',encoding='utf-8') as f:
        tweets = json.load(f)
        
    tweets.update(new_groups)
    
    with open(file, "w",encoding='utf-8') as outfile:
        json.dump(tweets, outfile)

### Devset

In [None]:
update_tweet_data(dev_ids, 0, 100, "data/dev_data.json")

In [None]:
# check if data is correct 
with open("data/dev_data.json",'r',encoding='utf-8') as f:
    tweets = json.load(f)
    
# DO NOT PRINT IT DIRECTLY!!!
# 

In [34]:
# tweets_dev1 = get_tweet_groups_start_end(dev_ids, 0, 100)
# tweets_dev2 = get_tweet_groups_start_end(dev_ids, 100, 200)
# tweets_dev3 = get_tweet_groups_start_end(dev_ids, 200, 300)
# tweets_dev4 = get_tweet_groups_start_end(dev_ids, 300, 400)
# tweets_dev5 = get_tweet_groups_start_end(dev_ids, 400, 500)
# tweets_dev6 = get_tweet_groups_start_end(dev_ids, 500, 632)

### Trainset

In [43]:
update_tweet_data(train_ids, 0, 100, "data/train_data.json")

FileNotFoundError: [Errno 2] No such file or directory: 'data/train_data.json'

In [None]:
# check if data is correct 
with open("data/train_data.json",'r',encoding='utf-8') as f:
    tweets = json.load(f)
    
# DO NOT PRINT IT DIRECTLY!!!

In [161]:
# tweets_train1 = get_tweet_groups_start_end(train_ids, 0, 100)
# tweets_train2 = get_tweet_groups_start_end(train_ids, 100, 200)
# tweets_train3 = get_tweet_groups_start_end(train_ids, 200, 300)
# tweets_train4 = get_tweet_groups_start_end(train_ids, 300, 400)
# tweets_train5 = get_tweet_groups_start_end(train_ids, 400, 500)
# tweets_train7 = get_tweet_groups_start_end(train_ids, 600, 700)
# tweets_train8 = get_tweet_groups_start_end(train_ids, 700, 800)
# tweets_train9 = get_tweet_groups_start_end(train_ids, 800, 900)
# tweets_train10 = get_tweet_groups_start_end(train_ids, 900, 1000)
# tweets_train11 = get_tweet_groups_start_end(train_ids, 1000, 1100)
# tweets_train12 = get_tweet_groups_start_end(train_ids, 1100, 1200)
# tweets_train13 = get_tweet_groups_start_end(train_ids, 1200, 1300)
# tweets_train14 = get_tweet_groups_start_end(train_ids, 1300, 1400)
# tweets_train15 = get_tweet_groups_start_end(train_ids, 1400, 1500)
# tweets_train16 = get_tweet_groups_start_end(train_ids, 1500, 1600)
# tweets_train17 = get_tweet_groups_start_end(train_ids, 1600, 1700)
# tweets_train18 = get_tweet_groups_start_end(train_ids, 1700, 1800)
# tweets_train19 = get_tweet_groups_start_end(train_ids, 1800, 1895)