### Create datasets from the Twitter API

#### Imports

In [2]:
import json
import os

import pandas as pd
import requests

#### Settings

In [3]:
# Twitter API bearer token. It is read from the TWITTER_BEARER_TOKEN environment
# variable so the secret does not have to be saved in the notebook; when the
# variable is not set the value is an empty string.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")

hashtag = "Paris"         # searched hashtag, without the leading '#'
dataset_name = hashtag    # name of the folder under "data/" holding the files
num_tweets = 1000         # read by get_tweets to cap the requested page size
allow_retweets = False    # when False, get_tweets adds "-is:retweet" to the query

# Create the folder for the dataset if it doesn't exist.
# makedirs also creates the parent "data" folder, and exist_ok=True makes the
# cell safe to re-run; any other failure (e.g. permissions) is reported
# instead of being silently ignored.
os.makedirs("data/" + dataset_name, exist_ok=True)


#### Utility functions

In [4]:
# Step 1 : Make the request to the API
# We search for tweets containing the hashtag and their author


# Get the current minimum tweet id in a dataset
def get_min_index(hashtag):
    """Return the smallest value of the "id" column in a dataset's tweets.csv.

    `hashtag` is the name of the folder under "data/" that holds the file.
    """
    csv_path = "data/" + hashtag + "/tweets.csv"
    return pd.read_csv(csv_path)["id"].min()



# Request one page of tweets and store the response in a json file
def get_tweets(hashtag, max_id=None):
    """Search recent tweets containing `hashtag` and save the response to disk.

    The response (tweets plus the expanded author objects) is written to
    "data/<dataset_name>/response.json", replacing any previous response.

    Reads the notebook settings `num_tweets`, `allow_retweets`, `bearer_token`
    and `dataset_name`.

    Parameters:
        hashtag: hashtag to search for, without the leading '#'.
        max_id: optional tweet id, sent as the `until_id` query parameter.

    Raises:
        requests.HTTPError: when the API answers with an error status code.
    """

    # Build the max_id parameter
    max_id_string = ""
    if max_id is not None:
        max_id_string = "&until_id=" + str(max_id)

    # Build the max_results parameter.
    # The endpoint only accepts a page size between 10 and 100, so clamp it.
    max_results = max(10, min(100, num_tweets))
    max_results_string = "&max_results=" + str(max_results)

    # Build the allow_retweets parameter (url-encoded " -is:retweet")
    retweet_string = "" if allow_retweets else "%20-is%3Aretweet"

    tweet_fields = "id,created_at,text,author_id,in_reply_to_user_id,public_metrics,possibly_sensitive,lang"
    user_fields = "id,created_at,name,username,protected,verified,profile_image_url,location,url,description,pinned_tweet_id,public_metrics"
    url = (
        "https://api.twitter.com/2/tweets/search/recent?query=%23" + hashtag
        + retweet_string + max_id_string + max_results_string
        + "&expansions=author_id"
        + "&tweet.fields=" + tweet_fields
        + "&user.fields=" + user_fields
    )
    res = requests.get(
        url,
        headers={"Authorization": "Bearer " + bearer_token},
        timeout=30,  # never hang the notebook on a stalled connection
    )

    # Stop on HTTP errors (bad token, rate limit, ...) and show the API message,
    # so that a stale response.json is never processed again by mistake
    if not res.ok:
        print(res.text)
        res.raise_for_status()

    # Parse the response once
    json_object = res.json()

    # A successful response can still carry an "errors" list: report it
    if json_object.get("errors"):
        print(json_object["errors"])

    # A search without results has neither "data" nor "includes": store empty
    # lists so the extraction steps can still read the file
    tweets = json_object.setdefault("data", [])
    users = json_object.setdefault("includes", {}).setdefault("users", [])

    # Make sure the ids of the tweets are strings
    for tweet in tweets:
        tweet["id"] = str(tweet["id"])
        tweet["author_id"] = str(tweet["author_id"])

    # Make sure the ids of the includes.users are strings
    for user in users:
        user["id"] = str(user["id"])

    # Save the results to a json file (read back as utf8 by the extract steps)
    with open("data/" + dataset_name + "/response.json", "w", encoding="utf8") as f:
        json.dump(json_object, f)



In [5]:
# Step 2 : Extract the tweets from the response
def extract_tweets(dataset_name):
    """Append the tweets of the last API response to the dataset's tweets.csv.

    Reads "data/<dataset_name>/response.json" and adds the objects of its
    "data" list to "data/<dataset_name>/tweets.csv", creating the csv file
    when it does not exist yet.
    """
    dataset_dir = "data/" + dataset_name

    # Load the json file
    with open(dataset_dir + "/response.json", encoding="utf8") as f:
        data = json.load(f)

    # Convert the response to a pandas dataframe (no "data" key -> no tweets)
    res_df = pd.DataFrame(data.get("data", []))

    # If it exists, load the tweets.csv file.
    # The id columns are read as text: a column holding empty cells would
    # otherwise be parsed as float64, which cannot represent 19-digit ids
    # exactly and would corrupt them when the file is saved again.
    id_columns = ["id", "author_id", "in_reply_to_user_id"]
    try:
        tweets_df = pd.read_csv(
            dataset_dir + "/tweets.csv",
            dtype={column: str for column in id_columns},
        )
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First batch: nothing has been stored yet
        tweets_df = pd.DataFrame()

    # Add the new tweets to the dataframe
    tweets_df = pd.concat([tweets_df, res_df], ignore_index=True)

    # Save the dataframe to the csv file
    tweets_df.to_csv(dataset_dir + "/tweets.csv", index=False)



In [6]:
# Step 3 : Get the authors of tweets

# Extract the authors of the tweets from the request
def extract_authors(dataset_name):
    """Merge the authors of the last API response into the dataset's authors.csv.

    Reads "data/<dataset_name>/response.json" and adds the objects of its
    "includes" -> "users" list to "data/<dataset_name>/authors.csv", creating
    the csv file when it does not exist yet. When an author id is already
    stored, only the most recent row for that id is kept.
    """
    dataset_dir = "data/" + dataset_name

    # Load the json file
    with open(dataset_dir + "/response.json", encoding="utf8") as f:
        data = json.load(f)

    # Convert the response to a pandas dataframe (missing keys -> no authors)
    res_df = pd.DataFrame(data.get("includes", {}).get("users", []))

    # If it exists, load the authors.csv file.
    # The id columns are read as text: a column holding empty cells would
    # otherwise be parsed as float64, which cannot represent 19-digit ids
    # exactly and would corrupt them when the file is saved again.
    id_columns = ["id", "pinned_tweet_id"]
    try:
        authors_df = pd.read_csv(
            dataset_dir + "/authors.csv",
            dtype={column: str for column in id_columns},
        )
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First batch: nothing has been stored yet
        authors_df = pd.DataFrame()

    # Add the new authors to the dataframe
    authors_df = pd.concat([authors_df, res_df], ignore_index=True)

    # The same author usually shows up in several batches: keep one row per id
    if "id" in authors_df.columns:
        authors_df = authors_df.drop_duplicates(subset="id", keep="last")

    # Save the authors to a csv file
    authors_df.to_csv(dataset_dir + "/authors.csv", index=False)

# Call the API to get the authors of the tweets
def get_authors(author_id_list):
    """Look up users by id and save them as the dataset's authors.csv.

    The file "data/<dataset_name>/authors.csv" is overwritten with the users
    returned by the API. Nothing is requested or written when
    `author_id_list` is empty, and the file is left untouched when the API
    returns no user at all.

    Reads the notebook settings `bearer_token` and `dataset_name`.

    Parameters:
        author_id_list: iterable of user ids (ints or strings).

    Raises:
        requests.HTTPError: when the API answers with an error status code.
    """
    author_ids = [str(author_id) for author_id in author_id_list]
    if not author_ids:
        return

    user_fields = "id,created_at,name,username,verified,profile_image_url,location,description,pinned_tweet_id"

    # The users lookup endpoint accepts at most 100 ids per request
    batch_size = 100
    records = []
    for start in range(0, len(author_ids), batch_size):
        # Generate a comma separated string of author ids
        autor_string = ",".join(author_ids[start:start + batch_size])

        # request the listed authors to the API
        url = "https://api.twitter.com/2/users?ids=" + autor_string + "&user.fields=" + user_fields
        res = requests.get(
            url,
            headers={"Authorization": "Bearer " + bearer_token},
            timeout=30,  # never hang the notebook on a stalled connection
        )

        # Stop on HTTP errors and show the API message
        if not res.ok:
            print(res.text)
            res.raise_for_status()

        json_object = res.json()

        # Ids that could not be resolved are reported in an "errors" list
        if json_object.get("errors"):
            print(json_object["errors"])

        records.extend(json_object.get("data", []))

    # Do not replace the existing file when nothing was returned
    if not records:
        return

    # convert the collected users to a dataframe
    res_df = pd.DataFrame(records)

    # Save the dataframe to a csv file in the right folder
    res_df.to_csv("data/" + dataset_name + "/authors.csv", index=False)


In [7]:
# Step 4 : Some tests on the data

# Check if we know all the authors of the tweets
def check_authors(dataset_name):
    """Print every tweet author id that is missing from authors.csv.

    Compares the "author_id" column of "data/<dataset_name>/tweets.csv" with
    the "id" column of "data/<dataset_name>/authors.csv". One line is printed
    per missing author; a confirmation line is printed when none is missing.
    """

    # Load the tweets file
    tweets_df = pd.read_csv("data/" + dataset_name + "/tweets.csv")

    # Load the authors file
    authors_df = pd.read_csv("data/" + dataset_name + "/authors.csv")

    # Build the lookup once: membership tests on a set are constant time,
    # instead of recomputing and scanning unique() for every tweet author
    known_author_ids = set(authors_df["id"].unique())

    res = True

    # Check if the authors are in the authors file
    for author_id in tweets_df["author_id"].unique():
        if author_id not in known_author_ids:
            print("Author id " + str(author_id) + " not found in the authors file")
            res = False

    if res:
        print("All the tweets authors are in the authors file")

# Look for duplicate tweets
def check_duplicate_tweets(dataset_name):
    """Report the rows of tweets.csv whose "id" repeats an earlier row.

    Reads "data/<dataset_name>/tweets.csv" and prints either the repeated
    rows or a message saying that there are none.
    """
    tweets_path = "data/" + dataset_name + "/tweets.csv"
    tweets_df = pd.read_csv(tweets_path)

    # True for every row whose id already appeared earlier in the file
    repeated = tweets_df.duplicated(subset=["id"])

    # Nothing repeated: say so and stop here
    if not repeated.any():
        print("There are no duplicate tweets")
        return

    print("There are duplicate tweets:")
    print(tweets_df[repeated])

    



#### Import the first batch of tweets

In [64]:
# First batch of tweets
# The order of the calls matters, each step works on files of the previous one:
#   get_tweets       writes the API response to response.json
#   extract_tweets   adds the tweets of response.json to tweets.csv
#   extract_authors  adds the users of response.json to authors.csv
#   get_min_index    reads tweets.csv back and returns its smallest tweet id
get_tweets(hashtag)
extract_tweets(dataset_name)
extract_authors(dataset_name)
min_id = get_min_index(dataset_name)
print("Min from the first batch:", min_id)

Min from the first batch: 1536028215709511680


#### Import one more batch of tweets

In [16]:
# One more batch of tweets
# The smallest tweet id already stored in tweets.csv is passed as max_id,
# which get_tweets sends to the API as the until_id query parameter.
# min_id is read from the file on every run, so each run of this cell starts
# from whatever is currently stored.
min_id = get_min_index(dataset_name)
print("Min from the previous batch:", min_id)
get_tweets(hashtag, min_id)
extract_tweets(dataset_name)
extract_authors(dataset_name)
# Read the minimum again now that the new tweets were added to tweets.csv
min_id = get_min_index(dataset_name)
print("Min from this batch:", min_id)

Min from the previous batch: 1535854506684125185
Min from this batch: 1535816097919811585


#### Test if the dataset is correctly created

In [17]:
# Run the consistency checks on the stored dataset
check_authors(dataset_name)
check_duplicate_tweets(dataset_name)

# Report how many rows each csv file holds
tweets_df = pd.read_csv("data/{}/tweets.csv".format(dataset_name))
print("Number of tweets:", tweets_df.shape[0])

authors_df = pd.read_csv("data/{}/authors.csv".format(dataset_name))
print("Number of authors:", authors_df.shape[0])

All the tweets authors are in the authors file
There are no duplicate tweets
Number of tweets: 1500
Number of authors: 1157


#### Visualize the dataset

In [70]:
# Reload both csv files of the dataset, then display the tweets table
tweets_df, authors_df = (
    pd.read_csv("data/{}/{}.csv".format(dataset_name, table))
    for table in ("tweets", "authors")
)
tweets_df

Unnamed: 0,possibly_sensitive,lang,id,text,public_metrics,author_id,created_at,in_reply_to_user_id
0,False,en,1536041012849692675,The bright side. A Sunday under the sun. The ...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",1379526403578806280,2022-06-12T17:41:30.000Z,
1,False,fr,1536040869312200704,Martine Franck. \nHenri Cartier-Bresson dessin...,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",875280010877296640,2022-06-12T17:40:56.000Z,
2,False,und,1536040769630388227,"Мир Украине, свободу России. #paris #standwith...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",1519399318557532167,2022-06-12T17:40:32.000Z,
3,False,fr,1536040742434521088,C'est un établissement pluridisciplinaire né d...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",1512584037369589761,2022-06-12T17:40:25.000Z,1.512584e+18
4,False,und,1536040680367210498,#NowPlaying #Africanism @martinsolveig #house ...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",1390289676695654401,2022-06-12T17:40:11.000Z,
...,...,...,...,...,...,...,...,...
295,False,en,1536005494778867713,It's no longer possible to make a reservation ...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",791590844050599936,2022-06-12T15:20:22.000Z,
296,False,fr,1536005467352227841,Enlaidissement non seulement inutile mais surt...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",3407734155,2022-06-12T15:20:15.000Z,
297,False,ht,1536004928069672963,✌Que des #bonnesvibes #EnCeMomentSur :\n\n ➡️ ...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",971844112575057920,2022-06-12T15:18:07.000Z,
298,False,en,1536004741616246785,"#FrançoiseDorléac, December 21, 1962, #Paris. ...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",1478002801586618374,2022-06-12T15:17:22.000Z,
