In [None]:
import os
import json
import glob
import pandas as pd
from joblib import Parallel, delayed

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Show the kernel's working directory; the relative "../data/..." path used
# below is resolved against it.
os.getcwd()

In [None]:
# Cap on how many tweet files are loaded while exploring.
MAX_TWEET_FILES = 100

# glob returns paths in arbitrary, filesystem-dependent order, so sort before
# slicing to make the selected sample identical on every run and machine.
data = sorted(glob.glob(os.path.join("..", "data", "battles", "b01", "*.json")))[:MAX_TWEET_FILES]
print(f"N tweets: {len(data)}")

In [None]:
def load_tweets(tweet: str) -> dict:
    """Read one tweet JSON file from disk and return its parsed content.

    Args:
        tweet: Path to the JSON file (despite the name, this is a file path,
            not a tweet object).

    Returns:
        The object produced by ``json.load`` (annotated as a dict; presumably
        one tweet per file -- confirm against the data).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which can fail or mis-decode non-ASCII text (emoji, accents).
    with open(tweet, "r", encoding="utf-8") as f:
        return json.load(f)

In [None]:
%%time

tweets = Parallel(n_jobs=8)(delayed(load_tweets)(tweet) for tweet in tqdm(data))

In [None]:
# Peek at the references of one loaded tweet. "referenced_tweets" is optional
# (extract_tweet_references checks for it), so use .get() to show None instead
# of raising KeyError when this tweet has no references.
tweets[2].get("referenced_tweets")

In [None]:
def extract_urls(tweet: dict):
    """Collect the expanded URLs listed under ``tweet["entities"]["urls"]``.

    Missing or ``None`` ``entities`` / ``urls`` are treated as "no URLs".
    Entries without an ``expanded_url`` key are skipped individually, so one
    malformed entry no longer discards every URL that follows it.

    Args:
        tweet: Tweet object as a dict.

    Returns:
        A list of expanded URL values in their original order, or ``None``
        when the tweet has none.
    """
    entities = tweet.get("entities") or {}
    urls = [
        url["expanded_url"]
        for url in entities.get("urls") or []
        if "expanded_url" in url
    ]
    # Callers expect None (not an empty list) when nothing was found.
    return urls or None
    
    
def extract_hashtags(tweet: dict):
    """Join the hashtags listed under ``tweet["entities"]["hashtags"]``.

    Missing or ``None`` ``entities`` / ``hashtags`` are treated as "no
    hashtags". Entries without a ``tag`` key are skipped individually, so one
    malformed entry no longer discards every hashtag that follows it.

    Args:
        tweet: Tweet object as a dict.

    Returns:
        The tags joined with ``","`` in their original order, or ``None`` when
        the tweet has none.
    """
    entities = tweet.get("entities") or {}
    hashtags = [
        hashtag["tag"]
        for hashtag in entities.get("hashtags") or []
        if "tag" in hashtag
    ]
    # Test the list, not the joined string: a single empty tag still yields "".
    return ",".join(hashtags) if hashtags else None
    
    
def extract_tweet_references(tweet: dict):
    """Map each referenced tweet id to its reference type.

    Returns the string ``"original_tweet"`` when the tweet carries no
    ``"referenced_tweets"`` key; otherwise a dict ``{id: type}`` built from
    the entries under that key.
    """
    if "referenced_tweets" not in tweet:
        return "original_tweet"

    return {entry["id"]: entry["type"] for entry in tweet["referenced_tweets"]}
    
    
def extract_mentions(tweet: dict):
    """Map mentioned user ids to usernames from ``tweet["entities"]["mentions"]``.

    Missing or ``None`` ``entities`` / ``mentions`` are treated as "no
    mentions". Entries lacking ``id`` or ``username`` are skipped
    individually, so one malformed entry no longer discards every mention that
    follows it.

    Args:
        tweet: Tweet object as a dict.

    Returns:
        A dict ``{id: username}``, or ``None`` when the tweet has no mentions.
    """
    entities = tweet.get("entities") or {}
    mentions = {
        mention["id"]: mention["username"]
        for mention in entities.get("mentions") or []
        if "id" in mention and "username" in mention
    }
    # Callers expect None (not an empty dict) when nothing was found.
    return mentions or None
    
    
def extract_author_info(tweet: dict):
    """Flatten an author object into a single-level dict.

    Args:
        tweet: Despite the name, this is the *author* object; callers in this
            notebook pass ``tweet["author"]``.

    Returns:
        A dict with the author's profile fields plus the four counters taken
        from ``public_metrics``. ``location`` is ``None`` when the author
        object has no such key.

    Raises:
        KeyError: If any field other than ``location`` is missing.
    """
    metrics = tweet["public_metrics"]

    return {
        "id": tweet["id"],
        "username": tweet["username"],
        "description": tweet["description"],
        "verified": tweet["verified"],
        # location is the only field treated as optional.
        "location": tweet.get("location"),
        "url": tweet["url"],
        "profile_image_url": tweet["profile_image_url"],
        "created_at": tweet["created_at"],
        "followers_count": metrics["followers_count"],
        "following_count": metrics["following_count"],
        "tweet_count": metrics["tweet_count"],
        # Key was previously misspelled "listed_acount".
        "listed_count": metrics["listed_count"],
    }
    

In [None]:
def extract_tweet_metadata(tweet: dict):
    """Split one tweet into a flat tweet record and a flat author record.

    The tweet record copies a fixed set of top-level fields verbatim, then
    adds the author's username and the values returned by the URL, reference,
    hashtag and mention extractors.

    Returns:
        A ``(tweet_record, author_record)`` tuple.
    """
    copied_fields = (
        "id",
        "conversation_id",
        "reply_settings",
        "source",
        "author_id",
        "created_at",
        "text",
        "lang",
        "possibly_sensitive",
    )

    record = {name: tweet[name] for name in copied_fields}
    # Keyword order fixes both evaluation order and resulting key order.
    record.update(
        author=tweet["author"]["username"],
        urls=extract_urls(tweet),
        referenced_tweets=extract_tweet_references(tweet),
        hashtags=extract_hashtags(tweet),
        mentions=extract_mentions(tweet),
    )

    return record, extract_author_info(tweet["author"])

In [None]:
%%time

outputs = Parallel(n_jobs=8)(delayed(extract_tweet_metadata)(tweet) for tweet in tqdm(tweets))


In [None]:
# `outputs` holds one (tweet_record, author_record) pair per tweet, so it has
# to be transposed; unpacking the list itself into two names raises ValueError
# unless there are exactly two tweets. Comprehensions also cope with an empty list.
tweet_data = [tweet_record for tweet_record, _author_record in outputs]
author_data = [author_record for _tweet_record, author_record in outputs]

In [None]:
# Separate the (tweet_record, author_record) pairs into two parallel lists.
tweet_data, author_data = [], []
for tweet_record, author_record in tqdm(outputs):
    tweet_data.append(tweet_record)
    author_data.append(author_record)

In [None]:
# Keep the frame under a name so later cells can reuse it, and display only a
# preview instead of rendering every row.
tweets_df = pd.DataFrame(tweet_data)
tweets_df.head()

In [None]:
# Spot-check the author extractor on the first loaded tweet.
extract_author_info(tweets[0]["author"])

In [None]:
# Display the raw author object of the first tweet, i.e. the input that was
# passed to extract_author_info in the cell above.
tweets[0]["author"]