## Kherson Example

Search terms:

`ukraine kherson -is:retweet`

In [12]:
import os
import json
from glob import glob
import pandas as pd
from tqdm import tqdm

In [13]:
# Collect the path of every JSON file under ../data/kherson.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them to
# keep the row order of everything built from this list reproducible across runs.
# NOTE: despite the name, `tweets` holds file paths, not parsed tweet objects.
tweets = sorted(glob(os.path.join("..", "data", "kherson", "*.json")))

In [14]:
# Parse every JSON file into memory; `data` ends up with one parsed document per path.
data = []
for tweet in tqdm(tweets):  # `tweet` is a file path here, not a tweet object
    # Pin the encoding: tweet text contains emoji and other non-ASCII characters,
    # and open() otherwise falls back to a platform-dependent default encoding.
    with open(tweet, "r", encoding="utf-8") as f:
        data.append(json.load(f))

100%|██████████| 6760/6760 [00:01<00:00, 4709.29it/s]


In [15]:
def extract_urls(tweet: dict):
    """Return the tweet's expanded URLs as one comma-separated string.

    Reads ``tweet["entities"]["urls"]`` and collects each entry's
    ``"expanded_url"``. Entries that are not dicts, or that lack a string
    ``"expanded_url"``, are skipped individually, so a single malformed entry
    does not discard the URLs that follow it.

    Args:
        tweet: Tweet payload; the ``"entities"`` and ``"urls"`` keys are optional.

    Returns:
        The URLs joined with ``","`` in their original order, or ``None`` when
        there is no usable URL (including when ``tweet`` is not a dict).
    """
    entities = tweet.get("entities") if isinstance(tweet, dict) else None
    entries = entities.get("urls") if isinstance(entities, dict) else None
    if not isinstance(entries, (list, tuple)):
        return None

    urls = [
        entry["expanded_url"]
        for entry in entries
        if isinstance(entry, dict) and isinstance(entry.get("expanded_url"), str)
    ]
    # NOTE: a URL may itself contain a comma, so the joined string cannot always
    # be split back unambiguously; the separator is kept for output compatibility.
    return ",".join(urls) if urls else None

In [16]:
def extract_hashtags(tweet: dict):
    """Return the tweet's hashtags as one comma-separated string.

    Reads ``tweet["entities"]["hashtags"]`` and collects each entry's ``"tag"``.
    Entries that are not dicts, or that lack a string ``"tag"``, are skipped
    individually, so a single malformed entry does not discard the hashtags
    that follow it.

    Args:
        tweet: Tweet payload; the ``"entities"`` and ``"hashtags"`` keys are optional.

    Returns:
        The tags joined with ``","`` in their original order (duplicates are
        kept), or ``None`` when there is no usable hashtag (including when
        ``tweet`` is not a dict).
    """
    entities = tweet.get("entities") if isinstance(tweet, dict) else None
    entries = entities.get("hashtags") if isinstance(entities, dict) else None
    if not isinstance(entries, (list, tuple)):
        return None

    hashtags = [
        entry["tag"]
        for entry in entries
        if isinstance(entry, dict) and isinstance(entry.get("tag"), str)
    ]
    return ",".join(hashtags) if hashtags else None

In [17]:
def extract_mentions(tweet: dict):
    """Return the tweet's mentions as a comma-separated ``id::username`` string.

    Reads ``tweet["entities"]["mentions"]``. Each usable entry is rendered as
    ``"<id>::<username>"``. Entries that are not dicts, or that lack an ``"id"``
    or ``"username"`` value, are skipped individually, so a single malformed
    entry does not discard the mentions that follow it.

    Args:
        tweet: Tweet payload; the ``"entities"`` and ``"mentions"`` keys are optional.

    Returns:
        The rendered mentions joined with ``","`` in their original order, or
        ``None`` when there is no usable mention (including when ``tweet`` is
        not a dict).
    """
    entities = tweet.get("entities") if isinstance(tweet, dict) else None
    entries = entities.get("mentions") if isinstance(entities, dict) else None
    if not isinstance(entries, (list, tuple)):
        return None

    mentions = [
        # An f-string (rather than ``+``) also formats non-string ids, which
        # previously raised a swallowed TypeError and dropped every mention.
        f'{entry["id"]}::{entry["username"]}'
        for entry in entries
        if isinstance(entry, dict)
        and entry.get("id") is not None
        and entry.get("username") is not None
    ]
    return ",".join(mentions) if mentions else None

In [11]:
# Peek at the first loaded record only; displaying the whole list would dump
# thousands of tweets into the notebook. The slice is safe on an empty list.
# NOTE(review): the saved output of this cell is a flattened author dict rather
# than a list of tweets, so it looks stale -- re-run after Restart & Run All.
data[:1]

{'id': '245547900',
 'username': 'UkrainkaDesigns',
 'description': 'Leave a positive mark on the world. Putin is a terrorist.',
 'verified': False,
 'location': 'United States',
 'url': '',
 'profile_image_url': 'https://pbs.twimg.com/profile_images/1426201677309349899/omAGCfz-_normal.jpg',
 'created_at': '2011-01-31T23:13:41.000Z',
 'followers_count': 369,
 'following_count': 498,
 'tweet_count': 20755,
 'listed_acount': 26}

In [24]:
def extract_author_info(tweet: dict):
    """Flatten an author object into a single-level dict of profile fields.

    Despite the parameter name, the caller in this notebook passes the author
    object (``tweet["author"]``), not the tweet itself.

    Args:
        tweet: Author payload. ``"location"`` is optional; every other field
            read here, including the nested ``"public_metrics"`` counters, is
            required and raises ``KeyError`` when absent.

    Returns:
        A new dict holding the profile fields followed by the four counters
        lifted out of ``"public_metrics"``.
    """
    profile = {
        "id": tweet["id"],
        "username": tweet["username"],
        "description": tweet["description"],
        "verified": tweet["verified"],
        # The only optional field: a missing key is recorded as None.
        "location": tweet.get("location"),
        "url": tweet["url"],
        "profile_image_url": tweet["profile_image_url"],
        "created_at": tweet["created_at"],
    }

    metrics = tweet["public_metrics"]
    profile["followers_count"] = metrics["followers_count"]
    profile["following_count"] = metrics["following_count"]
    profile["tweet_count"] = metrics["tweet_count"]
    # NOTE(review): the "listed_acount" spelling is kept deliberately because it
    # is the emitted key name; renaming it would change the output schema.
    profile["listed_acount"] = metrics["listed_count"]

    return profile
    

In [25]:
def extract_tweet_metadata(tweet: dict):
    """Split one tweet into a flat tweet row and a flat author row.

    Args:
        tweet: Tweet payload. It must contain every key listed in
            ``core_fields`` plus an ``"author"`` object; a missing key raises
            ``KeyError``.

    Returns:
        A ``(row, author_info)`` tuple. ``row`` holds the copied core fields
        plus ``author`` (the author's username), ``urls``, ``hashtags`` and
        ``mentions``; ``author_info`` is the result of ``extract_author_info``
        applied to ``tweet["author"]``.
    """
    core_fields = (
        "id",
        "conversation_id",
        "reply_settings",
        "source",
        "author_id",
        "created_at",
        "text",
        "lang",
        "possibly_sensitive",
    )

    row = {name: tweet[name] for name in core_fields}
    # Keyword order fixes the column order of the resulting row.
    row.update(
        author=tweet["author"]["username"],
        urls=extract_urls(tweet),
        hashtags=extract_hashtags(tweet),
        mentions=extract_mentions(tweet),
    )

    return row, extract_author_info(tweet["author"])

In [28]:
# Flatten every loaded tweet into two parallel lists: one tweet row and one
# author row per tweet (an author appears once for each of their tweets).
tweet_data, author_data = [], []

for content, author in map(extract_tweet_metadata, tqdm(data)):
    tweet_data.append(content)
    author_data.append(author)

100%|██████████| 6760/6760 [00:00<00:00, 120713.78it/s]


In [29]:
# Export the flattened tweet rows; index=False keeps the positional row index
# out of the written file.
pd.DataFrame(tweet_data).to_csv(
    path_or_buf="kherson_example.csv",
    index=False,
)

Unnamed: 0,id,conversation_id,reply_settings,source,author_id,created_at,text,lang,possibly_sensitive,author,urls,hashtags,mentions
0,1496636130753716229,1496636130753716229,everyone,Twitter Web App,1496443860729409538,2022-02-24T00:00:34.000Z,🚨 #BREAKING: 🇺🇦⚡️ #Ukraine's Zaporizhia airpor...,en,False,247WBN,https://twitter.com/247WBN/status/149663613075...,"BREAKING,Ukraine,UkraineRussiaCrisis,Ukraine,R...",1496443860729409538::247WBN
1,1496643328397484039,1496643328397484039,everyone,Twitter Web App,1110871915877552128,2022-02-24T00:29:10.000Z,"When the tension is high, closing the airspace...",en,False,MihajlovicMike,https://twitter.com/MihajlovicMike/status/1496...,,
2,1496651971595583494,1496651971595583494,everyone,Twitter Web App,245547900,2022-02-24T01:03:31.000Z,"Kharkiv, Dnipro, Zaporizhzhia airports closed ...",en,False,UkrainkaDesigns,https://twitter.com/UkrainkaDesigns/status/149...,,
3,1496671039295832068,1496671039295832068,everyone,Twitter Web App,1285954032813649923,2022-02-24T02:19:17.000Z,"#Russia has reportedly attacked Kalanchak, Khe...",en,False,Shaya_7,https://twitter.com/Militarylandnet/status/149...,"Russia,Ukraine",
4,1496678766634053635,1496678766634053635,everyone,Twitter Web App,1496515748914737163,2022-02-24T02:49:59.000Z,Unconfirmed reports Russian saboteurs entered ...,en,False,talkrealopinion,,"Putin,Russia,Ukraine",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6755,1499172372439572483,1499172372439572483,everyone,Twitter for Android,1290210901543596035,2022-03-02T23:58:41.000Z,"If Kherson is now under Russian control, it wo...",en,False,ANIMATIONWEB5,https://youtu.be/39QxsZMlxcQ,"Ukraine,UkraineRussiaWar,StandWithUkriane,Russ...",
6756,1499172530036244482,1499172530036244482,everyone,Twitter Web App,1194664669991948289,2022-03-02T23:59:19.000Z,While we wait for Russia's double-dip recessio...,en,False,play_misty_4me,,"UkraineUnderAttaсk,kherson",
6757,1499172598063665153,1499172598063665153,everyone,Twitter Web App,1413170290750746626,2022-03-02T23:59:35.000Z,""" Russia claims it has taken control of major ...",en,False,EstelaMaciasGl1,https://www.rt.com/russia/551021-russia-kherso...,,
6758,1499172639495327747,1499172639495327747,everyone,Twitter for Android,1496645623432372228,2022-03-02T23:59:45.000Z,There are reports of Heavy Fighting in the Sou...,en,False,xuuNEWS,,"Ukraine,Russia,StopWarInUkraine",


In [None]:
# Dump the raw (un-flattened) loaded tweets as a JSON array, one object per tweet.
pd.DataFrame(data).to_json(
    path_or_buf="test.json",
    orient="records",
    indent=4,
)