# Process Elon's Twitter timeline

#### Load Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import json

In [3]:
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [5]:
today = pd.to_datetime("today").strftime("%Y-%m-%d")

---

## Get data

#### Which user are we seeking? 

In [25]:
user = "elonmusk"

#### Read timeline pulled with the [Twarc library](https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/) in the command line

In [26]:
# twarc2 timeline --use-search 44196397 elonmusk_timeline_20221029.json

In [34]:
jsons = []

with open(f"/Users/stiles/twarc2/elonmusk_timeline_20221029-test.json") as f:
    for line in f:
        jsons.append(json.loads(line))

In [46]:
jsons[0].keys()

dict_keys(['data', 'includes', 'meta', '__twarc'])

#### Loop over list of twitter jsons and extract elements into a dictionary

In [35]:
data_list = []

for j in jsons:
    for d in j["data"]:
        data_list.append(d)

#### Convert list of dictionaries to dataframe

In [36]:
src = pd.DataFrame(data_list)

In [37]:
len(src)

100

In [38]:
src["idnum"] = src["id"].astype(int)

In [18]:
max_id = src[src["idnum"] == src["idnum"].max()]["id"].iloc[0]

In [19]:
!twarc2 timeline --use-search --since-id {max_id} elonmusk data/raw/elonmusk_timeline_since_{max_id}.json

100%|█████████████████| Processed 3 hours/3 hours [00:01<00:00, 0 tweets total ]


In [20]:
jsons_latest = []

with open(f"data/raw/elonmusk_timeline_since_{max_id}.json") as file:
    for l in file:
        jsons_latest.append(json.loads(l))

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/elonmusk_timeline_since_1585981766229143552.json'

In [15]:
data_list_latest = []

for jl in jsons_latest:
    for la in jl["data"]:
        data_list_latest.append(la)

In [16]:
src_latest = pd.DataFrame(data_list_latest)

In [17]:
all_df = pd.concat([src_latest, src])

#### Unpack nested columns

In [18]:
all_df[["urls", "annotations", "mentions", "cashtags", "hashtags"]] = pd.json_normalize(
    src["entities"]
)

In [19]:
all_df[
    ["retweet_count", "reply_count", "like_count", "quote_count"]
] = pd.json_normalize(all_df["public_metrics"])

In [20]:
all_df[["media_keys", "poll_ids"]] = pd.json_normalize(all_df["attachments"])

#### Dates

In [21]:
all_df["date"] = pd.to_datetime(src.created_at).dt.strftime("%Y-%m-%d")

#### Drop what we don't need

In [22]:
all_df.drop(
    [
        "entities",
        "author_id",
        "public_metrics",
        "context_annotations",
        "attachments",
        "lang",
        "reply_settings",
        "urls",
        "annotations",
        "cashtags",
        "poll_ids",
        "created_at",
        "possibly_sensitive",
        "mentions",
        "context_annotations",
        "media_keys",
        "hashtags",
        "referenced_tweets",
    ],
    axis=1,
    inplace=True,
)

In [23]:
df = all_df.copy()

---

In [24]:
df[df["text"].str.contains("Truth Social")]

Unnamed: 0,source,conversation_id,id,in_reply_to_user_id,text,idnum,retweet_count,reply_count,like_count,quote_count,date
41,Twitter for iPhone,1519179787163652099,1519363666377908225,44196397.0,Truth Social (terrible name) exists because Tw...,1.519364e+18,193624,82574,1386324,38447,2022-04-27
45,Twitter for iPhone,1519179787163652099,1519179787163652099,,Truth Social is currently beating Twitter &amp...,1.51918e+18,2429,1671,71289,115,2022-04-27


In [25]:
max_date = df.date.max()

In [26]:
df["id_num"] = df["id"].astype(int)

In [27]:
max_id = df[df["id_num"] == df.id_num.max()]["id_num"].iloc[0]

#### Trump

In [28]:
trump_replies_mentions = df[df["text"].str.contains("Trump")]

In [29]:
len(trump_replies_mentions)

28

---

## Export timeline

In [30]:
df.to_csv(f"data/processed/{user}_timeline_full.csv", index=False)