# Download and process user's Twitter timeline

#### Load Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import json

In [3]:
# Register the altair_stiles theme under a name, then switch Altair to it.
theme_name = "stiles"
alt.themes.register(theme_name, altstiles.theme)
alt.themes.enable(theme_name)

ThemeRegistry.enable('stiles')

In [4]:
# Allow up to 1000 columns and 1000 rows to be displayed before pandas truncates.
for option in ("display.max_columns", "display.max_rows"):
    pd.set_option(option, 1000)

In [5]:
# Current date as a YYYY-MM-DD string; interpolated into the raw-data filename.
date_format = "%Y-%m-%d"
today = pd.to_datetime("today").strftime(date_format)

---

## Get data

#### Which user are we seeking? 

In [6]:
# Account handle whose timeline is processed; interpolated into the raw and
# processed data file paths used later in the notebook.
user = "elonmusk"

#### Read timeline pulled with the [Twarc library](https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/) in the command line

In [7]:
# !twarc2 timeline --use-search {user} data/raw/{user}_{today}.jsonl

In [8]:
# Parse the raw pull: a JSON Lines file with one JSON document per line.
# - The encoding is explicit so parsing does not depend on the platform default.
# - Blank lines (e.g. a trailing newline) are skipped, because json.loads
#   raises on an empty string.
with open(f"data/raw/{user}_{today}.jsonl", encoding="utf-8") as f:
    jsons = [json.loads(line) for line in f if line.strip()]

#### Loop over the list of Twitter JSON pages and collect each tweet dictionary into a single list

In [9]:
# Flatten the parsed pages into one list of dictionaries.
# Each page keeps its items under "data"; .get() is used because a page with
# no results may omit that key, which would otherwise raise KeyError.
data_list = [item for page in jsons for item in page.get("data", [])]

#### Convert list of dictionaries to dataframe

In [10]:
# Build a frame with one row per collected dictionary; columns come from the
# dictionaries' keys.
src = pd.DataFrame(data_list)

In [11]:
# Sanity check: number of rows loaded from the raw file.
len(src)

19849

In [12]:
# Choose the id handed to `--since-id` for the incremental pull: the
# second-highest id in the frame (or the only id when there is a single row).
# - "int64" is explicit because the ids are too large for a 32-bit integer,
#   which is what plain `int` maps to on some platforms.
# - .iloc selects by position. Indexing the nlargest() result with `[1]` is a
#   label lookup against the original row index, which only returns the
#   intended value when the rows happen to be ordered newest-first.
top_ids = src["id"].astype("int64").nlargest(2)
max_id = top_ids.iloc[-1]

In [13]:
# Display the id that is passed to `--since-id` in the next cell.
max_id

1594474679452028929

In [14]:
!twarc2 timeline --use-search --since-id {max_id} elonmusk data/raw/elonmusk_timeline_since_{max_id}.json

100%|█████████████████| Processed an hour/an hour [00:01<00:00, 2 tweets total ]


In [15]:
# Parse the incremental pull (one JSON document per line).
# - The path is built from `user` instead of a hard-coded handle, so it tracks
#   the account configured at the top of the notebook.
# - The encoding is explicit so parsing does not depend on the platform default.
# - Blank lines are skipped because json.loads raises on an empty string.
latest_path = f"data/raw/{user}_timeline_since_{max_id}.json"
with open(latest_path, encoding="utf-8") as latest_file:
    jsons_latest = [json.loads(line) for line in latest_file if line.strip()]

In [16]:
# Flatten the incremental pages into one list of dictionaries.
# .get() is used because a page with no results (e.g. nothing new since
# `max_id`) may omit the "data" key, which would otherwise raise KeyError.
data_list_latest = [
    item for page in jsons_latest for item in page.get("data", [])
]

In [17]:
# Build a frame with one row per dictionary collected from the incremental pull.
src_latest = pd.DataFrame(data_list_latest)

In [18]:
# Stack the incremental pull on top of the main pull.
# Row labels from both frames are kept as-is, so labels in the combined
# frame are not guaranteed to be unique.
all_df = pd.concat(objs=(src_latest, src), axis="index")

#### Unpack nested columns

In [19]:
# Unpack the nested "entities" dictionaries into one column per entity type.
# - all_df's own column is normalised (not src's), so rows from the
#   incremental pull get their own entities.
# - Columns are selected by name: assigning a frame to a list of columns pairs
#   them up by position, and json_normalize orders columns by first appearance
#   in the data. reindex() also fills any entity type absent from this pull
#   with NaN instead of failing on a column-count mismatch.
# - The result takes all_df's index so rows are matched one-to-one rather than
#   by label (all_df's labels may not be unique after the concat).
entity_cols = ["urls", "annotations", "mentions", "cashtags", "hashtags"]
entities = pd.json_normalize(all_df["entities"].tolist()).reindex(columns=entity_cols)
entities.index = all_df.index
all_df[entity_cols] = entities

In [20]:
# Unpack the nested "public_metrics" dictionaries into one column per metric.
# - Columns are selected by name, so extra metrics in the payload or a
#   different key order cannot mislabel the counts or break the assignment.
# - The result takes all_df's index so rows are matched one-to-one. Aligning a
#   freshly numbered 0..n-1 frame against all_df's labels (which may repeat
#   after the concat) would attach metrics to the wrong rows.
metric_cols = ["retweet_count", "reply_count", "like_count", "quote_count"]
metrics = pd.json_normalize(all_df["public_metrics"].tolist()).reindex(
    columns=metric_cols
)
metrics.index = all_df.index
all_df[metric_cols] = metrics

In [21]:
# Unpack the nested "attachments" dictionaries into one column per key.
# - Columns are selected by name; reindex() fills a key that never occurs in
#   this pull with NaN instead of failing on a column-count mismatch.
# - The result takes all_df's index so rows are matched one-to-one rather than
#   by label (all_df's labels may not be unique after the concat).
attachment_cols = ["media_keys", "poll_ids"]
attachments = pd.json_normalize(all_df["attachments"].tolist()).reindex(
    columns=attachment_cols
)
attachments.index = all_df.index
all_df[attachment_cols] = attachments

#### Dates

In [22]:
# Derive a YYYY-MM-DD date string from each row's own `created_at`.
# Reading from all_df (rather than src) gives rows from the incremental pull
# their own dates and avoids label-based alignment against a different frame.
all_df["date"] = pd.to_datetime(all_df["created_at"]).dt.strftime("%Y-%m-%d")

#### Drop what we don't need

In [23]:
# Columns that are not needed in the exported file.
unneeded_cols = [
    "entities",
    "author_id",
    "public_metrics",
    "context_annotations",
    "attachments",
    "lang",
    "reply_settings",
    "urls",
    "annotations",
    "cashtags",
    "poll_ids",
    "created_at",
    "possibly_sensitive",
    "mentions",
    "media_keys",
    "hashtags",
    "referenced_tweets",
]

# Rebind instead of dropping in place, and ignore labels that are already
# gone: the cell can then be re-run without raising KeyError, and a pull that
# lacks one of these optional columns does not break the notebook.
all_df = all_df.drop(columns=unneeded_cols, errors="ignore")

In [24]:
# Keep the first row seen for each id and work on an independent copy.
df = all_df.drop_duplicates(subset=["id"], keep="first").copy()

---

## Exports

In [25]:
# Write the de-duplicated frame to CSV without the row index.
export_path = f"data/processed/{user}_timeline_full.csv"
df.to_csv(export_path, index=False)