# Get a user timeline from Twitter using Twarc

#### Load Python tools

In [1]:
%load_ext lab_black

In [22]:
import pandas as pd
import json
import datetime as dt
import altair as alt
import altair_stiles as altstiles

In [23]:
# Raise the pandas display limits to 1,000 columns and 1,000 rows.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

# Register the altair_stiles theme under the name "altstiles" and activate it.
# The enable() call stays last so the cell still echoes its return value.
alt.themes.register("altstiles", altstiles.theme)
alt.themes.enable("altstiles")

ThemeRegistry.enable('altstiles')

In [24]:
# Date stamp (local time, YYYY-MM-DD) used in the input and output file names.
today = dt.date.today().isoformat()

---

## Get data

#### Which user are we seeking? 

In [25]:
# Twitter handle of the account to process; it is interpolated into the twarc2
# command and into the raw/processed file names used in this notebook.
user = "elisewho"

#### The `--use-search` argument requires Academic Research access to the Twitter API

In [6]:
# !twarc2 timeline --use-search {user} data/raw/{user}_{today}.jsonl

In [7]:
# Last 3,200 tweets:
# !twarc2 timeline {user} data/raw/{user}_{today}.jsonl

#### Read timeline pulled with the [Twarc library](https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/)

In [8]:
# Load the raw timeline for `user` that carries today's date stamp.
# The path embeds `today`, so open() raises FileNotFoundError unless a download
# with the same date exists in data/raw/ (see the twarc2 command in this notebook).
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding.
with open(f"data/raw/{user}_{today}.jsonl", encoding="utf-8") as f:
    # One JSON document per line; blank lines are skipped because json.loads
    # would raise JSONDecodeError on them.
    jsons = [json.loads(line) for line in f if line.strip()]

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/elisewho_2022-11-06.jsonl'

#### Loop over the list of Twitter JSON pages and collect every element of each page's `data` array into a single list

In [None]:
# Flatten the pages: each element of `jsons` keeps its records under the
# "data" key, and all of those records are gathered into one flat list.
data_list = [record for page in jsons for record in page["data"]]

#### Convert list of dictionaries to dataframe

In [None]:
# Build the working frame from the flattened records (one row per record).
src = pd.DataFrame(data=data_list)

#### Unpack nested columns

In [None]:
# Expand the per-row "public_metrics" dicts into one column per counter.
# The normalised columns are selected by name instead of being matched by
# position, so values stay correctly labelled if the dicts list their keys in
# a different order or carry additional counters. A missing counter raises
# KeyError rather than silently shifting the labels.
metric_columns = ["retweet_count", "reply_count", "like_count", "quote_count"]
src[metric_columns] = pd.json_normalize(src["public_metrics"])[metric_columns]

In [None]:
# Expand the per-row "attachments" dicts into "media_keys" and "poll_ids".
attachment_columns = ["media_keys", "poll_ids"]

# Rows whose value is not a dict (missing data) are treated as having no
# attachments, so normalisation does not depend on how pandas handles NaN.
# If the frame has no "attachments" column at all, every row counts as empty.
attachment_records = [
    value if isinstance(value, dict) else {}
    for value in src.get("attachments", pd.Series(index=src.index, dtype=object))
]

# reindex() picks the columns by name and creates any that never occurred as
# all-NaN, so the assignment always receives exactly the two expected columns
# in the expected order.
attachments = pd.json_normalize(attachment_records).reindex(columns=attachment_columns)
# Align row labels with `src` so the column assignment matches row for row.
attachments.index = src.index
src[attachment_columns] = attachments

#### Dates

In [None]:
# Parse "created_at" into timestamps, then keep only the calendar day as
# YYYY-MM-DD text in a new "date" column.
created = pd.to_datetime(src["created_at"])
src["date"] = created.dt.strftime("%Y-%m-%d")

#### Drop what we don't need

In [None]:
# Remove the raw/nested columns that are not needed downstream.
# errors="ignore" keeps this cell from raising KeyError when one of the listed
# columns is absent from this particular timeline; columns that are present
# are dropped exactly as before. The frame is still modified in place.
src.drop(
    columns=[
        "entities",
        "author_id",
        "public_metrics",
        "context_annotations",
        "attachments",
        "lang",
        "reply_settings",
        "poll_ids",
        "created_at",
        "possibly_sensitive",
        "referenced_tweets",
    ],
    errors="ignore",
    inplace=True,
)

In [None]:
# Work on an independent (deep) copy so later changes to `df` leave `src` intact.
df = src.copy(deep=True)

---

## Export timeline

In [None]:
# Write the cleaned timeline to data/processed/, named by user and date stamp;
# the row index is left out of the CSV.
export_path = f"data/processed/{user}_timeline_{today}.csv"
df.to_csv(export_path, index=False)