# @elonmusk on Twitter

#### Load Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_grid as altgrid
import json

In [3]:
alt.themes.register("grid", altgrid.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

---

## Get data

#### Which user are we seeking? 

In [16]:
user = "RadioShack"

In [5]:
!twarc2 timeline --use-search RadioShack data/raw/user_RadioShack.jsonl

100%|███████████| Processed 12 years/12 years [09:09<00:00, 29252 tweets total ]


#### Read timeline pulled with the [Twarc library](https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/)

In [5]:
jsons = []

with open("/Users/stiles/twarc2/elonmusk_timeline.json") as f:
    for line in f:
        jsons.append(json.loads(line))

#### Loop over list of twitter jsons and extract elements into a dictionary

In [6]:
data_list = []

for j in jsons:
    for d in j["data"]:
        data_list.append(d)

#### Convert list of dictionaries to dataframe

In [7]:
src = pd.DataFrame(data_list)

#### Unpack nested columns

In [8]:
src[["urls", "annotations", "mentions", "cashtags", "hashtags"]] = pd.json_normalize(
    src["entities"]
)

In [9]:
src[["retweet_count", "reply_count", "like_count", "quote_count"]] = pd.json_normalize(
    src["public_metrics"]
)

In [10]:
src[["media_keys", "poll_ids"]] = pd.json_normalize(src["attachments"])

#### Dates

In [11]:
src["date"] = pd.to_datetime(src.created_at).dt.strftime("%Y-%m-%d")

#### Drop what we don't need

In [12]:
src.drop(
    [
        "entities",
        "author_id",
        "public_metrics",
        "context_annotations",
        "attachments",
        "lang",
        "reply_settings",
        "urls",
        "annotations",
        "cashtags",
        "poll_ids",
        "created_at",
        "possibly_sensitive",
        "mentions",
        "context_annotations",
        "media_keys",
        "hashtags",
        "referenced_tweets",
    ],
    axis=1,
    inplace=True,
)

In [13]:
df = src.copy()

---

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17405 entries, 0 to 17404
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   source               17405 non-null  object
 1   id                   17405 non-null  object
 2   text                 17405 non-null  object
 3   conversation_id      17405 non-null  object
 4   in_reply_to_user_id  12407 non-null  object
 5   retweet_count        17405 non-null  int64 
 6   reply_count          17405 non-null  int64 
 7   like_count           17405 non-null  int64 
 8   quote_count          17405 non-null  int64 
 9   date                 17405 non-null  object
dtypes: int64(4), object(6)
memory usage: 1.3+ MB


In [15]:
df.head()

---

## Export timeline

In [None]:
df.to_csv(f'data/processed/{user}_timeline.csv', index=False)