# Historical tweet counts, by term

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_grid as altgrid

In [3]:
# Register the theme shipped by altair_grid under the name "grid", then make it
# the active Altair theme for the charts rendered in this notebook.
alt.themes.register("grid", altgrid.theme)
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Let pandas display up to 1000 columns and 1000 rows before truncating output.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 1000)

---

## Search for a term

In [5]:
# Search term for this run; it is interpolated into the twarc2 query, the
# raw/processed CSV file names, and the chart titles below.
term = "elonmusk"

#### Get daily counts

In [6]:
# Ask the twarc2 CLI for per-day tweet counts matching `term` and write them to
# data/raw/<term>.csv. The recorded run took a little over four minutes.
# NOTE(review): `--archive` presumably needs full-archive API access — confirm.
!twarc2 counts --granularity 'day' '{term}' --archive --csv data/raw/{term}.csv

100%|███████| Processed 16 years/16 years [04:20<00:00, 195696706 tweets total ]


In [7]:
# Load the CSV of daily counts that the twarc2 step wrote for this term.
src = pd.read_csv(f"data/raw/{term}.csv")

In [8]:
# Preview the first five raw rows.
src.head(n=5)

Unnamed: 0,start,end,day_count
0,2022-10-01T00:00:00.000Z,2022-10-02T00:00:00.000Z,252454
1,2022-10-02T00:00:00.000Z,2022-10-03T00:00:00.000Z,180697
2,2022-10-03T00:00:00.000Z,2022-10-04T00:00:00.000Z,920348
3,2022-10-04T00:00:00.000Z,2022-10-05T00:00:00.000Z,767713
4,2022-10-05T00:00:00.000Z,2022-10-06T00:00:00.000Z,427493


#### Clean up dates

In [9]:
# Parse the ISO timestamps once and derive every string column from that single
# parsed series, instead of re-parsing the same column for each output format.
start_ts = pd.to_datetime(src["start"])

src["year"] = start_ts.dt.strftime("%Y")
src["month_year"] = start_ts.dt.strftime("%Y-%m")
src["date"] = start_ts.dt.strftime("%Y-%m-%d")

In [10]:
# Show the row(s) belonging to the most recent date in the raw data.
latest_date = src["date"].max()
src.loc[src["date"] == latest_date]

Unnamed: 0,start,end,day_count,year,month_year,date
30,2022-10-31T00:00:00.000Z,2022-10-31T01:00:10.000Z,94790,2022,2022-10,2022-10-31


In [11]:
# Collapse the raw rows so each calendar date appears once, summing its counts.
# The aggregation is named by the string "sum": passing the builtin `sum`
# callable to .agg() is deprecated in recent pandas and emits a FutureWarning.
src_grouped = (
    src.groupby(["date", "year", "month_year"])
    .agg({"day_count": "sum"})
    .reset_index()
)

#### Slim down and re-order the dataframe

In [12]:
# Keep only the reporting columns, in display order, with the newest date first.
column_order = ["year", "month_year", "date", "day_count"]
df = src_grouped[column_order].sort_values(by="date", ascending=False).copy()

In [13]:
# Preview the five most recent days.
df.head(n=5)

Unnamed: 0,year,month_year,date,day_count
6068,2022,2022-10,2022-10-31,94790
6067,2022,2022-10,2022-10-30,1334005
6066,2022,2022-10,2022-10-29,1668445
6065,2022,2022-10,2022-10-28,2524846
6064,2022,2022-10,2022-10-27,799050


#### When was the first mention? 

In [14]:
# Earliest date with at least one matching tweet. Taking the minimum of the
# ISO-formatted date strings gives the same answer as reading the last row of
# the descending frame, but no longer depends on how `df` happens to be sorted.
dates_with_mentions = df.loc[df["day_count"] > 0, "date"]
if dates_with_mentions.empty:
    # Fail with an explicit message rather than an opaque indexing error.
    raise ValueError(f"No tweets were counted for term {term!r}")

first = dates_with_mentions.min()
first

'2008-01-18'

#### How many total mentions? 

In [15]:
# Total count across every day in the frame.
df["day_count"].sum()

195696706

#### Average mentions per day?

In [16]:
# Mean daily count across every row of `df`.
df["day_count"].mean()

32245.296753995717

#### Make a new dataframe starting from first mention

In [17]:
# Drop the days before the first recorded mention.
on_or_after_first = df["date"] >= first
df_complete = df.loc[on_or_after_first]

#### Which day had the most mentions?

In [18]:
# Row(s) whose daily count equals the overall maximum.
peak_count = df_complete["day_count"].max()
df_complete.loc[df_complete["day_count"] == peak_count]

Unnamed: 0,year,month_year,date,day_count
5882,2022,2022-04,2022-04-28,3112530


In [19]:
# The ten days with the highest counts, largest first.
ranked_days = df_complete.sort_values(by="day_count", ascending=False)
ranked_days.iloc[:10]

Unnamed: 0,year,month_year,date,day_count
5882,2022,2022-04,2022-04-28,3112530
5880,2022,2022-04,2022-04-26,2689560
6065,2022,2022-10,2022-10-28,2524846
5879,2022,2022-04,2022-04-25,2189980
5881,2022,2022-04,2022-04-27,1877869
6066,2022,2022-10,2022-10-29,1668445
5883,2022,2022-04,2022-04-29,1351116
6067,2022,2022-10,2022-10-30,1334005
5445,2021,2021-02,2021-02-15,1224570
5904,2022,2022-05,2022-05-20,1100582


#### Chart it

In [26]:
# Area chart of daily counts, restricted to dates after 2010-01-01.
daily_since_2010 = df_complete[df_complete["date"] > "2010-01-01"]

daily_x = alt.X("date:T", axis=alt.Axis(format="%b. %Y", tickCount=5), title="")
daily_y = alt.Y("day_count:Q", title=" ", axis=alt.Axis(tickCount=5, format="0,M"))

(
    alt.Chart(daily_since_2010)
    .mark_area(color="red")
    .encode(x=daily_x, y=daily_y)
    .properties(width=900, height=300, title=f"Daily mentions of '{term}' on Twitter")
)

  for col_name, dtype in df.dtypes.iteritems():


---

## Aggregate 

#### Group by month/year

In [21]:
# Monthly totals, newest month first. The aggregation is named by the string
# "sum": passing the builtin `sum` callable to .agg() is deprecated in recent
# pandas and emits a FutureWarning.
months = (
    df_complete.groupby(["month_year"])
    .agg({"day_count": "sum"})
    .sort_values("month_year", ascending=False)
    .reset_index()
)

#### Chart it

In [22]:
# `months` is sorted newest-first, so the first 120 rows are the latest 120 months.
recent_months = months.head(120)

monthly_x = alt.X(
    "month_year:T", axis=alt.Axis(format="%b. %Y", tickCount=5), title=""
)
monthly_y = alt.Y(
    "day_count:Q", title=" ", axis=alt.Axis(tickCount=5, format="0,M")
)

(
    alt.Chart(recent_months)
    .mark_area()
    .encode(x=monthly_x, y=monthly_y)
    .properties(width=650, title=f"Monthly mentions of {term} on Twitter")
)

  for col_name, dtype in df.dtypes.iteritems():


## Exports

In [23]:
# Write the monthly and daily series from January 2015 onward to data/processed/.
months_since_2015 = months.loc[months["month_year"] > "2014-12"]
months_since_2015.to_csv(
    f"data/processed/twitter_mentions_{term}_months.csv", index=False
)

days_since_2015 = df_complete.loc[df_complete["date"] > "2014-12-31"]
days_since_2015.to_csv(
    f"data/processed/twitter_mentions_{term}_days.csv", index=False
)

In [24]:
# Preview the five most recent monthly totals.
months.head(n=5)

Unnamed: 0,month_year,day_count
0,2022-10,17016830
1,2022-09,3757259
2,2022-08,4400691
3,2022-07,5831406
4,2022-06,6547654


In [25]:
# Mean monthly total over the months strictly after 2021-01.
after_jan_2021 = months["month_year"] > "2021-01"
months.loc[after_jan_2021, "day_count"].mean()

7186733.80952381