# Historical tweet counts, by term

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import glob

In [3]:
# Register the altair_stiles theme under the name "stiles".
alt.themes.register("stiles", altstiles.theme)
# NOTE(review): the theme enabled here is "grid", not the "stiles" name registered
# above. Presumably "grid" is made available elsewhere (e.g. by altair_stiles) — confirm.
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Let pandas render up to 1,000 columns and 1,000 rows before truncating output.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

---

## Search for terms

In [5]:
# Human-readable label for this search; it is interpolated into the monthly chart title.
idea = "@RadioShack mentions"

In [6]:
# Search terms to count. Each term is queried separately and its counts are saved
# to its own CSV under data/raw/.
terms = [
    "@RadioShack",
]

#### Get daily counts for each term

In [7]:
# Query twarc2 for daily tweet counts per term (from 2015-01-01, with --archive) and
# write each result to data/raw/<term>.csv. Re-running this cell repeats the API
# queries; the saved run took about 1m43s.
#
# The query is single-quoted, but the output path is not, so the shell removes any
# double quotes a term contains from the filename. The read cell strips '"' from the
# path to match.
# NOTE(review): a term containing a space outside double quotes would split the
# output path into two arguments — confirm terms are single words or quoted phrases.
for term in terms:
    !twarc2 counts --start-time '2015-01-01' --granularity 'day' '{term}' --archive --csv data/raw/{term}.csv

100%|████████████| Processed 7 years/7 years [01:43<00:00, 562592 tweets total ]


#### Read all those files into a dataframe

In [8]:
# Load each term's raw counts CSV, tag every row with its term, and collect the frames.
data_list = []

for term in terms:
    # Strip double quotes from the path before reading.
    csv_path = f"data/raw/{term}.csv".replace('"', "")
    dataframe = pd.read_csv(csv_path)
    dataframe["search_term"] = term
    data_list.append(dataframe)

In [9]:
# Stack the per-term frames row-wise. Row labels are kept as-is (not renumbered),
# so they repeat when more than one term is loaded.
src = pd.concat(data_list, axis=0, ignore_index=False)

In [10]:
# Preview the first five rows of the combined counts.
src.head(n=5)

Unnamed: 0,start,end,day_count,search_term
0,2022-06-03T00:00:00.000Z,2022-06-04T00:00:00.000Z,185,@RadioShack
1,2022-06-04T00:00:00.000Z,2022-06-05T00:00:00.000Z,88,@RadioShack
2,2022-06-05T00:00:00.000Z,2022-06-06T00:00:00.000Z,92,@RadioShack
3,2022-06-06T00:00:00.000Z,2022-06-07T00:00:00.000Z,33,@RadioShack
4,2022-06-07T00:00:00.000Z,2022-06-08T00:00:00.000Z,33,@RadioShack


#### Convert dates

In [11]:
# For both bucket edges: parse the timestamp, convert it to US/Eastern, and keep only
# the calendar date as a "YYYY-MM-DD" string in <edge>_est.
for edge in ("start", "end"):
    eastern = pd.to_datetime(src[edge]).dt.tz_convert("US/Eastern")
    src[f"{edge}_est"] = eastern.dt.strftime("%Y-%m-%d")

In [12]:
# Derive the grouping keys from the Eastern-time date of each bucket's end.
# All three are strings: "YYYY", "YYYY-MM" and "YYYY-MM-DD".
end_day = pd.to_datetime(src["end_est"])

src["year"] = end_day.dt.strftime("%Y")
src["month_year"] = end_day.dt.strftime("%Y-%m")
src["date"] = end_day.dt.strftime("%Y-%m-%d")

In [13]:
# Show the row(s) for the most recent date in the data.
latest_date = src["date"].max()
src.loc[src["date"].eq(latest_date)]

Unnamed: 0,start,end,day_count,search_term,start_est,end_est,year,month_year,date
30,2022-07-03T00:00:00.000Z,2022-07-03T17:40:58.000Z,9106,@RadioShack,2022-07-02,2022-07-03,2022,2022-07,2022-07-03


In [14]:
# Total the counts per date (summing across search terms). year and month_year are
# carried along as group keys so they survive into the result.
# The aggregation is named by string: passing the builtin `sum` to .agg is
# deprecated in recent pandas and emits a FutureWarning.
src_grouped = (
    src.groupby(["date", "year", "month_year"])
    .agg({"day_count": "sum"})
    .reset_index()
)

#### Slim down and re-order the dataframe

In [15]:
# Keep only the reporting columns, in display order, sorted newest date first.
column_order = ["year", "month_year", "date", "day_count"]
df = src_grouped.loc[:, column_order].sort_values(by="date", ascending=False).copy()

In [16]:
# Preview the five most recent days (df is sorted newest first).
df.head(n=5)

Unnamed: 0,year,month_year,date,day_count
2740,2022,2022-07,2022-07-03,9106
2739,2022,2022-07,2022-07-02,15490
2738,2022,2022-07,2022-07-01,16518
2737,2022,2022-06,2022-06-30,49739
2736,2022,2022-06,2022-06-29,23851


#### When was the first mention? 

In [17]:
# Earliest date with at least one mention. Dates are "YYYY-MM-DD" strings, so the
# smallest string is the earliest day. Using .min() keeps this independent of how
# df happens to be sorted (the previous tail(1) relied on the descending sort).
first = df.loc[df["day_count"] > 0, "date"].min()
first

'2015-01-01'

#### How many total mentions? 

In [18]:
# Total mentions across every day in the data.
df["day_count"].sum()

562592

#### Average mentions per day?

In [19]:
# Mean of the daily counts over every day in the data.
df["day_count"].mean()

205.2506384531193

#### Make a new dataframe starting from first mention

In [20]:
# Keep only days on or after the first day with a mention.
df_complete = df.loc[df["date"] >= first]

#### Which day had the most mentions?

In [21]:
# Show the day(s) whose count equals the overall maximum.
peak_count = df_complete["day_count"].max()
df_complete.loc[df_complete["day_count"] == peak_count]

Unnamed: 0,year,month_year,date,day_count
2737,2022,2022-06,2022-06-30,49739


In [22]:
# Ten busiest days, highest count first.
df_complete.sort_values(by="day_count", ascending=False).iloc[:10]

Unnamed: 0,year,month_year,date,day_count
2737,2022,2022-06,2022-06-30,49739
2736,2022,2022-06,2022-06-29,23851
2738,2022,2022-07,2022-07-01,16518
2739,2022,2022-07,2022-07-02,15490
2676,2022,2022-04,2022-04-30,11490
2740,2022,2022-07,2022-07-03,9106
2725,2022,2022-06,2022-06-18,7466
2682,2022,2022-05,2022-05-06,6856
2722,2022,2022-06,2022-06-15,6840
2724,2022,2022-06,2022-06-17,5804


In [23]:
# Total mentions for dates after 2022-01-01.
# NOTE(review): the strict ">" leaves out 2022-01-01 itself — confirm whether a
# full-year total (">=") was intended.
df_complete.loc[df_complete["date"] > "2022-01-01", "day_count"].sum()

236144

#### Chart it

In [31]:
# Bar chart of daily mention counts.
# The title is built from `terms` rather than the loop variable `term` left over from
# an earlier cell: the plotted counts are summed across all terms, and the chart
# should not depend on leftover loop state. For a single term the title is unchanged.
# NOTE(review): the strict ">" filter drops 2015-01-01 from the chart — confirm intent.
term_label = ", ".join(terms)

alt.Chart(df_complete[df_complete["date"] > "2015-01-01"]).mark_bar(color="red").encode(
    x=alt.X("date:T", axis=alt.Axis(format="%b. %Y", tickCount=5), title=" "),
    y=alt.Y("day_count:Q", title=" ", axis=alt.Axis(tickCount=5, format="0,M")),
).properties(width=650, height=300, title=f"Daily mentions of '{term_label}' on Twitter")

---

## Aggregate 

#### Groupby month/year

In [25]:
# Total the daily counts per "YYYY-MM" month, newest month first.
# The aggregation is named by string: passing the builtin `sum` to .agg is
# deprecated in recent pandas and emits a FutureWarning.
months = (
    df_complete.groupby(["month_year"])
    .agg({"day_count": "sum"})
    .sort_values("month_year", ascending=False)
    .reset_index()
)

#### Chart it

In [26]:
# Area chart of monthly totals. `months` is sorted newest first, so head(120) keeps
# the most recent 120 months.
# `idea` already ends in "mentions", so the title no longer repeats the word
# (it previously rendered as "Monthly mentions of @RadioShack mentions on Twitter").
alt.Chart(months.head(120)).mark_area().encode(
    x=alt.X("month_year:T", axis=alt.Axis(format="%b. %Y", tickCount=5), title=""),
    y=alt.Y("day_count:Q", title=" ", axis=alt.Axis(tickCount=5, format="0,M")),
).properties(width=650, title=f"Monthly {idea} on Twitter")

## Exports

In [27]:
# Write the processed tables to data/processed/.
# The filename label comes from `terms` instead of the loop variable `term` left over
# from an earlier cell; for a single term the filenames are unchanged.
file_label = "_".join(terms)

# Monthly totals for every month after 2014-12.
months[months["month_year"] > "2014-12"].to_csv(
    f"data/processed/twitter_mentions_{file_label}_months.csv", index=False
)
# Daily counts for dates after 2022-01-01.
# NOTE(review): the strict ">" leaves out 2022-01-01 itself — confirm intent.
df_complete[df_complete["date"] > "2022-01-01"].to_csv(
    f"data/processed/twitter_mentions_{file_label}_days.csv", index=False
)

In [28]:
# Preview the five most recent months (months is sorted newest first).
months.head(n=5)

Unnamed: 0,month_year,day_count
0,2022-07,41114
1,2022-06,112116
2,2022-05,56670
3,2022-04,18905
4,2022-03,799


In [29]:
# Mean monthly total for months after 2021-01.
# NOTE(review): the strict ">" leaves out 2021-01 itself — confirm intent.
months.loc[months["month_year"] > "2021-01", "day_count"].mean()

13416.277777777777