# How often is 'x' mentioned on Twitter?
##### *Counts pulled with the Twarc library. [Check it out](https://twarc-project.readthedocs.io/en/latest/twarc2_en_us/).*

#### Load Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt

In [3]:
# Let pandas render up to 1,000 columns and 1,000 rows before truncating.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

---

#### Read data

In [4]:
# Search term the counts refer to; reused in the chart title and export name.
mentioned = "USC"

# Load the daily counts, parsing both window bounds as datetimes.
src = pd.read_csv(
    "../data/raw/usc_mentions_daily.csv",
    parse_dates=["start", "end"],
)

# Newest day first.
src = src.sort_values(by="start", ascending=False)

#### First five rows

In [5]:
# Peek at the five most recent days.
src.head(5)

Unnamed: 0,start,end,day_count
30,2022-04-14 00:00:00+00:00,2022-04-14 16:27:39+00:00,4865
29,2022-04-13 00:00:00+00:00,2022-04-14 00:00:00+00:00,9993
28,2022-04-12 00:00:00+00:00,2022-04-13 00:00:00+00:00,6610
27,2022-04-11 00:00:00+00:00,2022-04-12 00:00:00+00:00,6719
26,2022-04-10 00:00:00+00:00,2022-04-11 00:00:00+00:00,5398


#### Process dates

In [6]:
# Convert the window start once, then derive the three string-formatted
# date columns from it.
start_ts = pd.to_datetime(src["start"])
src = src.assign(
    year=start_ts.dt.strftime("%Y"),
    month_year=start_ts.dt.strftime("%Y-%m"),
    date=start_ts.dt.strftime("%Y-%m-%d"),
)

#### Slim down and re-order the dataframe

In [7]:
# Independent copy holding only the columns used below, in display order.
keep_columns = ["year", "date", "month_year", "day_count"]
df = src.loc[:, keep_columns].copy()

In [8]:
# Last five rows of the slimmed frame.
df.tail(5)

Unnamed: 0,year,date,month_year,day_count
5863,2006,2006-03-25,2006-03,0
5862,2006,2006-03-24,2006-03,0
5861,2006,2006-03-23,2006-03,0
5860,2006,2006-03-22,2006-03,0
5859,2006,2006-03-21,2006-03,0


#### When was the first mention? 

In [9]:
# Earliest date with at least one mention. The dates are zero-padded
# "%Y-%m-%d" strings, so their string minimum is also the chronological
# minimum; taking min() keeps the answer independent of the frame's sort order.
df.loc[df["day_count"] > 0, "date"].min()

'2006-11-26'

#### Define that as a variable

In [10]:
# Earliest date with at least one mention, kept for filtering further down.
# The dates are zero-padded "%Y-%m-%d" strings, so their string minimum is
# also the chronological minimum; taking min() keeps the result independent
# of the frame's sort order.
first = df.loc[df["day_count"] > 0, "date"].min()

#### How many total mentions? 

In [11]:
# Sum of the daily counts across every row.
df["day_count"].sum()

28895654

#### Average mentions per day?

In [12]:
# Mean daily count, rounded to a whole number. It is taken over every row of
# df, which includes the zero-count days that precede the first mention.
round(df["day_count"].mean())

4923

#### Make a new dataframe starting from first mention

In [13]:
# Keep only rows dated on or after the first mention. The ISO-formatted date
# strings order correctly when compared as plain text.
df_all = df.loc[df["date"].ge(first)].copy()

#### Which day had the most mentions?

In [14]:
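# https://twitter.com/elonmusk/status/1463828765151272962
# NOTE(review): this link looks like a leftover from another notebook; the peak
# day found below is 2012-11-25 — confirm it is relevant or remove it.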

In [15]:
# Row(s) whose daily count equals the overall maximum.
df_all.loc[df_all["day_count"].eq(df_all["day_count"].max())]

Unnamed: 0,year,date,month_year,day_count
3423,2012,2012-11-25,2012-11,136304


In [16]:
# Five busiest days, highest count first.
df_all.sort_values(by="day_count", ascending=False).head(5)

Unnamed: 0,year,date,month_year,day_count
3423,2012,2012-11-25,2012-11,136304
1947,2017,2017-01-03,2017-01,125522
3477,2012,2012-09-16,2012-09,114214
1694,2017,2017-09-03,2017-09,109665
3111,2013,2013-09-29,2013-09,93364


#### Chart it

In [17]:
# alt.Chart(source).mark_area().encode(
#     x="year:T",
#     y="net_generation:Q",
# )

In [18]:
# Altair raises MaxRowsError for frames above 5,000 rows by default, which is
# presumably why this chart used to be fed only the newest 3,000 rows. That
# slice silently dropped the early years (including the 2012 record day found
# above), so lift the limit and chart the full history instead.
alt.data_transformers.disable_max_rows()

# Daily mentions as a red area chart: no grid on the time axis, a light grid
# and hidden domain line on the count axis, and no border around the view.
alt.Chart(df_all).mark_area(color="red").encode(
    x=alt.X(
        "date:T", axis=alt.Axis(grid=False, tickCount=6, format="%b. %Y"), title="Day"
    ),
    y=alt.Y(
        "day_count:Q",
        axis=alt.Axis(
            domainOpacity=0,
            gridWidth=0.6,
            gridColor="#dddddd",
            offset=6,
            tickSize=0,
            tickCount=6,
        ),
        title="Daily mentions",
    ),
).properties(
    width=700, height=300, title=f"Mentions of {mentioned} on Twitter"
).configure_view(
    strokeOpacity=0
)

---

## Aggregate 

#### Group by month and year

In [19]:
# Preview of the monthly totals: sum the daily counts within each
# year / month_year pair and show the first five groups.
(
    df_all.groupby(["year", "month_year"], as_index=False)["day_count"]
    .sum()
    .head()
)

Unnamed: 0,year,month_year,day_count
0,2006,2006-11,5
1,2006,2006-12,9
2,2007,2007-01,13
3,2007,2007-02,3
4,2007,2007-03,42


In [20]:
# One row per year / month_year pair holding the summed daily counts, with
# the grouping keys kept as ordinary columns.
df_all_months = df_all.groupby(["year", "month_year"], as_index=False)[
    "day_count"
].sum()

#### Which month had the most mentions?

In [21]:
# Month row(s) whose total equals the highest monthly total.
max_count = df_all_months.loc[
    df_all_months["day_count"].eq(df_all_months["day_count"].max())
]

In [22]:
# Show the row(s) for the month with the highest total.
max_count

Unnamed: 0,year,month_year,day_count
72,2012,2012-11,635142


In [23]:
# Red area chart of the monthly totals, 700 pixels wide.
(
    alt.Chart(df_all_months)
    .mark_area(color="red")
    .encode(
        x=alt.X("month_year:T"),
        y=alt.Y("day_count:Q"),
    )
    .properties(width=700)
)

---

#### Exports

In [24]:
# Write the monthly totals to the processed-data folder. The file name embeds
# the search term lower-cased with spaces removed; the index is left out.
export_path = (
    f"../data/processed/twitter_mentions_{mentioned.lower().replace(' ', '')}.csv"
)
df_all_months.to_csv(export_path, index=False)