# Analyzing Barstool Sports post metadata

#### Python tools and notebook settings

In [1]:
# Load the lab_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats code cells with Black — confirm it is installed in the target environment.
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles
import datetime as dt

In [3]:
# Register the altair_stiles theme under the name "stiles" ...
alt.themes.register("stiles", altstiles.theme)
# ... but activate the theme named "grid", not "stiles".
# NOTE(review): "grid" is not registered in this notebook; presumably altair_stiles
# registers it on import — confirm this is the intended theme.
alt.themes.enable("grid")

ThemeRegistry.enable('grid')

In [4]:
# Widen pandas' display limits so wide frames and long titles render in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell text

In [5]:
# Today's date as an ISO "YYYY-MM-DD" string.
today = dt.date.today().isoformat()

---

In [28]:
# Load the post metadata. `id` is read as a string; the three date columns
# are parsed to datetimes at read time.
# NOTE(review): absolute local path — consider a configurable data directory.
src_df = pd.read_csv(
    filepath_or_buffer="/Users/stiles/data/barstool.csv",
    dtype={"id": str},
    parse_dates=["week_start", "published_date", "timestamp"],
)

In [29]:
# Number of rows loaded from the CSV.
src_df.shape[0]

220793

#### Parse dates for aggregates

In [30]:
# Calendar parts of the publication date, used for the aggregates below.
src_df["year"] = src_df["published_date"].dt.year
src_df["month"] = src_df["published_date"].dt.month

# First day of the publication month as a datetime. Truncating through a
# monthly Period avoids building and re-parsing "YYYY-M-01" strings, and it
# keeps missing dates as NaT instead of raising on an unparseable "nan-nan-01".
src_df["month_year"] = src_df["published_date"].dt.to_period("M").dt.to_timestamp()

# Drop the date columns this analysis does not use. Rebinding (rather than
# inplace=True) with errors="ignore" keeps the cell safe to re-run.
src_df = src_df.drop(columns=["timestamp", "week_start"], errors="ignore")

In [32]:
# Peek at the first row to check the columns.
src_df.iloc[:1]

Unnamed: 0,id,type,title,author_name,comment_count,brand_name,branch_url,tags,published_date,published_hour,published_day,first_tag,category_name,year,month,month_year
0,3431690,standard_post,Nick Kyrgios Takes Down World No. 1 Daniil Medvedev And Is Now The Betting Favorite To Win The US Open,hubbs,15,,https://bars.tl/3431690,"['us-open', 'tennis', 'sports', 'nick-kyrgios']",2022-09-05,9,Monday,us-open,BarstoolU,2022,9,2022-09-01


#### Data for the last five years (posts published after 2017-06-23)

In [33]:
# Working copy limited to posts published after the cutoff date; .copy()
# detaches it from src_df so later column assignments don't touch the source.
df = src_df.loc[lambda frame: frame["published_date"] > "2017-06-23"].copy()

#### Tags

In [34]:
# Extract the first tag from the stringified list in `tags`
# (e.g. "['us-open', 'tennis']" -> "us-open"): keep the text before the first
# comma, then remove the list brackets and quote characters.
# The result is stored as `first_tag` (underscore), which is the column the
# tag-frequency grouping reads; the previous `first-tag` (hyphen) name created
# a separate column that nothing used.
df["first_tag"] = (
    df["tags"]
    .str.split(",", n=1)
    .str[0]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
)

In [35]:
# Remove the raw `tags` column from the working frame, in place.
del df["tags"]

---

## Posts by author

#### Group standard posts by author: count posts and sum comments

In [14]:
# Per-author totals for standard posts: number of posts and total comments.
# Named aggregation with the string "sum" replaces passing the builtin `sum`,
# which pandas has deprecated for agg(); it also produces the final column
# names directly, so no rename step is needed.
posts_grouped = (
    df.loc[df["type"] == "standard_post"]
    .groupby("author_name")
    .agg(post_count=("id", "count"), comment_count=("comment_count", "sum"))
    .reset_index()
)

In [15]:
# Per-author totals for standard posts published from January 2022 onward.
# Named aggregation with the string "sum" replaces passing the builtin `sum`,
# which pandas has deprecated for agg().
posts_grouped_2022 = (
    df.loc[(df["month_year"] >= "2022-01-01") & (df["type"] == "standard_post")]
    .groupby("author_name")
    .agg(post_count=("id", "count"), comment_count=("comment_count", "sum"))
    .reset_index()
)

#### Create a comments/post rate

In [16]:
# Average comments per post for each author, rounded to two decimals.
posts_grouped["comments_per_post"] = (
    posts_grouped["comment_count"].div(posts_grouped["post_count"]).round(2)
)

In [17]:
# Average comments per post for each author since January 2022, two decimals.
posts_grouped_2022["comments_per_post"] = (
    posts_grouped_2022["comment_count"].div(posts_grouped_2022["post_count"]).round(2)
)

In [18]:
# Ten authors with the most standard posts since January 2022.
posts_grouped_2022.sort_values(by="post_count", ascending=False).iloc[:10]

Unnamed: 0,author_name,post_count,comment_count,comments_per_post
70,Reags,844,27485,32.57
35,Greenie,463,16299,35.2
40,Jerry Thornton,439,21984,50.08
43,Jordie,411,10389,25.28
57,Matt Fitzgerald,391,13075,33.44
19,Clem,390,14267,36.58
30,Frank The Tank,381,10647,27.94
22,Dante,363,19345,53.29
97,hubbs,342,14617,42.74
80,Steven Cheah,248,8794,35.46


---

#### Post counts by month/year

In [19]:
# Monthly totals for standard posts from July 2017 onward: post count and
# total comments per month. Named aggregation with the string "sum" replaces
# passing the builtin `sum`, which pandas has deprecated for agg().
posts_grouped_months = (
    df.loc[(df["month_year"] >= "2017-07-01") & (df["type"] == "standard_post")]
    .groupby("month_year")
    .agg(post_count=("id", "count"), comment_count=("comment_count", "sum"))
    .reset_index()
)

In [20]:
# Average comments per post for each month, rounded to two decimals.
posts_grouped_months["comments_per_post"] = (
    posts_grouped_months["comment_count"]
    .div(posts_grouped_months["post_count"])
    .round(2)
)

#### Chart it

In [21]:
# Bar chart of standard posts per month; both axis titles are left blank
# because the chart title carries the description.
bars = (
    alt.Chart(data=posts_grouped_months)
    .mark_bar(size=7, color="#EB3E3E")
    .encode(
        alt.X("month_year", title="", axis=alt.Axis(tickCount=6)),
        alt.Y("post_count", title=" ", axis=alt.Axis(tickCount=6)),
    )
)

# Size and title the chart for display; `bars` itself stays unsized.
bars.properties(
    title="Barstool Sports: Standard blog posts per month",
    width=650,
    height=350,
)

---

#### Categories

In [22]:
# Post count and total comments per category for posts from July 2017 onward
# (all post types), ordered from most to fewest posts. Named aggregation with
# the string "sum" replaces passing the builtin `sum`, which pandas has
# deprecated for agg().
posts_grouped_categories = (
    df.loc[df["month_year"] >= "2017-07-01"]
    .groupby("category_name")
    .agg(post_count=("id", "count"), comment_count=("comment_count", "sum"))
    .reset_index()
    .sort_values("post_count", ascending=False)
)

In [23]:
# Five categories with the most posts.
posts_grouped_categories.sort_values(by="post_count", ascending=False).iloc[:5]

Unnamed: 0,category_name,post_count,comment_count
2,Boston,49355,918665
8,New York,27909,513797
0,BarstoolU,23129,432750
3,Chicago,14412,274620
5,DMV,7078,132068


---

#### Frequency of the first tag

In [24]:
# Post count and total comments per first tag for posts from July 2017 onward,
# ordered from most to fewest posts. Named aggregation with the string "sum"
# replaces passing the builtin `sum`, which pandas has deprecated for agg().
tags_grouped_categories = (
    df.loc[df["month_year"] >= "2017-07-01"]
    .groupby("first_tag")
    .agg(post_count=("id", "count"), comment_count=("comment_count", "sum"))
    .reset_index()
    .sort_values("post_count", ascending=False)
)

In [25]:
# Thirty most frequent first tags.
tags_grouped_categories.iloc[:30]

Unnamed: 0,first_tag,post_count,comment_count
20891,sports,7382,84217
18509,nba,4628,37288
18676,nfl,3572,23115
9113,chicago,1883,26847
18160,mlb,1753,22296
5379,baseball,1610,42977
14734,home,1601,33644
18710,nhl,1529,7236
20728,smokeshow,1318,352
9844,college football,1181,8127


In [26]:
# Each tag's share of all tagged posts, as a percentage rounded to three decimals.
tags_grouped_categories["post_count_share"] = (
    tags_grouped_categories["post_count"]
    .div(tags_grouped_categories["post_count"].sum())
    .mul(100)
    .round(3)
)

In [27]:
# Thirty most frequent first tags, now including their share of posts.
tags_grouped_categories.iloc[:30]

Unnamed: 0,first_tag,post_count,comment_count,post_count_share
20891,sports,7382,84217,5.653
18509,nba,4628,37288,3.544
18676,nfl,3572,23115,2.735
9113,chicago,1883,26847,1.442
18160,mlb,1753,22296,1.342
5379,baseball,1610,42977,1.233
14734,home,1601,33644,1.226
18710,nhl,1529,7236,1.171
20728,smokeshow,1318,352,1.009
9844,college football,1181,8127,0.904


---