In [23]:
import pandas as pd
from pathlib import Path

# 1. Setup: paths
# Relative location of the data folder: one directory up, then data/.
DATA_DIR = Path("..", "data")

# 2. Load data
# created_at is parsed to datetime on read so the .dt accessor works later.
posts = pd.read_csv(
    DATA_DIR.joinpath("posts_with_predictions.csv"),
    parse_dates=["created_at"],
)

# Impressions are joined to posts in the CTR section further down; users is
# loaded alongside for later extensions.
impr = pd.read_csv(DATA_DIR.joinpath("feed_impressions.csv"))
users = pd.read_csv(DATA_DIR.joinpath("users.csv"))

# Print the schema, then show the first rows as the cell's output.
print(posts.dtypes)
posts.head()


post_id                      int64
author_id                    int64
group_id                     int64
is_original                   bool
is_stolen                     bool
created_at          datetime64[ns]
media_type                  object
text                        object
like_count                   int64
comment_count                int64
share_count                  int64
post_text_clean             object
best_match_index             int64
best_match_score           float64
pred_is_stolen                bool
dtype: object


Unnamed: 0,post_id,author_id,group_id,is_original,is_stolen,created_at,media_type,text,like_count,comment_count,share_count,post_text_clean,best_match_index,best_match_score,pred_is_stolen
0,1,2986,1,True,False,2024-08-04,image,Content group 1 original post about topic 9,37,9,1,content group 1 original post about topic 9,4,1.0,True
1,2,559,2,True,False,2024-07-17,image,Content group 2 original post about topic 18,2,0,0,content group 2 original post about topic 18,0,0.744558,False
2,3,1170,3,True,False,2024-07-24,video,Content group 3 original post about topic 23,28,4,2,content group 3 original post about topic 23,8,1.0,False
3,4,21,4,True,False,2024-07-26,video,Content group 4 original post about topic 34,20,4,2,content group 4 original post about topic 34,112,0.746047,False
4,5,130,5,True,False,2024-07-05,video,Content group 5 original post about topic 4,9,1,0,content group 5 original post about topic 4,0,1.0,False


# Daily content mix over time

In [24]:
# 3. Add a simple "day" column from created_at
# .dt.date drops the time-of-day part, leaving plain Python date objects.
posts["day"] = posts["created_at"].dt.date

# 4. Daily content mix (overall)
# One row per day. Counts and shares use pandas' built-in "sum"/"mean"
# reducers instead of per-group Python lambdas: on boolean columns "sum"
# counts the True values and "mean" is the True fraction, so the numbers are
# the same while the aggregation stays vectorized.
daily_content_mix = (
    posts
    .groupby("day", as_index=False)
    .agg(
        total_posts=("post_id", "size"),
        stolen_posts=("is_stolen", "sum"),
        stolen_post_pct=("is_stolen", "mean"),
        predicted_stolen_posts=("pred_is_stolen", "sum"),
        predicted_stolen_post_pct=("pred_is_stolen", "mean"),
    )
    .assign(
        # Every post is either stolen or not, so originals = total - stolen.
        original_posts=lambda df: df["total_posts"] - df["stolen_posts"],
        # Convert the True fractions into percentages.
        stolen_post_pct=lambda df: 100 * df["stolen_post_pct"],
        predicted_stolen_post_pct=lambda df: 100 * df["predicted_stolen_post_pct"],
    )
    # Keep the column order the exported CSV has always had.
    .loc[:, [
        "day",
        "total_posts",
        "stolen_posts",
        "original_posts",
        "stolen_post_pct",
        "predicted_stolen_posts",
        "predicted_stolen_post_pct",
    ]]
)

daily_content_mix.head()


Unnamed: 0,day,total_posts,stolen_posts,original_posts,stolen_post_pct,predicted_stolen_posts,predicted_stolen_post_pct
0,2024-07-01,34,1,33,2.941176,0,0.0
1,2024-07-02,42,7,35,16.666667,0,0.0
2,2024-07-03,60,12,48,20.0,4,6.666667
3,2024-07-04,39,11,28,28.205128,4,10.25641
4,2024-07-05,44,10,34,22.727273,2,4.545455


In [29]:
# Smooth both percentage series with a trailing 7-row mean (the table holds
# one row per day). min_periods=1 lets the first rows average over whatever
# history is available instead of producing NaN.
daily_content_mix = daily_content_mix.assign(
    stolen_post_pct_smooth=(
        daily_content_mix["stolen_post_pct"]
        .rolling(window=7, min_periods=1)
        .mean()
    ),
    pred_stolen_post_pct_smooth=(
        daily_content_mix["predicted_stolen_post_pct"]
        .rolling(window=7, min_periods=1)
        .mean()
    ),
)


In [30]:
# Write the daily content-mix table to the data folder, without the row index.
daily_content_mix.to_csv(
    DATA_DIR.joinpath("time_daily_content_mix.csv"),
    index=False,
)

# Daily stolen share by creator country

In [31]:
# 6. Daily metrics by creator country
# NOTE(review): `posts_with_country` is not created in any cell visible here;
# presumably it is `posts` joined to `users` to attach a `country` column.
# TODO: confirm and restore the defining cell so Restart & Run All works.
daily_country = (
    posts_with_country
    .groupby(["day", "country"], as_index=False)
    .agg(
        total_posts=("post_id", "size"),
        stolen_posts=("is_stolen", "sum"),
        stolen_post_pct=("is_stolen", lambda s: 100 * s.mean())
    )
    # Order rows country-by-country, chronologically within each country, so
    # the rolling window below walks each country's days in date order.
    .sort_values(["country", "day"])
    .reset_index(drop=True)
)

# 7. Add 7-day rolling average (smooth) per country
# A column-wise transform replaces groupby(...).apply(df.assign(...)): apply
# on the whole frame touches the grouping column, which pandas has deprecated.
# transform returns a Series aligned to the original rows, so it can be
# assigned directly; min_periods=1 averages over the days available so far.
daily_country["stolen_post_pct_smooth"] = (
    daily_country
    .groupby("country")["stolen_post_pct"]
    .transform(lambda s: s.rolling(7, min_periods=1).mean())
)

daily_country.head()


  .apply(lambda df: df.assign(


Unnamed: 0,day,country,total_posts,stolen_posts,stolen_post_pct,stolen_post_pct_smooth
0,2024-07-01,BR,7,0,0.0,0.0
1,2024-07-02,BR,8,2,25.0,12.5
2,2024-07-03,BR,7,0,0.0,8.333333
3,2024-07-04,BR,4,1,25.0,12.5
4,2024-07-05,BR,6,1,16.666667,13.333333


In [32]:
# Write the per-country daily table to the data folder, without the row index.
daily_country.to_csv(
    DATA_DIR.joinpath("time_daily_stolen_by_country.csv"),
    index=False,
)

# Daily CTR over time: stolen vs. original posts

In [34]:
# 7. CTR by stolen vs original (using created_at as proxy for time)
# Attach each impression's post label and creation timestamp. A left join
# keeps every impression row, including any without a matching post.
ctr_join = pd.merge(
    impr,
    posts[["post_id", "is_stolen", "created_at"]],
    how="left",
    on="post_id",
)

# Derived columns: calendar day of the post, a readable post-type label, and
# an integer click indicator that can be summed.
ctr_join = ctr_join.assign(
    day=ctr_join["created_at"].dt.date,
    post_type=ctr_join["is_stolen"].map({True: "Stolen", False: "Original"}),
    clicked_flag=ctr_join["clicked"].astype(int),
)

# Impressions and clicks per (day, post_type).
daily_ctr_by_post_type = ctr_join.groupby(
    ["day", "post_type"], as_index=False
).agg(
    impressions=("impression_id", "count"),
    clicks=("clicked_flag", "sum"),
)

# Click-through rate as a percentage: clicks are scaled by 100 first, then
# divided by impressions.
daily_ctr_by_post_type["ctr_pct"] = (
    daily_ctr_by_post_type["clicks"]
    .mul(100)
    .div(daily_ctr_by_post_type["impressions"])
)

# 7-day rolling average for CTR (per post_type)
# transform keeps the original row labels, so the assignment lines up with
# the unsorted table even though the window runs over the sorted copy.
ctr_sorted = daily_ctr_by_post_type.sort_values(["post_type", "day"])
daily_ctr_by_post_type["ctr_pct_smooth"] = (
    ctr_sorted
    .groupby("post_type")["ctr_pct"]
    .transform(lambda ctr: ctr.rolling(7, min_periods=1).mean())
)

daily_ctr_by_post_type.to_csv(
    DATA_DIR.joinpath("time_daily_ctr_by_post_type.csv"),
    index=False,
)

daily_ctr_by_post_type.head()


Unnamed: 0,day,post_type,impressions,clicks,ctr_pct,ctr_pct_smooth
0,2024-07-01,Original,21531,3259,15.136315,15.136315
1,2024-07-01,Stolen,772,130,16.839378,16.839378
2,2024-07-02,Original,23245,3430,14.755861,14.946088
3,2024-07-02,Stolen,4168,625,14.995202,15.91729
4,2024-07-03,Original,31482,4779,15.180103,15.024093


In [35]:
# 8. Build wide table for CTR gap: Original minus Stolen (7-day avg)
# Pivot the long (day, post_type) table so each day is one row with the
# smoothed CTR of each post type in its own column.
ctr_gap = (
    daily_ctr_by_post_type
    .pivot(index="day", columns="post_type", values="ctr_pct_smooth")
    .rename(columns={
        "Original": "ctr_pct_original",
        "Stolen": "ctr_pct_stolen",
    })
    # pivot leaves "post_type" as the name of the columns axis, which would
    # otherwise show up as a stray header when the frame is displayed.
    .rename_axis(None, axis="columns")
    .reset_index()
)

# Positive gap: original posts have the higher smoothed CTR that day.
ctr_gap["ctr_gap"] = ctr_gap["ctr_pct_original"] - ctr_gap["ctr_pct_stolen"]

ctr_gap.to_csv(DATA_DIR / "time_daily_ctr_gap.csv", index=False)

ctr_gap.head()


post_type,day,ctr_pct_original,ctr_pct_stolen,ctr_gap
0,2024-07-01,15.136315,16.839378,-1.703063
1,2024-07-02,14.946088,15.91729,-0.971202
2,2024-07-03,15.024093,15.600093,-0.576
3,2024-07-04,15.095961,15.348794,-0.252832
4,2024-07-05,15.056633,15.146382,-0.089749
