In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta
from src.utils import HistoricalDiscordMessage, hist_msg_list_to_pandas_df

# Config

In [2]:
# Resampling frequency handed to pandas `.resample()`:
#   "D"  -> daily buckets
#   "MS" -> month-start buckets
FREQ = "D"

# Generate data

In [3]:
# --- Parameters for the synthetic message history ---------------------------
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same data set.
RANDOM_SEED = 42
start_date = datetime(2023, 1, 1, 0, 0, 0)
end_date = datetime(2023, 4, 1, 0, 0, 0)
channels = [10112, 20112, 30112]
users = [6513, 6003, 7777, 9999, 1211]
n_items = 5000

random.seed(RANDOM_SEED)

# Length of the sampling window in whole seconds. This is loop-invariant, so it
# is computed once here instead of on every iteration.
window_seconds = int((end_date - start_date).total_seconds())

# --- Build the list of HistoricalDiscordMessage records ---------------------
hist_list = []
for i in range(n_items):
    # Random timestamp inside the window. `random.randint` is inclusive on both
    # ends, so a timestamp exactly equal to `end_date` can be drawn.
    rand_time = start_date + timedelta(seconds=random.randint(0, window_seconds))
    # Random channel, author and reaction count (0..6 inclusive).
    channel = random.choice(channels)
    user = random.choice(users)
    reactions = random.randint(0, 6)
    # Wrap the sampled values in a message record; the timestamp is passed as
    # an ISO-8601 string.
    historical_msg = HistoricalDiscordMessage(
        date_time=rand_time.isoformat(),
        channel_id=channel,
        author_id=user,
        reactions=reactions
    )
    hist_list.append(historical_msg)

# Convert to df

In [4]:
# Turn the generated message records into a DataFrame using the project helper.
# NOTE(review): later cells call `.resample()` on this frame, so the helper is
# presumably expected to return it indexed by `date_time` - confirm in src.utils.
df = hist_msg_list_to_pandas_df(hist_list=hist_list)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,channel_id,author_id,reactions
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-27 20:59:18,10112,6003,2
2023-01-31 20:17:59,30112,6513,1
2023-02-23 18:36:25,10112,7777,4
2023-01-11 13:22:28,10112,6513,2
2023-03-31 13:38:12,30112,6003,1


# Summarise counts

In [5]:
# Per (channel, author) pair and per FREQ time bucket, compute:
#   messages  - number of messages in the bucket
#   reactions - total reactions across those messages
group_keys = ["channel_id", "author_id"]
summary_df = (
    df.groupby(group_keys)
    .resample(FREQ)
    .agg(
        messages=("reactions", "count"),
        reactions=("reactions", "sum"),
    )
    .reset_index()
)

summary_df.head()

Unnamed: 0,channel_id,author_id,date_time,messages,reactions
0,10112,1211,2023-01-01,4,11
1,10112,1211,2023-01-02,7,18
2,10112,1211,2023-01-03,4,15
3,10112,1211,2023-01-04,5,23
4,10112,1211,2023-01-05,3,5


In [6]:
# Totals per time bucket, summed across all channels and authors.
out_df = summary_df.groupby(by="date_time", as_index=True)[["reactions", "messages"]].sum()

# Number of distinct authors who actually posted in each bucket.
# `summary_df` can contain rows with messages == 0: resampling each
# (channel, author) group emits a row for every bucket inside that group's time
# span, including empty ones. Counting `author_id` over all rows would therefore
# report authors as active in buckets where they wrote nothing, so only rows
# with at least one message are counted.
active_rows = summary_df[summary_df["messages"] > 0]
out_df["unique_authors"] = (
    active_rows.groupby(by="date_time")["author_id"]
    .nunique()
    # Buckets with no active author at all are missing from the grouped result;
    # reindex so they show 0 instead of NaN and the column stays integer-typed.
    .reindex(out_df.index, fill_value=0)
)
out_df

Unnamed: 0_level_0,reactions,messages,unique_authors
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01,214,69,5
2023-01-02,129,51,5
2023-01-03,133,45,5
2023-01-04,187,66,5
2023-01-05,148,50,5
...,...,...,...
2023-03-27,163,60,5
2023-03-28,188,67,5
2023-03-29,148,50,5
2023-03-30,103,40,5
