In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings
# Ignore every warning of category FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import tomllib
# Resolve ../config.toml (relative to the working directory) to an absolute
# path and parse its TOML content into a plain dict.
configfile = Path("../config.toml").resolve()
config = tomllib.loads(configfile.read_bytes().decode())
config


In [None]:
# Build the absolute path of the preprocessed file from the two config entries
# "processed" (directory) and "current" (file name).
processed_dir = Path("..") / Path(config["processed"])
datafile = (processed_dir / config["current"]).resolve()

# Only warn here; nothing is raised at this point if the file is missing.
datafile_missing = not datafile.exists()
if datafile_missing:
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")

Let's load the data.

In [None]:
# Read the preprocessed parquet file located by `datafile` into a DataFrame
# and display it.
df = pd.read_parquet(datafile)
df

Let's extract some more info from the timestamp:

In [None]:
# Derive calendar helper columns from the message timestamp:
#   date      -> plain calendar date
#   isoweek   -> ISO-8601 week number
#   year-week -> "<year>-<week>" label built with %W (weeks start on Monday)
# NOTE: %W is not the ISO week number, so "isoweek" and "year-week" can disagree.
timestamps = df["timestamp"]
df["date"] = timestamps.dt.date
df["isoweek"] = timestamps.dt.isocalendar().week
df["year-week"] = timestamps.dt.strftime("%Y-%W")
df.head()


In [None]:
# Unique authors of all rows whose "is_topk" flag is set.
is_top_author = df["is_topk"]
topk = list(df.loc[is_top_author, "author"].unique())
topk

Now, we can group by the isoweeks, for example. 
Let's reindex in order to fill the missing weeks.

In [None]:
# Remove the row with index label 0.
# NOTE(review): presumably this is a non-message entry at the start of the
# export - confirm against your own data before keeping this line.
# errors="ignore" keeps the cell idempotent: re-running it after the row is
# already gone no longer raises a KeyError.
df = df.drop(index=[0], errors="ignore")

In [None]:
# Count the non-null values of every column per "year-week" label; the
# "timestamp" column of the result is used further down as the weekly count.
p = df.groupby("year-week").count()
p.head()

In [None]:
# Build one label per calendar week between the first and the last message and
# reindex the weekly counts on it, so weeks without messages show up as 0.
#
# The labels are derived from every *day* in the range and then de-duplicated.
# Sampling a single day per week (freq="W", i.e. Sundays) misses labels that
# do occur in df["year-week"]: the final partial week, and the days between
# the last Monday of December and New Year (e.g. "2020-52"). reindex() would
# silently drop the counts of those weeks.
min_ts = df["timestamp"].min()
max_ts = df["timestamp"].max()
new_index = (
    pd.date_range(
        start=min_ts.normalize(),  # strip the time of day so the last day is always included
        end=max_ts.normalize(),
        freq="D",
        name="year-week",
    )
    .strftime("%Y-%W")
    .unique()  # keeps first-seen (chronological) order and the index name
)
p = p.reindex(new_index, fill_value=0)
p.head()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Rolling mean of the weekly message count. With window=1 the line simply
# follows the raw counts; increase the window to smooth it.
p["moving_avg"] = p["timestamp"].rolling(window=1).mean()

# Raw weekly counts as dots, rolling mean as a line on the same axes.
sns.scatterplot(data=p, x=p.index, y="timestamp", ax=ax)
sns.lineplot(data=p, x=p.index, y="moving_avg", ax=ax)

# Label only every `interval`-th week to keep the x-axis readable.
interval = 4
xticks = p.index[::interval]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks, rotation=45, ha='right')
ax.set_title("Messages over time")


Try to play with the colors. Make sure that adding colors conveys a message, and isn't just adding colors at random.
Probably, a scatterplot and lineplot are not the best way to show the count of messages per author, per week, for your data, so you might need to modify that. It might not even be useful for your data to do this (e.g. because you have too many authors).

The main goal here is to find some way to visualise the count over time, per author, and to give you some starting point if that is relevant for your dataset.

In [None]:
# Weekly counts per author, restricted to the authors listed in `topk`
# (drop the filter to count every author instead).
from_topk = df["author"].isin(topk)
p = df.loc[from_topk].groupby(["author", "year-week"]).count()
p

In [None]:
# Every (author, week) combination for the topk authors; combinations that are
# absent from the grouped counts are filled with 0.
index_levels = [topk, new_index]
multi_index = pd.MultiIndex.from_product(index_levels, names=["author", "year-week"])
p = p.reindex(multi_index, fill_value=0)
p

In [None]:
# Dots and lines share the same data mapping; only the scatterplot draws a
# legend, so each author is listed once.
shared_kwargs = dict(data=p, x="year-week", y="timestamp", hue="author")
sns.scatterplot(legend=True, **shared_kwargs)
sns.lineplot(legend=False, **shared_kwargs)

# put legend outside the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.xticks(rotation=45, ha='right');

In [None]:
# Name of the weekday for every message, then the number of messages per
# (date, weekday) pair as a flat table.
df["day_of_week"] = df["timestamp"].dt.day_name()
messages_per_day = df.groupby(['date', 'day_of_week']).size()
df_agg = messages_per_day.reset_index(name='message_count')
df_agg

Sometimes, you want to group the timeseries in clusters, e.g. per day of the week (or per month, or per year, etc.).
FacetGrid is a nice way to do this.

In [None]:
# Panels are laid out Monday-to-Sunday instead of in order of appearance.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One small panel per weekday, wrapped after four columns.
g = sns.FacetGrid(data=df_agg, col='day_of_week', col_order=days_order, col_wrap=4, height=3)

# Draw the daily counts as dots first, then connect them with a line.
for plotter in (sns.scatterplot, sns.lineplot):
    g.map(plotter, 'date', 'message_count')

# Remove the x tick marks and the left spine of every panel.
g.set(xticks=[])
g.despine(left=True, bottom=False)

Another nice plot is the area plot. Plotly has a nice version of this one.

In [None]:
import plotly.express as px
# Number of messages for every (date, category) pair, as a flat table.
messages_per_category = df.groupby(["date", "timestamp_category"]).size()
p = messages_per_category.reset_index(name='message_count')

# Running total within each category, in row (date) order.
p['cumulative_count'] = p.groupby('timestamp_category')['message_count'].cumsum()

# Area chart of the running totals, one colored band per category.
fig = px.area(
    p,
    x="date",
    y="cumulative_count",
    color="timestamp_category",
    line_group="timestamp_category",
    labels={"cumulative_count": "Cumulative Message Count"},
)
fig.show()

# Code for two authors

I have experimented with two authors, and explored the question "who is the first (or last) of the day to send a message?".
I didn't tweak this for group chats.

In [None]:
# Per date: earliest and latest timestamp, plus the author of the first and
# the last row of that date.
# NOTE(review): 'first'/'last' follow row order, so this assumes df is sorted
# by timestamp - confirm.
aggregations = {
    'timestamp': ['min', 'max'],
    'author': ['first', 'last'],
}
df_grouped = df.groupby('date').agg(aggregations)

# Flatten the two-level column index: ('timestamp', 'min') -> 'timestamp_min'.
df_grouped.columns = ["_".join(levels) for levels in df_grouped.columns]
df_grouped

In [None]:
# Latest timestamp of the previous row, i.e. of the preceding date that is
# present in the data (missing for the very first row).
previous_last = df_grouped['timestamp_max'].shift()
df_grouped['timestamp_max_shifted'] = previous_last

# Gap between that timestamp and the first message of the current date.
df_grouped['time_difference'] = df_grouped['timestamp_min'] - previous_last
df_grouped

In [None]:
# This only works with two authors: one counts as +1, the other as -1.
# Authors that are not in the mapping become NaN.
mapping = {'author1': 1, 'author2': -1}

# Score the author of the last / first message of every day ...
for source, target in (('author_last', 'last_message'), ('author_first', 'first_message')):
    df_grouped[target] = df_grouped[source].map(mapping)

# ... and accumulate the scores over time.
for source, target in (('last_message', 'last_balance'), ('first_message', 'first_balance')):
    df_grouped[target] = df_grouped[source].cumsum()

# One line per balance on the same axes.
for column, label in (('last_balance', "last message"), ('first_balance', "first message")):
    sns.lineplot(data=df_grouped, x='date', y=column, label=label)

plt.xticks(rotation=45)
plt.ylabel("cumulative balance")
plt.suptitle("Who sends the first or last message?")
plt.title("author1 +, author2 -")


In [None]:
# Show the first rows of df_grouped.
df_grouped.head()

In [None]:
def time_to_decimal(time_obj):
    """Express a time of day as fractional hours.

    Args:
        time_obj: any object with `hour`, `minute` and `second` attributes
            (e.g. `datetime.time`, `datetime.datetime`, `pandas.Timestamp`).

    Returns:
        The hour plus the minutes and seconds converted to fractions of an
        hour, e.g. 01:30:00 -> 1.5.
    """
    minutes_in_hours = time_obj.minute / 60
    seconds_in_hours = time_obj.second / 3600
    return time_obj.hour + minutes_in_hours + seconds_in_hours

# Time of day (in fractional hours) of the first and the last message per date.
for source, target in (("timestamp_min", "decimal_first_time"), ("timestamp_max", "decimal_last_time")):
    df_grouped[target] = df_grouped[source].apply(time_to_decimal)


In [None]:

# Distribution of the time of day at which the first message of each date was
# sent, split by the author who sent it.
fig, ax = plt.subplots(figsize=(12, 6))

# Bin edges on whole hours: bin h covers [h, h+1) and the last bin ends at 24,
# so every value of decimal_first_time (0 <= t < 24) is counted. Edges that
# stop at 23.5 would silently leave out everything sent after 23:30, because
# values outside the outermost edges are not counted.
hour_edges = list(range(0, 25))
sns.histplot(
    data=df_grouped,
    x='decimal_first_time',
    hue='author_first',
    common_norm=False,  # normalise each author separately
    fill=True,
    bins=hour_edges,
    multiple="dodge",
    kde=True,
    ax=ax,
)
ax.set_xticks(hour_edges);

In [None]:
# Distribution of the time of day at which the last message of each date was
# sent, split by the author who sent it.
fig, ax = plt.subplots(figsize=(12, 6))

# Bin edges on whole hours: bin h covers [h, h+1) and the last bin ends at 24,
# so every value of decimal_last_time (0 <= t < 24) is counted. Edges that
# stop at 23.5 would silently leave out everything sent after 23:30, because
# values outside the outermost edges are not counted.
hour_edges = list(range(0, 25))
sns.histplot(
    data=df_grouped,
    x='decimal_last_time',
    hue='author_last',
    common_norm=False,  # normalise each author separately
    fill=True,
    bins=hour_edges,
    multiple="dodge",
    kde=True,
    ax=ax,
)
ax.set_xticks(hour_edges);

In [None]:
import numpy as np
def decimal_delta(td):
    """Convert a time difference to fractional hours.

    Args:
        td: a timedelta-like object exposing `total_seconds()`, or the plain
            number 0.

    Returns:
        0 when `td` compares equal to 0, otherwise the duration in hours
        (total seconds divided by 3600).
    """
    return 0 if td == 0 else td.total_seconds() / 3600

# Pair every message with the one that follows it (the next row).
# NOTE(review): assumes df is ordered by timestamp - confirm.
df["next_author"] = df["author"].shift(-1)
df["next_timestamp"] = df["timestamp"].shift(-1)

# Time until the next message, as a timedelta and in fractional hours.
# The last row has no successor and therefore gets NaT / NaN.
df["reaction_time"] = df["next_timestamp"] - df["timestamp"]
df["decimal_reaction_time"] = df["reaction_time"].dt.total_seconds() / 3600

# A row counts as answered ("reply") when the next message comes from a
# different author. Vectorised column comparison instead of a row-wise
# df.apply(axis=1), which is far slower and gives the same result.
df["reply"] = df["author"].ne(df["next_author"])
df.head()

In [None]:
# Mean reaction time (in hours) per week and replying author, shown as a wide
# table with one column per author.
# `p` is (re)computed here: when the notebook runs top to bottom, the `p` left
# over from an earlier cell has no 'next_author' index level and unstack()
# would raise.
p = df[df.reply].groupby(['year-week', 'next_author']).agg({'decimal_reaction_time': 'mean'})
p.unstack(level='next_author')

In [None]:
# Mean reaction time per week for every replying author, on a log scale.
fig, ax = plt.subplots(figsize=(12, 6))
p = df[df.reply].groupby(['year-week', 'next_author']).agg({'decimal_reaction_time': 'mean'})

sns.scatterplot(data=p, x='year-week', y='decimal_reaction_time', hue='next_author', ax=ax)
# legend=False: the scatterplot already adds one legend entry per author;
# without it every author would be listed twice.
sns.lineplot(data=p, x='year-week', y='decimal_reaction_time', hue='next_author', legend=False, ax=ax)

ax.set_yscale('log')
ax.set_ylabel("mean reaction time (hours)")
ax.set_title("Mean reaction time per week")

# Label only every `interval`-th week to keep the x-axis readable.
interval = 4
xticks = p.reset_index()['year-week'].unique()[::interval]
ax.set_xticks(xticks)
ax.set_xticklabels(xticks, rotation=45, ha='right');

In [None]:
# Density of the reaction time (in hours) per replying author, using only
# rows flagged as a reply.
p = df.loc[df["reply"]]

kde_kwargs = {"data": p, "x": 'decimal_reaction_time', "hue": 'next_author'}
sns.kdeplot(**kde_kwargs)

# Show the first 24 hours only; switch the y-axis to a log scale with
# plt.yscale('log') if the tails are hard to see.
plt.xlim((0, 24))