In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

# Folder holding the preprocessed data, and the parquet file used by this notebook.
processed = Path("../data/processed")
datafile = processed.joinpath("whatsapp-20240122-182706.parq")  # add your own file here

# Only warn here (no exception): the load cell below is where a missing file actually fails.
datafile_missing = not datafile.exists()
if datafile_missing:
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")

Let's load the data.

In [None]:
# Read the preprocessed messages from the parquet file configured above.
df = pd.read_parquet(path=datafile)
df

Let's extract some more info from the timestamp:

In [None]:
# Derive two calendar features from the message timestamp:
#   date    - the calendar date of the message
#   isoweek - the ISO week number (no year attached, so weeks from
#             different years end up with the same value)
df = df.assign(
    date=lambda frame: frame["timestamp"].dt.date,
    isoweek=lambda frame: frame["timestamp"].dt.isocalendar().week,
)
df.head()


Now, we can group by the isoweeks, for example. 
Let's reindex in order to fill the missing weeks.

In [None]:
# One row per ISO week. count() tallies the non-null entries of every column,
# so the "timestamp" column holds the number of rows with a timestamp in that week.
p = df.groupby(by="isoweek").count()
p.head()

In [None]:
# Reindex over every ISO week between the first and last observed week, so that
# weeks without any messages show up with a count of 0 instead of being skipped.
# Note: this won't work if the data spans multiple years, because the ISO week
# number alone does not identify the year.
first_week = int(p.index.min())
last_week = int(p.index.max())
all_weeks = range(first_week, last_week + 1)
p = p.reindex(all_weeks, fill_value=0)

# Draw the points and a connecting line on the same axes.
sns.scatterplot(data=p, x="isoweek", y="timestamp")
sns.lineplot(data=p, x="isoweek", y="timestamp")

Try to play with the colors. Make sure adding colors conveys a message, and isn't just randomly adding colors.
Probably, a scatterplot and lineplot are not the best way to show the count of messages per author, per week, for your data, so you might need to modify that. It might not even be useful for your data to do this (e.g. because you have too many authors).

The main goal here is to find some way to visualise the count over time, per author, and to give you some starting point if that is relevant for your dataset.

In [None]:
# Count messages per author, per ISO week. The result is indexed by
# (author, isoweek); count() tallies non-null entries of every column, so the
# "timestamp" column is used as the message count.
p = df.groupby(["author", "isoweek"]).count()

# Colour the points by author; this call owns the legend.
sns.scatterplot(data=p, x="isoweek", y="timestamp", hue="author", legend=True)

# Draw one line per author, with the same hue mapping as the points. Without
# hue, lineplot aggregates all authors that share an isoweek into a single
# mean line with a confidence band, which is not a per-author view.
# legend=False avoids duplicating the legend already drawn by the scatterplot.
sns.lineplot(data=p, x="isoweek", y="timestamp", hue="author", legend=False)

In [None]:
# Label every message with the name of its weekday.
df["day_of_week"] = df["timestamp"].dt.day_name()

# Number of messages per calendar date; the weekday is kept as a grouping key
# so it is available as a column for faceting later on.
df_agg = (
    df.groupby(["date", "day_of_week"])
    .size()
    .reset_index(name="message_count")
)
df_agg

Sometimes, you want to group the timeseries in clusters, e.g. per day of the week (or per month, or per year, etc).
FacetGrid is a nice way to do this.

In [None]:
# Facet order: Monday first, Sunday last.
days_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

# One small panel per weekday, wrapped after four columns.
g = sns.FacetGrid(
    data=df_agg,
    col='day_of_week',
    col_order=days_order,
    col_wrap=4,
    height=3,
)

# Layer the points first, then the connecting line, in every panel.
for draw in (sns.scatterplot, sns.lineplot):
    g.map(draw, 'date', 'message_count')

# Remove the x tick marks and the left spine from every panel.
g.set(xticks=[])
g.despine(left=True, bottom=False)

Another nice plot is the area plot. Plotly has a nice version of this one.

In [None]:
import plotly.express as px

# NOTE(review): "timestamp_category" is not created in this notebook; presumably
# it is already present in the preprocessed parquet file - confirm.

# Step 1: number of messages for every (date, category) combination.
group_keys = ["date", "timestamp_category"]
p = df.groupby(group_keys).size().reset_index(name='message_count')

# Step 2: running total of those counts within each category.
per_category = p.groupby('timestamp_category')
p['cumulative_count'] = per_category['message_count'].cumsum()

# Step 3: area chart with one coloured band per category.
fig = px.area(
    data_frame=p,
    x="date",
    y="cumulative_count",
    color="timestamp_category",
    line_group="timestamp_category",
    labels={"cumulative_count": "Cumulative Message Count"},
)
fig.show()