In [None]:
# Standard library
import warnings
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from loguru import logger

# Hide FutureWarning messages for the rest of the session so they do not
# clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)


In [None]:
# Load the project configuration and read the processed dataset it points to.
import tomllib

configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)

# The dataset path is built from two config entries ("processed" and
# "current") and is resolved relative to the parent of the working directory.
datafile = (Path("..") / Path(config["processed"]) / config["current"]).resolve()
if not datafile.exists():
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")
    # Fail fast with the offending path instead of continuing into
    # pd.read_parquet, where the hint above would be followed by a less
    # specific library traceback.
    raise FileNotFoundError(datafile)
df = pd.read_parquet(datafile)
df.head()

In [None]:
# Histogram of message_length with a kernel density estimate drawn on top.
sns.histplot(data=df, x="message_length", kde=True)

In [None]:
import numpy as np

# Store the natural log of message_length in a new column and plot its
# distribution. np.log is applied to the whole column at once rather than
# through a per-row Python lambda.
# NOTE(review): a message_length of 0 yields -inf here; confirm the data
# contains no empty messages.
df["log_len"] = np.log(df["message_length"])
sns.histplot(df, x="log_len", kde=True)

In [None]:
# Show the first five rows of the frame.
df.head(n=5)

In [None]:
# Interquartile range of message length for every hour of the day, drawn as
# one grey segment per hour from the 25th to the 75th percentile, with a dot
# at each end.
def q1(x):
    """Return the 25th percentile of x."""
    return np.quantile(x, 0.25)


def q3(x):
    """Return the 75th percentile of x."""
    return np.quantile(x, 0.75)


df["hour"] = df["timestamp"].dt.hour
quantiles = df.groupby("hour")["message_length"].agg([q1, q3])
quantiles.columns = ["q1", "q3"]

# Guarantee one row for each hour 0-23. Hours without any message get 0 for
# both quartiles, so they appear as a single dot at x=0.
p = quantiles.reindex(range(24), fill_value=0)

fig, ax = plt.subplots()
# A single vectorised call draws all 24 segments; no per-row plotting call
# is needed for plain horizontal lines.
ax.hlines(y=p.index, xmin=p["q1"], xmax=p["q3"], colors="grey")
sns.scatterplot(data=p, x="q1", y=p.index, color="grey", ax=ax)
sns.scatterplot(data=p, x="q3", y=p.index, color="grey", ax=ax)
# The x-axis carries both quartiles, so label the axes explicitly instead of
# relying on the automatic label taken from the first plotted column.
ax.set(xlabel="berichtlengte (aantal karakters)", ylabel="uur van de dag");

In [None]:
# Spread of message length over the day: the median as a black line, with two
# translucent grey bands (25%-75% and 10%-90%) layered underneath.
def q1(x):
    """Return the 10th percentile of x."""
    return np.quantile(x, 0.1)


def q2(x):
    """Return the 25th percentile of x."""
    return np.quantile(x, 0.25)


def q3(x):
    """Return the 50th percentile (median) of x."""
    return np.quantile(x, 0.5)


def q4(x):
    """Return the 75th percentile of x."""
    return np.quantile(x, 0.75)


def q5(x):
    """Return the 90th percentile of x."""
    return np.quantile(x, 0.9)


df["hour"] = df["timestamp"].dt.hour
lengths_by_hour = df.groupby("hour")["message_length"]
quantiles = lengths_by_hour.agg([q1, q2, q3, q4, q5])
quantiles.columns = ["q1", "q2", "q3", "q4", "q5"]
# One row per hour 0-23; hours with no messages are filled with 0.
p = quantiles.reindex(range(24), fill_value=0)

hours = p.index
plt.plot(hours, p["q3"], color="black")
# Inner band first, then the outer band; where they overlap the two alpha
# layers stack, making the inner range darker.
for lower, upper in (("q2", "q4"), ("q1", "q5")):
    plt.fill_between(hours, p[lower], p[upper], color="grey", alpha=0.3)
plt.xlabel("uur van de dag")
plt.ylabel("quantiles van berichtlengte (aantal karakters)")
plt.title("Wanneer zijn berichten langer?")