In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Directory holding the preprocessed parquet files (per the warning below they
# are expected to come from src/preprocess.py).
processed = Path("../data/processed")
# The filename embeds the timestamp of a preprocessing run, so it has to be
# updated by hand whenever preprocessing is re-run.
datafile = processed / "whatsapp-20240122-222233.parq"
if not datafile.exists():
    # Only warn here; the notebook keeps going and the load cell will fail if the file is really missing.
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")

In [None]:
# Load the preprocessed data from the parquet file selected in the previous cell.
df = pd.read_parquet(datafile)
# Bare last expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Smallest and largest value of the timestamp column, i.e. the time span the data covers.
df["timestamp"].agg(["min", "max"])

In [None]:
# Derive calendar helper columns from the timestamp (added to df in place, later cells read "isoweek").
# NOTE(review): isocalendar().week carries no year, so equal week numbers from
# different years fall into the same bucket - confirm the data spans under a year.
df["date"] = df["timestamp"].dt.date
df["isoweek"] = df["timestamp"].dt.isocalendar().week

# Non-null entries per column for every ISO week; "isoweek" becomes the index.
p = df.groupby("isoweek").count()

# Insert the week numbers that never occur, with a count of 0, so gaps show up as zeros in the plot.
first_week, last_week = p.index.min(), p.index.max()
p = p.reindex(range(first_week, last_week + 1), fill_value=0)

# Markers plus connecting line on the same axes; "isoweek" is the index of p
# here and seaborn is relied on to resolve it by name.
weekly_plot_kwargs = dict(data=p, x="isoweek", y="timestamp")
sns.scatterplot(**weekly_plot_kwargs)
sns.lineplot(**weekly_plot_kwargs)

In [None]:
# Step 1: Build the full author x isoweek grid, so every author gets a row for
# every week number that occurs in the data, including weeks in which that
# author has no entries.
all_authors = df["author"].unique()
all_isoweeks = df["isoweek"].unique()
all_combinations = pd.MultiIndex.from_product(
    [all_authors, all_isoweeks], names=["author", "isoweek"]
)
complete_df = pd.DataFrame(index=all_combinations).reset_index()

# Step 2: Left-merge the original rows onto the grid. Combinations without any
# entry survive as a single placeholder row whose remaining columns are missing.
merged_df = pd.merge(complete_df, df, on=["author", "isoweek"], how="left")

# Step 3: Keep the placeholders as missing values on purpose. count() only
# tallies non-null entries, so a later groupby(["author", "isoweek"]).count()
# reports 0 for an empty combination. Filling them with 0 would turn each
# placeholder into a non-null value that is counted as one entry, and would
# also put an integer into the timestamp column.

In [None]:
# Collapse to one row per (author, isoweek) pair; count() reports, per column,
# how many non-null entries the pair has. reset_index() then turns the two
# group keys back into ordinary columns so they can be used as plot variables.
author_week_groups = merged_df.groupby(["author", "isoweek"])
p = author_week_groups.count().reset_index()


In [None]:
# No hue here: all rows of p that share an isoweek feed a single line, so this
# is the across-author summary rather than one line per author.
# NOTE(review): for repeated x values seaborn presumably draws the mean with a
# confidence band by default - confirm against the lineplot documentation.
sns.lineplot(data=p, x="isoweek", y="timestamp", legend=False)

In [None]:
# One line per author (hue), legend suppressed. lineplot hands back the Axes it
# drew on, so keep it for the recolouring below.
ax = sns.lineplot(data=p, x="isoweek", y="timestamp", hue="author", legend=False)

# Override the per-author hue colours: every line is painted the same grey.
for author_line in ax.lines:
    author_line.set_color("grey")
