In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger

In [None]:
# Directory holding the output of src/preprocess.py.
processed = Path("../data/processed")
# Exactly one export is loaded. The earlier assignment to
# "whatsapp-20240122-182706.parq" was overwritten on the very next line,
# so it was a dead store and has been removed to make the effective input obvious.
datafile = processed / "whatsapp-20240122-222233.parq"
# Only warn here (best effort); reading the file later will fail loudly if it is missing.
if not datafile.exists():
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")

In [None]:
# Load the preprocessed chat export and show the dtype of every column.
df = pd.read_parquet(path=datafile)
df.dtypes

In [None]:
# Preview the first rows (5 is the pandas default for head()).
df.head(n=5)

In [None]:
# Number of (non-null) messages per author, most active author first.
message_counts = df[['author', 'message']].groupby("author").count()
p1 = message_counts.sort_values("message", ascending=False)

# Horizontal bars: one bar per author, bar length = message count.
sns.barplot(data=p1, x="message", y=p1.index)
plt.xticks(rotation=90);

In [None]:
# Sum and mean of `has_emoji` per author, ordered by the sum (largest first).
# The aggregation yields two-level column labels ('has_emoji', 'sum'/'mean');
# the outer level is dropped so the columns are simply 'sum' and 'mean'.
p2 = (
    df[['author', 'has_emoji']]
    .groupby('author')
    .agg(['sum', 'mean'])
    .sort_values(('has_emoji', 'sum'), ascending=False)
    .droplevel(0, axis=1)
)
sns.barplot(data=p2, x="sum", y=p2.index)

In [None]:
# Edges of the time-of-day bins, written as HH:MM.
# The last edge is the end of the day ('24:00'): with half-open bins an edge of
# '23:59' would leave every message sent during 23:59:xx without a category (NaN).
time_ranges = ['00:00', '08:00', '17:30', '22:00', '24:00']

# Category labels, one per bin (so one fewer than the number of edges)
categories = ['early morning', 'worktimes', 'evening', 'late']

# Bin on numbers (whole minutes since midnight) instead of on stringified times,
# so the result does not depend on lexicographic string comparison.
bin_edges = [int(hh) * 60 + int(mm) for hh, mm in (edge.split(':') for edge in time_ranges)]
minutes_since_midnight = df['timestamp'].dt.hour * 60 + df['timestamp'].dt.minute

# Categorize the timestamp column; right=False makes every bin [start, end),
# e.g. 17:30:00 belongs to 'evening', not to 'worktimes'.
df['timestamp_category'] = pd.cut(
    minutes_since_midnight,
    bins=bin_edges,
    labels=categories,
    right=False,
)

# Display a preview of the updated dataframe
df.head()

In [None]:
# Count the messages per (author, timestamp_category) and spread the categories
# over the columns, giving one row per author.
# `timestamp_category` is categorical (it comes from pd.cut), so `observed` is
# passed explicitly: observed=False keeps every category for every author and
# avoids depending on the changing pandas default for categorical groupers.
p3 = df.groupby(['author', 'timestamp_category'], observed=False).size().unstack()

# Divide each row by its total so every author's categories sum to 1
p3_frac = p3.div(p3.sum(axis=1), axis=0)
p3_frac

In [None]:
import plotly.express as px

# Reshape to long format: 'author' stays as identifier, every other column of
# p3_frac becomes a row with its fraction in the 'value' column.
p3_frac_flat = p3_frac.reset_index()
p4 = pd.melt(p3_frac_flat, id_vars='author')

# One horizontal bar per author, split into coloured segments per time category.
fig = px.bar(
    p4,
    x="value",
    y="author",
    color="timestamp_category",
    barmode="stack",
)
fig.show()

In [None]:
# Keep only the time-of-day part of every timestamp (despite the column name,
# this is a full datetime.time value, not just the hour).
df['hour'] = df['timestamp'].dt.time

# Earliest and latest time of day found for each author, with 'author' as a column.
summary_df = (
    df.groupby('author')['hour']
    .agg(['min', 'max'])
    .reset_index()
)


def convert_to_decimal_hours(timestamp):
    """Express a time of day as a fractional number of hours.

    Args:
        timestamp: object exposing ``hour``, ``minute`` and ``second``
            attributes (e.g. ``datetime.time``).

    Returns:
        Hours plus minutes/60 plus seconds/3600, e.g. 01:30:00 -> 1.5.
        Anything finer than whole seconds is not taken into account.
    """
    return timestamp.hour + timestamp.minute / 60 + timestamp.second / 3600

# Turn the per-author earliest/latest times into decimal hours for plotting,
# then discard the original 'min' and 'max' columns, which are no longer needed.
summary_df = summary_df.assign(
    min_x_values=summary_df['min'].apply(convert_to_decimal_hours),
    max_x_values=summary_df['max'].apply(convert_to_decimal_hours),
).drop(columns=['min', 'max'])
summary_df.head()

In [None]:
# Range plot: for every author, the earliest and latest time of day in the data,
# joined by a horizontal line.
# An explicit figure/axes pair is used so every call draws on the same axes.
fig, ax = plt.subplots(figsize=(10, 8))

# End points of each range
sns.scatterplot(data=summary_df, x='min_x_values', y='author', color='grey', ax=ax)
sns.scatterplot(data=summary_df, x='max_x_values', y='author', color='grey', ax=ax)

# Connect the two end points of every author
for row in summary_df.itertuples(index=False):
    ax.plot([row.min_x_values, row.max_x_values], [row.author, row.author], color='grey')

# Replace the internal column name on the x-axis with a readable label and add a
# title, so the figure can be understood on its own.
ax.set(
    xlabel="Time of day (decimal hours)",
    title="Earliest and latest message time per author",
)

# Adjust the font size of the y-axis labels if needed
ax.tick_params(axis='y', labelsize=10)

# Show the plot
plt.show()

In [None]:
# Weekday of every message as an integer (pandas convention: Monday=0 ... Sunday=6).
df['day_of_week'] = df['timestamp'].dt.dayofweek

# Messages per author per weekday; combinations that never occur count as 0.
author_day_counts = df.groupby(['author', 'day_of_week']).size().unstack(fill_value=0)

# Normalise every row by its total: each author's weekdays sum to 1.
author_day_percentages = author_day_counts.div(author_day_counts.sum(axis=1), axis=0)

# The values are fractions in [0, 1]; annotate them as percentages (".1%")
# because one-decimal floats (".1f") would collapse nearly all cells to 0.0-0.2.
sns.heatmap(author_day_percentages, annot=True, fmt=".1%", linewidths=.5, cmap="vlag");