In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings
# Hide every FutureWarning for the rest of the session, whichever library raises it.
warnings.simplefilter(action='ignore', category=FutureWarning)


Let's load the Parquet file we saved in notebook 1. You will need to change the filename in the config file!

In [None]:
import tomllib

# Read the settings from the TOML file that sits one directory above this notebook.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as config_handle:  # tomllib.load needs a binary file handle
    config = tomllib.load(config_handle)
config

In [None]:
# Build the path of the data file from the "processed" and "current" config entries,
# relative to the parent directory.
root = Path("..").resolve()
processed = root / config["processed"]
datafile = processed / config["current"]

# A missing file only produces a warning here; execution continues.
if not datafile.exists():
    logger.warning(f"{datafile} does not exist. First run src/preprocess.py, and check the timestamp!")

Note how datatypes have been preserved.

In [None]:
# Load the parquet file into a DataFrame and list the dtype of every column.
df = pd.read_parquet(datafile)
df.dtypes

In [None]:
# Preview the first rows of the frame.
df.head()

Let's count the number of messages per author.

In [None]:
# Number of (non-null) messages per author, busiest author first.
message_counts = df[["author", "message"]].groupby("author").count()
p1 = message_counts.sort_values("message", ascending=False)

# Keep only the k authors at the top of that ranking.
k = 15
topk = p1.head(k)

In [None]:
# Names of the top-k authors as a plain list, reused for filtering further down.
topk_authors = topk.index.tolist()

In [None]:
# Flag every message whose author is one of the top-k authors.
# Series.isin does the membership test in one vectorised call instead of
# scanning the list once per row in Python.
df["is_topk"] = df["author"].isin(topk_authors)
df.head()

In [None]:
# Bar chart of the message count for each of the top-k authors.
plt.figure(figsize=(12, 6))
sns.barplot(data=topk, x="message", y=topk.index)
plt.xticks(rotation=90)
plt.title("Sending the most messages...")

Maybe tweak the colors a bit

In [None]:
# Authors with at least 1000 messages are drawn in red, everyone else in grey.
custom_palette = {0: "grey", 1: "red"}
colors = [1 if count >= 1000 else 0 for count in topk.message]
sns.barplot(
    data=topk,
    x="message",
    y=topk.index,
    hue=colors,
    palette=custom_palette,
    legend=False,
)
plt.title("Sending the most messages...")

Let's calculate the average message length.

In [None]:
# Length of every message in characters, then the mean length per author (longest first).
df["message_length"] = df["message"].str.len()
mean_lengths = df[["author", "message_length"]].groupby("author").mean()
p1 = mean_lengths.sort_values("message_length", ascending=False)

k = 15
# NOTE(review): the slice starts at position 1, so the author with the highest
# mean length is left out of the chart — confirm this is intentional.
topk = p1.iloc[1:k]
sns.barplot(data=topk, x="message_length", y=topk.index)
plt.xlabel("Average message length")
plt.title("Sending the longest messages...")

Create a simple regex to look for links in the messages and add that as a feature:

In [None]:
# A message counts as containing a link when the pattern "http" occurs anywhere in it.
has_link = r"http"
df["has_link"] = df["message"].str.contains(has_link)

# Mean of the flag per author, i.e. the share of that author's messages with a link.
link_share = df[["author", "has_link"]].groupby("author").mean()
p1 = link_share.sort_values("has_link", ascending=False)

k = 15
topk = p1.head(k)
sns.barplot(data=topk, x="has_link", y=topk.index)
plt.xlabel("Fraction of messages with a link")
plt.title("Most links by...")

Aggregate the emojis per user (can you change between sum and mean?)

In [None]:
# Per author: the sum and the mean of the has_emoji column, ranked by the sum.
# (has_emoji is expected to be present in the loaded data already.)
emoji_stats = df[["author", "has_emoji"]].groupby("author").agg(["sum", "mean"])
p2 = emoji_stats.sort_values(("has_emoji", "sum"), ascending=False)

# Drop the outer 'has_emoji' level so the columns are simply 'sum' and 'mean'.
p2.columns = p2.columns.droplevel(0)

# k is reused from the cells above.
topk = p2.head(k)
sns.barplot(data=topk, x="mean", y=topk.index)
plt.xlabel("Average number of messages with an emoji")
plt.title("Are emoji's non-verbal?")

Let's add a category based on the time of day at which authors send a message:

In [None]:
# Boundaries of the day parts as 'HH:MM'. The last edge is 24:00 so that the
# whole of minute 23:59 still falls inside the final ('late') bin.
time_ranges = ['00:00', '06:00', '08:00', '17:30', '22:00', '24:00']
# One label per interval between two consecutive boundaries
categories = ['night', 'morning', 'worktimes', 'evening', 'late']

# Bin on numbers (minutes since midnight) rather than on time strings, so the
# comparison against the edges is numeric instead of character by character.
bin_edges = [int(h) * 60 + int(m) for h, m in (t.split(':') for t in time_ranges)]
minutes_since_midnight = df['timestamp'].dt.hour * 60 + df['timestamp'].dt.minute

# right=False makes every interval closed on the left: 06:00 is 'morning', 08:00 is 'worktimes', ...
df['timestamp_category'] = pd.cut(minutes_since_midnight, bins=bin_edges, labels=categories, right=False)

# Preview the result instead of rendering the whole frame
df.head()

Now we can group and count the categories:

In [None]:
# Group the dataframe by 'author' and 'timestamp_category', and count the occurrences
p3 = df.groupby(['author', 'timestamp_category']).size().unstack()

# Calculate the fraction of each category for every author
p3_frac = p3.div(p3.sum(axis=1), axis=0)
p3_frac

Let's use Plotly to create a stacked bar chart:

In [None]:
# Reshape from wide to long: 'author' stays an identifier column, every other
# column is unpivoted into rows.
p4 = pd.melt(p3_frac.reset_index(), id_vars="author")
p4.head()

In [None]:
# Keep only the rows that belong to the authors in topk_authors.
p4_filtered = p4.loc[p4["author"].isin(topk_authors)]

In [None]:
import plotly.express as px

# Stacked bar chart: one bar per author, one coloured segment per time-of-day category.
fig = px.bar(
    p4_filtered,
    x="value",
    y="author",
    color="timestamp_category",
    barmode="stack",
)
fig.show()

Let's extract the earliest and latest time of day of the messages for every author, and convert them to decimal hours (for example, 06:30 becomes 6.5):

In [None]:
# Time of day of every message, without the date. Despite its name, the
# 'hour' column holds the full time value rather than just the hour number.
df["hour"] = df["timestamp"].dt.time

# Earliest and latest time of day per author, with 'author' as a regular column.
hours_by_author = df.groupby("author")["hour"]
summary_df = hours_by_author.agg(["min", "max"]).reset_index()

def convert_to_decimal_hours(timestamp):
    """Express a time of day as a fractional number of hours.

    Reads the ``hour``, ``minute`` and ``second`` attributes of ``timestamp``
    (for example a ``datetime.time``); anything finer than seconds is not used.

    Returns a float, e.g. 6.5 for 06:30:00.
    """
    whole_hours = timestamp.hour
    minute_part = timestamp.minute / 60
    second_part = timestamp.second / 3600
    return whole_hours + minute_part + second_part

# Add the earliest and latest time as decimal hours, then discard the raw
# 'min' and 'max' time columns they were derived from.
summary_df = summary_df.assign(
    min_x_values=summary_df["min"].apply(convert_to_decimal_hours),
    max_x_values=summary_df["max"].apply(convert_to_decimal_hours),
).drop(columns=["min", "max"])
summary_df.head()

With this, we can create a nice barbell chart. Try to add colors for your own chart!

In [None]:
# Create a larger plot
plt.figure(figsize=(10, 8))

# Create scatter plots
sns.scatterplot(data=summary_df, x='min_x_values', y='author', color='grey')
sns.scatterplot(data=summary_df, x='max_x_values', y='author', color='grey')

# Add lines
for index, row in summary_df.iterrows():
    plt.plot([row['min_x_values'], row['max_x_values']], [row['author'], row['author']], color='grey')


# Adjust the font size of the y-axis labels if needed
plt.yticks(fontsize=10)

# Show the plot
plt.show()

Another approach for comparing is to create a heatmap:

In [None]:
# Column labels for the heatmap, in the order used by dt.dayofweek (0 = Monday ... 6 = Sunday).
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

df['day_of_week'] = df['timestamp'].dt.dayofweek

# Messages per (author, weekday), pivoted so that every weekday is a column.
author_day_counts = (
    df.groupby(['author', 'day_of_week'])
    .size()
    .unstack(fill_value=0)
    # Guarantee exactly one column per weekday, in order, even when a weekday
    # never occurs in the data; otherwise the seven labels would not line up.
    .reindex(columns=pd.Index(range(7), name='day_of_week'), fill_value=0)
)
# Row-normalise: the share of each author's messages sent on each weekday.
author_day_percentages = author_day_counts.div(author_day_counts.sum(axis=1), axis=0)

filtered = author_day_percentages.loc[topk_authors]
# Hand the day names to heatmap itself so the labels sit on the cell centres;
# afterwards only the rotation of the existing tick labels is changed.
sns.heatmap(filtered, annot=True, fmt=".2f", linewidths=.5, cmap="vlag", xticklabels=day_names)
plt.xticks(rotation=45)
plt.title("Heatmap")

Let's save all the new features we added:

In [None]:
# Show the path that the next cell writes to.
datafile

In [None]:
# Write the frame, including the columns added in this notebook, to the same
# path it was loaded from; the index is not stored.
df.to_parquet(datafile, index=False)