In [None]:
from pathlib import Path
from loguru import logger
import pandas as pd
from datetime import datetime

# Directory that holds the preprocessed exports, relative to this notebook.
processed = Path("../data/processed")
# The filename carries a timestamp (see the warning text below); adjust it to match your own file.
datafile = processed.joinpath("whatsapp-20240903-151052.csv")
if not datafile.exists():
    # Only a warning: the notebook keeps going and the read step will fail if the file is really missing.
    logger.warning("Datafile does not exist. First run src/preprocess.py, and check the timestamp!")

Read in the file

In [None]:
# parse_dates asks pandas to read the "timestamp" column as datetimes instead of plain strings.
df = pd.read_csv(datafile, parse_dates=["timestamp"])
df.head()

Check the datatypes. Note the timestamp type!

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

Sometimes, author names have a tilde in front of them, along with some unicode. Let's clean that.

In [None]:
import re  # not needed by this cell any more; kept so cells relying on `re` being in scope still work

# A leading "~" followed by U+202F (narrow no-break space) at the start of the name.
clean_tilde = r"^~\u202f"
# Vectorised string replace: same regex substitution as re.sub per row, but
# missing authors (NaN) are passed through instead of raising a TypeError.
df["author"] = df["author"].str.replace(clean_tilde, "", regex=True)

Let's check how many unique authors we have

In [None]:
# Count distinct authors; dropna=False keeps a missing value as its own entry, like unique() does.
df["author"].nunique(dropna=False)

Let's make the authors anonymous

In [None]:
import json
from wa_analyzer.humanhasher import humanize

authors = df.author.unique()
# original author name -> pseudonym returned by humanize
anon = {k: humanize(k) for k in authors}

# invert the dictionary: pseudonym -> original author name
ref = {v: k for k, v in anon.items()}
# `anon` always has one entry per unique author, so checking its length proves
# nothing. If two authors receive the same pseudonym, one of them is silently
# overwritten in `ref`; check the inverted mapping, and do it before writing.
assert len(ref) == len(authors), "you lost some authors!"

# sort alphabetically:
ref_sorted = {k: ref[k] for k in sorted(ref)}

# we save a reference file so we can look up the original author names if we want to
# NOTE: this file contains the real names, so keep it out of anything you share.
reference_file = processed / "anon_reference.json"
with open(reference_file, "w") as f:
    # save as json:
    json.dump(ref_sorted, f)


In [None]:
# Look up every author in `anon` and store the result in a new column.
df = df.assign(anon_author=df["author"].map(anon))
df.head()

We can now drop the original author column

In [None]:
# Rebind `df` to a frame without the original names instead of mutating it in place.
df = df.drop(columns=["author"])

Check if it's gone

In [None]:
# Preview the first rows; the original "author" column should no longer be listed.
df.head()

And let's rename the column

In [None]:
# The anonymised column takes over the name "author".
df = df.rename(columns={"anon_author": "author"})

In [None]:
# Preview the first rows after the rename.
df.head()

In my case, the first line is a header, saying messages are encrypted. Let's remove that. Your data might be different, so double check if you also want to remove the first line!

In [None]:
# Drops the row whose index *label* is 0 (label-based, not positional).
df = df.drop(index=0)

Let's check:

In [None]:
# Preview the first rows; the row that had index label 0 should be gone.
df.head()

Let's find emojis in the text and add that as a feature.

In [None]:
import re

# Character class of the code points treated as emoji. The ranges are listed
# individually on purpose: a single wide span such as U+24C2..U+1F251 also
# contains CJK, Hangul and other ordinary scripts, so plain text in those
# languages would be reported as containing emoji.
emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                            u"\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                            u"\U00002600-\U000026FF"  # miscellaneous symbols
                            u"\U00002702-\U000027B0"  # Dingbats
                            u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                            u"\U000024C2"             # circled M
                            u"\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
                            u"\U0000FE0F"             # variation selector-16 (requests emoji presentation)
                            "]+", flags=re.UNICODE)

def has_emoji(text):
    """Return True if ``text`` contains at least one character from ``emoji_pattern``.

    Non-string values (for example the NaN pandas uses for an empty message)
    are reported as False instead of raising a TypeError.
    """
    if not isinstance(text, str):
        return False
    return bool(emoji_pattern.search(text))

# One boolean per message: the result of has_emoji for that row.
df["has_emoji"] = df["message"].map(has_emoji)

Let's create a timestamp for a new, unique, filename.

In [None]:
# Current local time down to the second, in the same layout as the input filename.
now = f"{datetime.now():%Y%m%d-%H%M%S}"
output = processed.joinpath(f"whatsapp-{now}.csv")
output

Let's save the file both as a csv and as a parquet file.
Parquet has some advantages:
- it's about 100x faster to read and write
- datatypes are preserved (e.g. the timestamp type). You will lose this in a csv file.
- file size is much smaller

The advantage of csv is that you can easily peek at the data in a text editor.

In [None]:
# Both files share the same base name; only the suffix differs (.csv vs .parq).
parquet_output = output.with_suffix(".parq")
df.to_csv(output, index=False)
df.to_parquet(parquet_output, index=False)

Now, go to `config.toml` and change the filename listed under "current" to the parquet file you just created.
This makes it easier to use the same file everywhere, without the need to continuously retype the name if you change it.