In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from loguru import logger

We can use transfer learning to extract more types of features from our texts. There are a few things to consider when taking this approach:

- The models are pretrained on a corpus of texts. This can have a lot of impact; consider for example the difference between a "sentiment" model trained on movie reviews and one trained on tweets. They might grasp roughly the same concept, but the words used and the way they are used are different. If you are going to use this model to estimate the sentiment of, let's say, emails sent in a business context, you might get unexpected results.
- There are a lot of different models on Hugging Face. It can be useful to browse around in the model hub to see what is available, and try to find something that is close to your use case.

# Sentiment models
Sentiment models follow the following recipe:

1. map strings into tokens (arbitrary integers)
2. map tokens into embeddings; these are high dimensional (e.g. 768 dimensions) vectors that represent the meaning of the words
3. do a lot of non-linear transformations on the embeddings
4. the final embedding is reduced from 768 dimensions to either a single value or, sometimes, three values (positive, neutral, negative).

In [None]:
from transformers import pipeline

# Downloaded weights are cached in ~/.cache/huggingface/hub.
# Consider cleaning up that location once you are done with the models.
sentiment_checkpoint = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

# top_k=None asks the pipeline for a score for every label, not only the best one.
model = pipeline(model=sentiment_checkpoint, top_k=None)

# english
example_text = "I love this movie and i would watch it again and again!"
model(example_text)

The first time you run this, it will download the model from the Hugging Face Hub.
The second run will be much faster. You will get three outputs: positive, neutral and negative. The three scores sum to 1, because together they form a probability distribution.

It would be straightforward to use this on your own dataset. For example:

In [None]:
from pathlib import Path
import pandas as pd

# Preprocessed chat export written by src/preprocess.py. The timestamp in the
# filename differs per preprocessing run, so adjust it to match your own file.
processed = Path("../data/processed")
datafile = processed / "whatsapp-20240122-182706.parq"

if not datafile.exists():
    # Fail right here with an actionable hint instead of letting read_parquet
    # crash one line later with a less helpful error.
    hint = "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    logger.warning(hint)
    raise FileNotFoundError(f"{hint} (looked for {datafile})")

df = pd.read_parquet(datafile)
df.head()

In [None]:
from tqdm import tqdm
from loguru import logger

# Score every message with the sentiment pipeline and collect
# (author, message, sentiment) for each message that could be scored.
data = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    author = row["author"]
    message = row["message"]
    try:
        # the pipeline returns one result per input; [0] is the result for this message
        sentiment = model(message)[0]
    except Exception as e:
        logger.warning(f"Failed to process message {i}: {message}")
        logger.warning(f"Error: {e}")
        # Skip this message. Appending here would either raise a NameError
        # (first message) or silently reuse the previous message's sentiment.
        continue
    data.append((author, message, sentiment))


After this, you should be able to use the output as a simple new feature. You could, for example, extract the "positive" value and plot it. Or you could aggregate the sentiment over a certain timeframe. Or plot distributions of sentiment over different authors, etc.

For example:

In [None]:
# Turn the raw pipeline output into one row per message.
# The scores are looked up by label rather than by position: with top_k=None the
# pipeline orders its results by score, so the position of "positive",
# "negative" and "neutral" changes from message to message.
# A new name is used (instead of overwriting `data`) so this cell can be re-run.
sentiment_rows = []
for author, msg, scores in data:
    score_by_label = {item["label"].lower(): item["score"] for item in scores}
    sentiment_rows.append(
        (
            author,
            msg,
            score_by_label["positive"],
            score_by_label["negative"],
            score_by_label["neutral"],
        )
    )

df_sentiment = pd.DataFrame(
    sentiment_rows, columns=["author", "message", "positive", "negative", "neutral"]
)
df_sentiment.head()

In [None]:
# Authors whose median "positive" score is more than 10% above the overall
# median get flagged (color == True); everybody else is False.
threshold = df_sentiment.positive.median() * 1.1
colors = (
    df_sentiment.groupby("author")
    .positive.median()
    .sort_values()
    .reset_index()
    .assign(color=lambda per_author: per_author.positive > threshold)
    .drop(columns="positive")
)
colors.head()

In [None]:
# One density curve of the "positive" score per author; flagged authors are
# drawn in red, the others in grey.
color_map = {True: "red", False: "grey"}
for author, colorgroup in zip(colors.author, colors.color):
    author_rows = df_sentiment[df_sentiment.author == author]
    sns.kdeplot(
        author_rows.positive,
        color=color_map[colorgroup],
        label=author,
        legend=True,
    )
plt.legend(title='Author', bbox_to_anchor=(1.05, 1), loc='upper left')


# Mapping text to a semantic vectorspace

Instead of mapping the result of the non-linear transformations to just one dimension of sentiment, we can pick a more general model that doesn't do this. This model is "just" trained on a lot of textual data, and the output vectors will represent the meaning of the text in a high dimensional space. This can be used to compare the meaning of different texts, or as input for a classifier.

In [None]:
from sentence_transformers import SentenceTransformer

# Two toy sentences to show what the encoder returns.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]

# NOTE: this rebinds `model`; from here on it is the sentence encoder,
# no longer the sentiment pipeline.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# encode() returns one embedding vector per input sentence
embeddings = model.encode(sentences)
print(f"Embedding shape: {embeddings.shape}")

As you can see, we input two sentences, and the output in this case are two vectors, 384 dimensions each. 

I will filter out the texts that are too short (e.g. messages that are just "hi" or "hello") to see if we can get more interesting results.

In [None]:
# Histogram of the log of the message length; the log makes the long right
# tail readable. Zero-length messages are left out because log(0) = -inf
# cannot be placed in a histogram bin.
message_length = df["message_length"]
log_length = np.log(message_length[message_length > 0])
ax = sns.histplot(x=log_length)
ax.set(xlabel="log(message length)", ylabel="number of messages");

<img src="../img/message_length.png" width=450 height=400 />

My dataset seems to have a median log message length of about 4, so let's take log(length) > 5 as the cutoff.

In [None]:
# Keep only messages whose log-length is above the cutoff of 5 and renumber
# the rows 0..n-1 so that row labels equal row positions.
is_long_enough = np.log(df["message_length"]) > 5
subset = df[is_long_enough].reset_index(drop=True)
subset

Let's make a new class to keep metadata and the output neatly together.

In [None]:
from dataclasses import dataclass
from typing import Any, Mapping, Sequence, Tuple, Union

import numpy as np


@dataclass
class Embedding:
    """Keeps embedding vectors together with the metadata of each embedded item.

    ``metadata`` may be a list or a mapping keyed by integer position (this
    notebook passes a dict keyed by row position); it is only ever accessed
    as ``metadata[idx]`` and ``len(metadata)``.
    """

    # per-item metadata, looked up with the same index as `vectors`
    metadata: Union[Sequence[Any], Mapping[int, Any]]
    # embedding matrix; indexing along the first axis yields one item's vector
    vectors: np.ndarray

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, Any]:
        """Return ``(vector, metadata)`` for the item at ``idx``."""
        return (self.vectors[idx], self.metadata[idx])

    def __len__(self) -> int:
        """Return the number of metadata entries."""
        return len(self.metadata)

    def __repr__(self) -> str:
        """Short summary that shows only the shape of the vectors."""
        return f"Embedding, dims={self.vectors.shape}"

We can process all data

In [None]:
from tqdm import tqdm

# For every row of `subset`: remember author/message/timestamp under the row's
# index, and collect the message text (in the same order) for the encoder.
metadata_fields = ("author", "message", "timestamp")
metadata = {}
text = []
for idx, row in tqdm(subset.iterrows(), total=len(subset)):
    record = {field: row[field] for field in metadata_fields}
    metadata[idx] = record
    text.append(record["message"])

Now, we use the model to encode every message.
If the amount of text is not too big, this will work in one go.
Otherwise, you might want to split the text into smaller chunks, encode the chunks,
and then concatenate the results.

In [None]:
# Encode all collected messages in one call; `model` is expected to be the
# sentence encoder here (it was rebound after the sentiment section).
vectors = model.encode(text)
# show the shape of the result as the cell output
vectors.shape

And store it in our dataclass

In [None]:
# Bundle the metadata dict and the encoded vectors into one object.
emb = Embedding(metadata, vectors)
# the cell output uses Embedding.__repr__, which shows the shape of the vectors
emb

Let's check our `__getitem__` method

In [None]:
# Indexing returns (vector, metadata) for one item; here the item at index 1.
X, y = emb[1]
# show the vector's shape next to its metadata
X.shape, y

One way to visualise this would be with PCA

In [None]:
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
X = pca.fit_transform(emb.vectors)

# One author label per embedded message, used to colour the points.
labels = [emb.metadata[position]["author"] for position in range(len(emb.metadata))]

plt.figure(figsize=(10, 10))
first_component, second_component = X[:, 0], X[:, 1]
sns.scatterplot(x=first_component, y=second_component, hue=labels)
plt.legend(title='Author', bbox_to_anchor=(1.05, 1), loc='upper left')


t-SNE is often better for visualising high dimensional data.

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: without a fixed seed every run produces a different
# layout, so the figure could not be reproduced with Restart & Run All.
tsne = TSNE(n_components=2, random_state=42)
X = tsne.fit_transform(emb.vectors)

plt.figure(figsize=(10, 10))
# one author label per embedded message, used to colour the points
labels = [emb.metadata[i]["author"] for i in range(len(emb.metadata))]
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=labels)
plt.legend(title='Author', bbox_to_anchor=(1.05, 1), loc='upper left')

It might be the case that you just get a blob of points with no clear clustering. Some things to consider:

- you might simply have too many authors, and if you look more closely you might still find that some authors are more similar than others
- your text messages might overall be too short and too similar. You might need to filter out more messages, or group messages of the same author together and then encode them.
- you might want to add more structure. E.g., label some messages by hand (or with a regex) and use that as the coloring. You might find that some types of messages actually do cluster together in a relevant way, just not by author but by subject.

Please keep in mind that normally, when doing unsupervised clustering, you will have some idea of what you are looking for. For example, you might be looking for fraud, or you are looking for a certain sentiment, or for a specific topic. A typical strategy would be to hand-label a few items and then calculate the distance to find "close" items you didn't label yet. 

In [None]:
# Group the embedding vectors per author.
# Local names are chosen so that the notebook-level `vectors`, `X` and `y`
# from earlier cells are not overwritten by this cell.
vectors_by_author = {}
for i in range(len(emb)):
    vec, meta = emb[i]
    # append in place; rebuilding the list on every step would be quadratic
    vectors_by_author.setdefault(meta["author"], []).append(vec)

# Average all embeddings of an author into a single vector per author.
avg_author = {
    author: np.mean(author_vectors, axis=0)
    for author, author_vectors in vectors_by_author.items()
}

# We extract all values as a single matrix: one row per author, in the same
# order as `labels`.
A = np.array(list(avg_author.values()))
labels = list(avg_author.keys())
A.shape, len(labels)

In [None]:
# Clustered heatmap of the per-author average embeddings (one row per author,
# labelled with the author name); the trailing ';' hides the returned object's repr.
sns.clustermap(A, yticklabels=labels);

We calculate the distance between the average vectors of every pair of authors

In [None]:
from scipy.spatial import distance_matrix
from scipy.spatial.distance import squareform

# Pairwise distances between the per-author average embeddings:
# D[i, j] is the distance between author i and author j.
D = distance_matrix(A, A)

# Label both axes with the author names.
author_ticks = {"xticklabels": labels, "yticklabels": labels}
sns.heatmap(D, **author_ticks)

We can also reduce the "distance" fingerprint to two dimensions and plot it.
This shows which authors have a similar pattern of distances to the other authors.

In [None]:
# Treat every row of the distance matrix as an author's "fingerprint" and
# project those fingerprints onto two principal components.
pca = PCA(n_components=2)
X = pca.fit_transform(D)

plt.figure(figsize=(10, 10))
first_component, second_component = X[:, 0], X[:, 1]
sns.scatterplot(x=first_component, y=second_component, hue=labels)
plt.legend(title='Author', bbox_to_anchor=(1.05, 1), loc='upper left')