In [25]:
import torch
from transformers import AutoTokenizer, AutoModel, FlaubertWithLMHeadModel
import pandas as pd
import os
import re
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import numpy as np
import warnings

In [26]:
!pip install sacremoses



In [27]:
tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = AutoModel.from_pretrained("flaubert/flaubert_base_cased")
BERT_model= "Flaubert"
vocab = tokenizer.get_vocab()

In [28]:
cuda = torch.device('cuda')
model.cuda()

FlaubertModel(
  (position_embeddings): Embedding(512, 768)
  (embeddings): Embedding(68729, 768, padding_idx=2)
  (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0-11): 12 x MultiHeadAttention(
      (q_lin): Linear(in_features=768, out_features=768, bias=True)
      (k_lin): Linear(in_features=768, out_features=768, bias=True)
      (v_lin): Linear(in_features=768, out_features=768, bias=True)
      (out_lin): Linear(in_features=768, out_features=768, bias=True)
    )
  )
  (layer_norm1): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
  (ffns): ModuleList(
    (0-11): 12 x TransformerFFN(
      (lin1): Linear(in_features=768, out_features=3072, bias=True)
      (lin2): Linear(in_features=3072, out_features=768, bias=True)
      (act): GELUActivation()
    )
  )
  (layer_norm2): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)

In [69]:
WSD_PATTERN = r' (\w+)/\w+\.\w\.\d+' # (Woord), letterlijke slash, woord, letterlijke punt, letter, cijfer(s)

def get_target_word(sentence, target_word):
    word_match = re.search(r'\b{}\w*\b'.format(target_word), sentence, re.IGNORECASE)
    if word_match is None:
        return None
    return word_match.group(0)

def preproc_sentence(sentence):
    sent_preproc = re.sub(WSD_PATTERN, r'\1', sentence) # Alleen woord blijft over
    return sent_preproc

def find_position_word(sentence, word):
    ids_word = tokenizer.encode(word)
    tokens_word = tokenizer.convert_ids_to_tokens(ids_word)[1:-1]
    ids_sentence = tokenizer.encode(sentence)
    tokens_sentence = tokenizer.convert_ids_to_tokens(ids_sentence)
    #print(tokens_sentence) # if the code breaks, enabling this helps to see if tokenization of the sentence is what breaks it
    if len(tokens_word) == 1:
        position_word_in_sentence = tokens_sentence.index(tokens_word[0])
    else:
        position_word_in_sentence = [tokens_sentence.index(tokens_word[0]),tokens_sentence.index(tokens_word[-1])+1]
    return position_word_in_sentence

@torch.no_grad()
def encode_sentence_and_extract_position(sentence, position):
    ids = tokenizer.encode(sentence)
    bert_output = model.forward(torch.tensor(ids, device=cuda).unsqueeze(0))
    final_layer_embeddings = bert_output['last_hidden_state'].squeeze()
    if type(position) == int:
        return final_layer_embeddings[position].unsqueeze(0)
    elif type(position) == list:
        return torch.mean(
            final_layer_embeddings[position[0]:position[1]], 0
            ).unsqueeze(0)

def get_embeddings_from_dataframe(dataframe):
    word = dataframe['source'].iloc[0].lower()
    embeddings = []
    rows_to_keep = []
    for index, sentence in dataframe.iterrows():
        matched_word_form = get_target_word(sentence['match'], sentence['source'])
            # print(sentence["match"], matched_word_form, "\n")
        if matched_word_form == word:

            prep_sent = preproc_sentence(sentence['match'])
            try:
                position = find_position_word(prep_sent, matched_word_form)
                embeddings.append(encode_sentence_and_extract_position(prep_sent, position))
                rows_to_keep.append(sentence)
            except:
                print(f"Error with sentence: {sentence['match']}")
    new_df = pd.DataFrame(rows_to_keep)
    return new_df, embeddings

# functie om dataframe uit te breiden met drie PCA-waarden
def extend_df_with_pca(df, m_np):
    df_new = df.copy()

    pca = PCA(n_components=3) # We tried with 3, 4, 5 dimensions. 3 gave best results
    components = pca.fit_transform(m_np)

    df_new.insert(1, 'x', components[:,0])
    df_new.insert(2, 'y', components[:,1])
    df_new.insert(3, 'z', components[:,2])
    #df_new.insert(4, 'dim_4', components[:,3])
    #df_new.insert(5, 'dim_5', components[:,4])


    return df_new


In [70]:
Word = "Tirer"
df = pd.read_csv(f'Corpus/Final/Manual/{Word}.csv', sep=";", encoding="utf-8", header=0)

new_df, embeddings = get_embeddings_from_dataframe(df)
emb_matrix = torch.cat(embeddings, dim=0)
print(f"{len(embeddings)} left of {len(df)}")
# constructie van numpy-matrix; die matrix kunnen we gebruiken voor PCA
matrix_np = emb_matrix.cpu().detach().numpy()

df_pca_og = extend_df_with_pca(new_df, matrix_np)



104 left of 106


In [71]:
# We add the embeddings to the original dataframe, to the relevant row
df_with_embeddings = df_pca_og.copy()
df_with_embeddings["embeddings"] = matrix_np.tolist()
# We drop the "x", "y" and "z" columns
df_with_embeddings = df_with_embeddings.drop(columns=["x", "y", "z"])

In [72]:
# visualisatie in 2D; kleuren voor verschillende klassen;
# hoveren over data toont zinnen
fig_2d = px.scatter(df_pca_og, x='x', y='y', color='sense',
                 hover_data='match',
                 template="plotly_white",
                 title = f"Manual annotation: {Word}")
#fig_2d.show()
# visualisatie in 3D
fig_3d = px.scatter_3d(
    df_pca_og, x='x', y='y', z='z', color='sense',
    hover_data='match',
    template="plotly_white",
     title = f"Manual annotation: {Word}"
)
fig_3d.show()

# Clustering on embeddings
We will calculate the optimal number of clusters using a Bayesian Information Criterion


In [73]:

warnings.filterwarnings("ignore")

data_og = np.stack(df_with_embeddings["embeddings"].values)
# Assuming data_og is your data

# List to hold BIC values
bic_values = []

# Range of potential cluster numbers to test
cluster_range = range(1,11)

# Fit Gaussian Mixture Models for each number of clusters
for i in cluster_range:
    print(f"Fitting model with {i} clusters")
    gmm = GaussianMixture(n_components=i, random_state=0).fit(data_og)
    bic_values.append(gmm.bic(data_og))

# Find the number of clusters that gives the minimum BIC
optimal_clusters = cluster_range[np.argmin(bic_values)]
print(f"Optimal number of clusters: {optimal_clusters}")

# Fit the optimal model
gmm_optimal = GaussianMixture(n_components=optimal_clusters).fit(data_og)

# Predict the cluster for each data point
clusters_og = gmm_optimal.predict(data_og)
# We want them to start counting at "1" instead of "0" (to assign cluster number "0" to wrong clustering later)
clusters_og += 1
print(f"Number of clusters: {len(np.unique(clusters_og))}")

# Add the clusters to the dataframe
df_pca_og["cluster"] = clusters_og

Fitting model with 1 clusters
Fitting model with 2 clusters
Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 1
Number of clusters: 1


# Clustering on more dimensions

using more dimensions in the PCA analysis did not yield better results.

In [74]:

warnings.filterwarnings("ignore")

df_pca_numerical_og = df_pca_og.select_dtypes(include=[np.number])
# Assuming data_og is your data
data_og = df_pca_numerical_og.to_numpy()

# List to hold BIC values
bic_values = []

# Range of potential cluster numbers to test
cluster_range = range(1,11)

# Fit Gaussian Mixture Models for each number of clusters
for i in cluster_range:
    print(f"Fitting model with {i} clusters")
    gmm = GaussianMixture(n_components=i, random_state=0).fit(data_og)
    bic_values.append(gmm.bic(data_og))

# Find the number of clusters that gives the minimum BIC
optimal_clusters = cluster_range[np.argmin(bic_values)]
print(f"Optimal number of clusters: {optimal_clusters}")

# Fit the optimal model
gmm_optimal = GaussianMixture(n_components=optimal_clusters).fit(data_og)

# Predict the cluster for each data point
clusters_og = gmm_optimal.predict(data_og)
# We want them to start counting at "1" instead of "0" (to assign cluster number "0" to wrong clustering later)
clusters_og += 1
print(f"Number of clusters: {len(np.unique(clusters_og))}")

# Add the clusters to the dataframe
df_pca_og["cluster"] = clusters_og

Fitting model with 1 clusters
Fitting model with 2 clusters
Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 3
Number of clusters: 3


# Calculating the clustering score
As explained in the *methode* section of the report, we calculate a score for the clustering
 based on the ratio of datapoints falling into a "default" cluster for each manually annotated sense.



In [75]:
# We calculate a score for the clustering (starting before the outliers are removed)
# Group the dataframe by "sense" and "cluster", and calculate the size of each group
df_grouped = df_pca_og.groupby(["sense", "cluster"]).size().reset_index(name="count")

# Sort these clusters by size in descending order
# This way, the biggest clusters get "priority" when claiming clusters
df_grouped = df_grouped.sort_values(by="count", ascending=False)

# Initialize an empty dictionary to store the cluster numbers that have been assigned as default clusters
# If the cluster number is not taken, we assign it to the corresponding "sense"
# Else, we try to assign it to the next biggest cluster for that sense, if it is not already taken by a bigger group, etc.
cluster_dict = {}
for index, row in df_grouped.iterrows():
    if row["sense"] not in cluster_dict:
        if row["cluster"] not in cluster_dict.values():
            cluster_dict[row["sense"]] = row["cluster"]

# We add "sense" values that have no entry in cluster_dict and set value to 0 (always seen as wrong)
for sense in df_pca_og["sense"].unique():
    if sense not in cluster_dict:
        cluster_dict[sense] = 0

# Add a new column "default" to the original dataframe
df_pca_og["default"] = df_pca_og.apply(lambda x: x["cluster"] == cluster_dict[x["sense"]], axis=1)
# We calculate the percentage of default clusters
percentage_default = (df_pca_og["default"].sum() / len(df_pca_og)) * 100
# We also calculate this separately for each "sense"
percentage_default_mean = df_pca_og.groupby("sense")["default"].mean() * 100

# We want the mean score across all senses, as it does not mean a lot if a program can correctly define one big cluster containing most of the data and fail at all other senses. (see description clustering score in report for examples)
percentage_weighted = percentage_default_mean.mean()
print("Score for each", percentage_default_mean)
print("Overall score", percentage_default)
print("Weighted score", percentage_weighted)

Score for each sense
pull     86.666667
shoot    43.243243
Name: default, dtype: float64
Overall score 55.769230769230774
Weighted score 64.95495495495496


In [12]:
# we will also try using v measure score to calculate cluster performance
from sklearn.metrics.cluster import v_measure_score

v_measure = v_measure_score(df_pca_og["sense"], df_pca_og["cluster"])
print(v_measure)

0.7988668250065365


# Removing outliers
We will remove clusters that consist of one or two points from the visualisation.
They are still taken into account when calculating the clustering score, but they are not shown in the visualisation.
(This was more useful with *hierarchical* clustering methods that created many small clusters)

In [13]:
# We delete all clusters that consist of one or two points from the visualisation.
# We do this by checking if the cluster is in the list of unique clusters that only occur once or twice
clusters_to_delete = [cluster for cluster in np.unique(clusters_og) if len(clusters_og[clusters_og == cluster]) <= 2]
# We delete the clusters from the original dataframe
df_pca = df_pca_og[~df_pca_og["cluster"].isin(clusters_to_delete)]
# We delete the clusters from the clusters
before_removing = len(np.unique(clusters_og))
clusters = clusters_og[~np.isin(clusters_og, clusters_to_delete)]
print(f"Removed {len(np.unique(clusters_og)) - len(np.unique(clusters))} clusters")
points_removed = len(df_pca_og) - len(df_pca)
print(f"Removed {points_removed} points")


Removed 0 clusters
Removed 0 points


In [14]:
fig_2d_cluster = px.scatter(df_pca, x="x", y="y", color="sense", symbol=clusters,
                 hover_data='match',
                 template="plotly_white")
fig_2d_cluster.update_layout(
    title={
        'text': f"Clusters {Word}<br><sub>Clusters: {len(np.unique(clusters))}{(', Outliers removed' if points_removed else '')}</sub>"
        f"<sub><br>Clustering score: {percentage_weighted:.2f}%</sub>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig_2d_cluster.show()


In [15]:
fig_3d_cluster = px.scatter_3d(df_pca, x="x", y="y", z="z", color="sense", symbol=clusters,
                 hover_data='match',
                 template="plotly_white")
fig_3d_cluster.update_layout(
    title={
        'text': f"Clusters {Word}<br><sub>Clusters: {len(np.unique(clusters))}{(', Outliers removed' if points_removed else '')}</sub>"
        f"<sub><br>Clustering score: {percentage_weighted:.2f}%</sub>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig_3d_cluster.show()

In [16]:
# We calculate the center of each cluster
centers = {}
radii = []
sentences = []
for cluster_id in np.unique(clusters):
    points = df_pca[df_pca["cluster"] == cluster_id][['x', 'y', 'z']].values

    centroid = points.mean(axis=0)
    centers[cluster_id] = centroid
    
    # Calculate the distances of all points to the centroid
    distances = np.linalg.norm(points - centroid, axis=0)
    # Find the minimum distance and the corresponding point
    min_distance = np.min(distances)
    closest_point_index = np.argmin(distances)
    closest_point_match = df_pca.loc[clusters == cluster_id, 'match'].values[closest_point_index]
    sentences.append(f"{cluster_id}: {closest_point_match}")
    print(f"Match associated with the point closest to the centroid of cluster {cluster_id}: {closest_point_match}")

predictions = {}

df_pca["distance"] = 0
# Calculating distance to center for each point in cluster
for cluster_id in np.unique(clusters):
    points = df_pca[df_pca["cluster"] == cluster_id][['x', 'y', 'z']].values
    # we calculate the distance between each point and the center of the cluster
    distances = np.linalg.norm(points - centers[cluster_id], axis=1)
    # We add these distances to the relevant points
    df_pca.loc[df_pca["cluster"] == cluster_id, "distance"] = distances
    

Match associated with the point closest to the centroid of cluster 1:  Malgré nombre d'adeptes célèbres (Robert Monroe, Raymond Réant, Mircea Eliade... et même Hemingway !) soutenant l'authenticité de cette incroyable faculté apparemment universelle, la science n'en est pour l'instant qu'à vérifier un état modifié du rythme cérébral en étudiant l'influence de sons binauraux du type hemi-sync . 
Match associated with the point closest to the centroid of cluster 2:  Le développeur d'Oracle Database et d'Application Express (APEX) à l'université avait déjà créé quelques applications spéciales à petite échelle pour la faculté qui avaient plu aux utilisateurs. 



## Testing with truly automatic download 
Our previously used dataset consists of manually curated sentences.
We removed sentences that we deemed too ambiguous or sentences that were nonsensical. (As described in the report)
As our corpus is the result of a webcrawl, this is an inherent characteristic of the data.
We will now test our clustering algorithm on a truly automatic download.
We also add our "manual" dataset to this bigger automatic download to have a way of interpreting the cluster formation.

In [17]:
try:
    df_automatic = pd.read_csv(f'Corpus/Final/Automatic/{Word}.csv', sep=";", encoding="utf-8", header=0)
    # We add the original df to df_automatic
    df_automatic = pd.concat([df, df_automatic])
    new_df_automatic, embeddings_automatic = get_embeddings_from_dataframe(df_automatic)
    emb_matrix_automatic = torch.cat(embeddings_automatic, dim=0)
    print(len(embeddings_automatic))
    df_pca_automatic = extend_df_with_pca(new_df_automatic, emb_matrix_automatic.cpu().detach().numpy())
    fig_automatic = px.scatter(df_pca_automatic, x='x', y='y', color="sense",
                    #color_discrete_sequence = ["lightgrey"],
                    hover_data='match',
                     template="plotly_white",
                     title = f"Random download: {Word}")
    fig_automatic.show()
except:
    print("No automatic download available")
    fig_automatic = None
    fig_circles = None
    fig_2d_cluster_automatic = None
    fig_3d_cluster_automatic = None
    df_pca_automatic = None
    df_pca_semi_automatic = None

560


In [18]:
warnings.filterwarnings("ignore")

df_pca_numerical_automatic = df_pca_automatic.select_dtypes(include=[np.number])
data_automatic = df_pca_numerical_automatic.to_numpy()
# List to hold BIC values
bic_values_automatic = []

# Range of potential cluster numbers to test
cluster_range = range(1,11)

# Fit Gaussian Mixture Models for each number of clusters
for i in cluster_range:
    print(f"Fitting model with {i} clusters")
    gmm = GaussianMixture(n_components=i, random_state=0).fit(data_automatic)
    bic_values_automatic.append(gmm.bic(data_automatic))

# Find the number of clusters that gives the minimum BIC
optimal_clusters_automatic = cluster_range[np.argmin(bic_values_automatic)]
print(f"Optimal number of clusters: {optimal_clusters_automatic}")

# Fit the optimal model
gmm_optimal_automatic = GaussianMixture(n_components=optimal_clusters_automatic).fit(data_automatic)

# Predict the cluster for each data point
clusters_automatic = gmm_optimal_automatic.predict(data_automatic)
# We want them to start counting at "1" instead of "0"
clusters_automatic += 1
print(f"Number of clusters: {len(np.unique(clusters_automatic))}")

# Add the clusters to the dataframe
df_pca_automatic["cluster"] = clusters_automatic

Fitting model with 1 clusters
Fitting model with 2 clusters
Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 5
Number of clusters: 5


In [19]:
# We delete occasional clusters that consist of one or two points to visualise data more clearly.
# We do this by checking if the cluster is in the list of unique clusters that only occur once or twice
clusters_to_delete = [cluster for cluster in np.unique(clusters_automatic) if len(clusters_automatic[clusters_automatic == cluster]) <= 2]
# We delete the clusters from the original dataframe
df_pca_automatic = df_pca_automatic[~df_pca_automatic["cluster"].isin(clusters_to_delete)]
# We delete these clusters from the clusters
before_removing = len(np.unique(clusters_automatic))
clusters_automatic = clusters_automatic[~np.isin(clusters_automatic, clusters_to_delete)]
print(f"Removed {before_removing - len(np.unique(clusters_automatic))} clusters")

Removed 0 clusters


In [20]:
fig_2d_cluster_automatic = px.scatter(df_pca_automatic, x="x", y="y",
                                        color=clusters_automatic,
                                        symbol="sense",
                                        hover_data='match',
                                        template="plotly_white",
                                        title = f"Clusters {Word}, random webcrawl sample")
fig_2d_cluster_automatic.update(layout_coloraxis_showscale=False)
fig_2d_cluster_automatic.show()
fig_3d_cluster_automatic = px.scatter_3d(df_pca_automatic, x="x", y="y", z="z",
                                        color=clusters_automatic,
                                        symbol="sense",
                                        hover_data='match',
                                        template="plotly_white",
                                        title = f"Clusters {Word}, random webcrawl sample")
fig_3d_cluster_automatic.update(layout_coloraxis_showscale=False)

fig_3d_cluster_automatic.show()

# Calculating the clustering score
We calculate a clustering score of the annotated datapoints the same way as we did before
Does this make sense? Or are there better methods we should use?

In [21]:
# We calculate a score for the clustering (starting before the outliers are removed)
# Group the dataframe by "sense" and "cluster", and calculate the size of each group
df_grouped_automatic = df_pca_automatic.groupby(["sense", "cluster"]).size().reset_index(name="count")
# We remove all instances of "sense" == ???
# Sort these clusters by size in descending order
df_grouped_automatic = df_grouped_automatic.sort_values(by="count", ascending=False)

df_grouped_automatic = df_grouped_automatic[df_grouped_automatic["sense"] != "???"]

In [22]:
cluster_dict = {}
for index, row in df_grouped_automatic.iterrows():
    if row["sense"] not in cluster_dict:
        if row["cluster"] not in cluster_dict.values():
            cluster_dict[row["sense"]] = row["cluster"]

# We add "sense" values that have no entry in cluster_dict and set value to 0 (always seen as wrong)
for sense in df_grouped_automatic["sense"].unique():
    if sense not in cluster_dict:
        cluster_dict[sense] = 0

In [23]:

# Initialize an empty dictionary to store the cluster numbers that have been assigned as default clusters
# If the cluster number is not taken, we assign it to the corresponding "sense"
# Else, we try to assign it to the next cluster number
df_pca_semi_automatic = df_pca_automatic[df_pca_automatic["sense"] != "???"]
# This is effectively the same as df_pca_og

# Add a new column "default" to the original dataframe
df_pca_semi_automatic["default"] = df_pca_semi_automatic.apply(lambda x: x["cluster"] == cluster_dict[x["sense"]], axis=1)
# We calculate the percentage of default clusters
percentage_default = (df_pca_semi_automatic["default"].sum() / len(df_pca_semi_automatic)) * 100
# We also calculate this separately for each "sense"
percentage_default_mean = df_pca_semi_automatic.groupby("sense")["default"].mean() * 100

# We want the mean score across all senses, as it does not mean a lot if a program can correctly define one big cluster containing most of the data and fail at all other senses.
percentage_weighted = percentage_default_mean.mean()

# We calculate a v-measure score
v_measure_automatic = v_measure_score(df_pca_semi_automatic["sense"], df_pca_semi_automatic["cluster"])

print("Score for each", percentage_default_mean)
print("Overall score", percentage_default)
print("Weighted score", percentage_weighted)
print("v-measure cluster score", v_measure_automatic)

Score for each sense
ability       93.333333
competence    71.428571
university    56.756757
Name: default, dtype: float64
Overall score 68.75
Weighted score 73.83955383955384
v-measure cluster score 0.6681173653330227


In [24]:
# We make a directory named f"{Word}" in the folder "Beelden" if it doesnt exist
if not os.path.exists(f"{BERT_model}/{Word}"):
    os.makedirs(f"{BERT_model}/{Word}")
# We save the plots in the directory

# Original plots with all embeddings
fig_2d.write_html(f"{BERT_model}/{Word}/{Word}_2d.html")
fig_3d.write_html(f"{BERT_model}/{Word}/{Word}_3d.html")

# Clusters after removing outliers
fig_2d_cluster.write_html(f"{BERT_model}/{Word}/{Word}_Clusters_2d.html")
fig_3d_cluster.write_html(f"{BERT_model}/{Word}/{Word}_Clusters_3d.html")

# Clusters of automatical downloads
fig_2d_cluster_automatic.write_html(f"{BERT_model}/{Word}/{Word}_Experiment_2d.html")
fig_3d_cluster_automatic.write_html(f"{BERT_model}/{Word}/{Word}_Experiment_3d.html")

# We also save df_pca_og with added clusters to a csv file
df_pca_og.to_csv(f'{BERT_model}/{Word}.csv', sep=";", encoding="utf-8", index=False)
df_pca_semi_automatic.to_csv(f'{BERT_model}/{Word}_automatic.csv', sep=";", encoding="utf-8", index=False)