In [1]:
import torch
from transformers import AutoTokenizer, FlaubertWithLMHeadModel
import pandas as pd
import os
import re
import time
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import numpy as np
import warnings
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import ast
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer as fll


  torch.utils._pytree._register_pytree_node(


In [2]:
tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = FlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")

In [3]:
cuda = torch.device('cuda')
model.to(cuda)

FlaubertWithLMHeadModel(
  (transformer): FlaubertModel(
    (position_embeddings): Embedding(512, 768)
    (embeddings): Embedding(68729, 768, padding_idx=2)
    (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (attentions): ModuleList(
      (0-11): 12 x MultiHeadAttention(
        (q_lin): Linear(in_features=768, out_features=768, bias=True)
        (k_lin): Linear(in_features=768, out_features=768, bias=True)
        (v_lin): Linear(in_features=768, out_features=768, bias=True)
        (out_lin): Linear(in_features=768, out_features=768, bias=True)
      )
    )
    (layer_norm1): ModuleList(
      (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (ffns): ModuleList(
      (0-11): 12 x TransformerFFN(
        (lin1): Linear(in_features=768, out_features=3072, bias=True)
        (lin2): Linear(in_features=3072, out_features=768, bias=True)
        (act): GELUActivation()
      )
    )
    (layer_norm2): ModuleList(
      (0-11): 

In [4]:
model_name = "Flaubert"

In [5]:
def get_token_predictions(substituted_sentence, keyword, top_k=3):
    blacklist = [keyword, keyword.capitalize(), keyword.lower(), 
                 "</s>", "", "-", '"', "'", ",", "le", "la", "l'", "l", "les", "un", "une", "des",
                "et", "/", ".", "(", ")", "du", "de", "d'", "ce", "cet", "cette", "ces", "qui", "que", 
                 "à", "au", "aux", "en", "dans", "par", "pour", "sur", "avec", "sans", "sous", "entre",
                 "...", ":", ";", "!", "?", "«", "»", "(", ")", "[", "]", "{", "}", "–", "—", "–", "—", "‘", "’", "“", "+"
                 ]
    inputs = tokenizer(substituted_sentence, return_tensors="pt")
    inputs = inputs.to(cuda)
    with torch.no_grad():
        logits = model(**inputs).logits
    mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    init_k = 10
    while True:
        # Get the top k predicted token ids
        predicted_token_ids = logits[0, mask_token_index].topk(init_k).indices
        predicted_token_probs = F.softmax(logits[0, mask_token_index].topk(init_k).values, dim=-1).cpu().numpy()

        # Convert the token ids to tokens
        predicted_token_ids = predicted_token_ids.flatten().tolist()
        predicted_token_probs = predicted_token_probs.flatten().tolist()

        substitutions = tokenizer.convert_ids_to_tokens(predicted_token_ids)

        substitutions = [substitution.replace('</w>', '') for substitution in substitutions]
        substitutions_with_probs = [(substitution, prob) for substitution, prob in
                                    zip(substitutions, predicted_token_probs) if substitution not in blacklist]

        if len(substitutions) >= top_k:
            break

        init_k += 5

    substitutions_with_probs = substitutions_with_probs[:top_k]
    return substitutions_with_probs


def return_masked_predictions_probs(sentence, keyword, top_k=3):
    zinput_forward = sentence.replace(keyword, f"<special1> et {keyword}", 1)
    zinput_backward = sentence.replace(keyword, f"{keyword} et <special1>", 1)
    if zinput_forward == sentence or zinput_backward == sentence:
        print("something went wrong with sentence", sentence)
        return ["Keyword not found"] * 2

    # forward:
    substitutions_forward = get_token_predictions(zinput_forward, keyword, top_k=top_k)
    # backward:
    substitutions_backward = get_token_predictions(zinput_backward, keyword, top_k=top_k)
    return substitutions_forward, substitutions_backward

WSD_PATTERN = r' (\w+)/\w+\.\w\.\d+' # (Woord captured), letterlijke slash, woord, letterlijke punt, letter, cijfer(s)
# Needed for the "Tour" keyword: remnant of older dataset.
def preproc_sentence(sentence):
    sent_preproc = re.sub(WSD_PATTERN, r'\1', sentence) # Alleen woord blijft over
    return sent_preproc

In [10]:
test_1 = return_masked_predictions_probs("Le chat est sur le tapis", "chat")
test_2 = return_masked_predictions_probs("Le chat est sur le tapis, où il ronronne doucement", "chat")

([('chien', 0.2836516499519348),
  ('souris', 0.0222591795027256),
  ('chiot', 0.020429719239473343)],
 [('souris', 0.9390504360198975),
  ('compagnie', 0.0217279102653265),
  ('s', 0.006186521612107754)])

In [12]:
return_masked_predictions_probs("Le chat est sur le tapis.", "chat")

([('noir', 0.14924348890781403),
  ('jeu', 0.10539386421442032),
  ('jour', 0.09504830837249756)],
 [('moi', 0.23445254564285278),
  ('vous', 0.18811574578285217),
  ('Fred', 0.12172508239746094)])

# Getting the substitute vectors

In [6]:
all_predictions = {}
lemmatizer = fll()
for filename in os.listdir(f"{model_name}/Experiment_1/Curated/"):
    if filename.endswith(".csv"):
        df = pd.read_csv(f"{model_name}/Experiment_1/Curated/" + filename, sep=";", encoding="utf-8", header=0)
        # We calculate predictions
        predictions = []
        keyword = df["source"][1].lower()
        sentences = df["match"].values
        if keyword == "tour":
            sentences = [preproc_sentence(sentence) for sentence in sentences]
        for sentence in sentences:
            start_time = time.time()
            forward, backward = return_masked_predictions_probs(sentence, keyword, top_k=10)
            forward = [(lemmatizer.lemmatize(prediction[0]), prediction[1]) for prediction in forward]
            backward = [(lemmatizer.lemmatize(prediction[0]), prediction[1]) for prediction in backward]
            print(f"Forward: {forward}\tBackward: {backward}")
            sentence_tuple = (sentence, forward, backward)
            predictions.append(sentence_tuple)
            elapsed_time = time.time() - start_time
            print(f"Elapsed time: {elapsed_time}")
        all_predictions[keyword] = predictions
        

Forward: [('historien', 0.23809105157852173), ('économiste', 0.16905708611011505), ('enseignant', 0.08178497105836868), ('universitaire', 0.05853410065174103), ('écrivain', 0.04269665852189064), ('ingénieur', 0.04133529216051102), ('étudiant', 0.035823170095682144), ('informaticien', 0.03359093517065048), ('archéologue', 0.026318497955799103)]	Backward: [('directeur', 0.39019495248794556), ('responsable', 0.17413459718227386), ('chercheur', 0.16371509432792664), ('membre', 0.06893948465585709), ('militant', 0.05158180370926857), ('agent', 0.03436049818992615), ('journaliste', 0.0342571996152401), ('conservateur', 0.030371621251106262), ('scientifique', 0.028634455054998398), ('président', 0.023810310289263725)]
Elapsed time: 0.27225518226623535
Forward: [('citron', 0.21453529596328735), ('concombre', 0.13585315644741058), ('saumon', 0.10378960520029068), ('yaourt', 0.03588685765862465), ('pamplemousse', 0.022916654124855995), ('poivron', 0.02218732051551342), ('oignon', 0.0193212553858

In [7]:
# We save this to a dataframe
df = pd.DataFrame(columns=["source", "match", "predictions"])
for match, predictions in all_predictions.items():
    for sentence, forward, backward in predictions:
        new_row = pd.DataFrame({"source": [match], "match": [sentence], "predictions": [(forward, backward)]})
        df = pd.concat([df, new_row], ignore_index=True)

In [8]:
if not os.path.exists(f"{model_name}/Experiment_2/Curated/predictions/together/"):
    os.makedirs(f"{model_name}/Experiment_2/Curated/predictions/together/")
df.to_csv(f"{model_name}/Experiment_2/Curated/predictions/together/predictions.csv", sep=";", encoding="utf-8", index=False)


In [9]:
# We also save these to different files, one for each different "source"
for match, predictions in all_predictions.items():
    df_match = pd.DataFrame(columns=["source", "match", "predictions"])
    for sentence, forward, backward in predictions:
        new_row = pd.DataFrame({"source": [match], "match": [sentence], "predictions": [(forward, backward)]})
        df_match = pd.concat([df, new_row], ignore_index=True)
    df_match.to_csv(f"{model_name}/Experiment_2/Curated/predictions/{match}.csv", sep=";", encoding="utf-8", index=False)

# Creating sparse vectors based on the predictions
For each source word, we create sparse vectors representing each match:
We create a vector of the length of the amount of items for each word,
then we add the prediction probability for the corresponding word to the vector.

In [10]:
# We then create a list of all the different words in "forward" and "backward"
all_words = {}
for match, predictions in all_predictions.items():
    all_words[match] = []
    for sentence, forward, backward in predictions:
        all_words[match].extend([prediction[0] for prediction in forward + backward])
    all_words[match] = list(set(all_words[match]))

In [11]:
# Represent the sentences as sparse vectors
# We do this by making a dataframe for each word: the index is the sentence, the columns are the words, and the values are the probabilities
# We transform the dataframe
vectors = {}
for keyword, predictions in all_predictions.items():
    df_sparse = pd.DataFrame(columns=["source", "match"] + all_words[keyword])
    print("created dataframe for ", keyword)
    rows = []
    for sentence, forward, backward in predictions:
            row = {"source": keyword, "match": sentence}
            for word in all_words[keyword]:
                row[word] = 0
            for prediction in forward + backward:
                word, prob = prediction
                row[word] =+ prob # We want to add to it if it already exists (forward and backward)
            rows.append(row)
    df_sparse = pd.DataFrame(rows)
    vector = df_sparse.drop(columns=["source", "match"]).values
    # We now have a numpy ndarray, which we can transform using the TfidfTransformer
    #transformer = TfidfTransformer()
    #vector_Tfidf = transformer.fit_transform(vector)
    svd = TruncatedSVD(n_components=100)
    vector_SVD = svd.fit_transform(vector)
    vectors[keyword] = vector_SVD
    print(f"converted {keyword} into {type(vector)}")



created dataframe for  avocat
converted avocat into <class 'numpy.ndarray'>
created dataframe for  bien
converted bien into <class 'numpy.ndarray'>
created dataframe for  bureau
converted bureau into <class 'numpy.ndarray'>
created dataframe for  faculté
converted faculté into <class 'numpy.ndarray'>
created dataframe for  filer
converted filer into <class 'numpy.ndarray'>
created dataframe for  glace
converted glace into <class 'numpy.ndarray'>
created dataframe for  souris
converted souris into <class 'numpy.ndarray'>
created dataframe for  supporter
converted supporter into <class 'numpy.ndarray'>
created dataframe for  tirer
converted tirer into <class 'numpy.ndarray'>
created dataframe for  tour
converted tour into <class 'numpy.ndarray'>
created dataframe for  vol
converted vol into <class 'numpy.ndarray'>


Amrami and Goldberg state that using the TF-IDF transformation can help to improve the results of the clustering. We will apply this transformation to the sparse vectors.

In [12]:
# Initialize the clustering algorithm
warnings.filterwarnings("ignore")
# 'n_clusters' is the number of clusters we want to form (and also the number of clusters to be found)
# 'linkage' is the linkage criterion (can be 'ward', 'complete', 'average', 'single')
clustermin = 3

agg_clustering = AgglomerativeClustering(n_clusters=clustermin, metric='cosine', linkage='average')

# Range of potential cluster numbers to test
cluster_range = range(clustermin,11)


for keyword, vector_SVD in vectors.items():
    # As we worked with the same dataset as experiment 1,
    # we keep using the same set here
    df = pd.read_csv(f"{model_name}/Experiment_1/Curated/{keyword}.csv", sep=";", encoding="utf-8", header=0)
    # Apply the clustering algorithm to the SVD-transformed vectors
    df_vector_SVD = pd.DataFrame(vector_SVD)
    
    # We have some rows with 0 values. This is not permitted when working with "cosine" calculations
    # Get the indices of the rows with all zeros
    zero_rows = (df_vector_SVD==0).all(axis=1)
    
    # Remove rows with all zeros from df_vector_SVD
    df_vector_SVD = df_vector_SVD.loc[~zero_rows]
    
    # Remove corresponding rows from df
    df = df.loc[~zero_rows]
    
    # Convert DataFrame back to numpy array
    vector_SVD = df_vector_SVD.to_numpy()
    
    # Now you can apply the clustering algorithm
    agg_clusters = agg_clustering.fit_predict(vector_SVD)
    # 'agg_clusters' is now an array where the i-th element is the cluster label of the i-th instance
    # We add "1" to all clusters to start counting at 1 instead of 0
    agg_clusters += 1
    # We add the cluster label to the original dataframe, corresponding to the correct index
    
    df["agg_cluster_sub"] = agg_clusters
    
    
    # List to hold BIC values
    bic_values = []

    
    # Fit Gaussian Mixture Models for each number of clusters
    for i in cluster_range:
        print(f"Fitting model with {i} clusters")
        gmm = GaussianMixture(n_components=i, random_state=0).fit(vector_SVD)
        bic_values.append(gmm.bic(vector_SVD))
    
    # Find the number of clusters that gives the minimum BIC
    optimal_clusters = cluster_range[np.argmin(bic_values)]
    print(f"Optimal number of clusters: {optimal_clusters}")
    
    # Fit the optimal model
    gmm_optimal = GaussianMixture(n_components=optimal_clusters).fit(vector_SVD)
    
    # Predict the cluster for each data point
    BIC_clusters = gmm_optimal.predict(vector_SVD)
    # We want them to start counting at "1" instead of "0"
    BIC_clusters += 1
    print(f"Number of clusters: {len(np.unique(BIC_clusters))}")
    df["BIC_cluster_sub"] = BIC_clusters
    

    # We calculate a score for the clustering scores (starting before the outliers are removed)
    # Group the dataframe by "sense" and "cluster", and calculate the size of each group
    df_grouped = df.groupby(["sense", "agg_cluster_sub"]).size().reset_index(name="count")

    # Sort these clusters by size in descending order
    df_grouped = df_grouped.sort_values(by="count", ascending=False)

    # Initialize an empty dictionary to store the cluster numbers that have been assigned as default clusters
    # If the cluster number is not taken, we assign it to the corresponding "sense"
    # Else, we try to assign it to the next cluster number
    cluster_dict = {}
    for index, row in df_grouped.iterrows():
        if row["sense"] not in cluster_dict:
            if row["agg_cluster_sub"] not in cluster_dict.values():
                cluster_dict[row["sense"]] = row["agg_cluster_sub"]

        # We add "sense" values that have no entry in cluster_dict and set value to 0 (always seen as wrong)
    for sense in df["sense"].unique():
        if sense not in cluster_dict:
            cluster_dict[sense] = 0

    # Add a new column "default" to the original dataframe
    df["agg_default_sub"] = df.apply(lambda x: x["agg_cluster_sub"] == cluster_dict[x["sense"]], axis=1)
    
    # We calculate the percentage of default clusters
    percentage_default = (df["agg_default_sub"].sum() / len(df)) * 100
    # We also calculate this separately for each "sense"
    percentage_default_mean = df.groupby("sense")["agg_default_sub"].mean() * 100

    # We want the mean score across all senses, as it does not mean a lot if a program can correctly define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted = percentage_default_mean.mean()
    print("agg:")
    print("Score for each", percentage_default_mean)
    print("Overall score", percentage_default)   
    
    #----------------------------------------------------------------------------------------------------
    # We do this again for BIC clusters
    #----------------------------------------------------------------------------------------------------
 
    # We calculate a score for the clustering scores (starting before the outliers are removed)
    # Group the dataframe by "sense" and "cluster", and calculate the size of each group
    df_grouped = df.groupby(["sense", "BIC_cluster_sub"]).size().reset_index(name="count")

    # Sort these clusters by size in descending order
    df_grouped = df_grouped.sort_values(by="count", ascending=False)

    # Initialize an empty dictionary to store the cluster numbers that have been assigned as default clusters
    # If the cluster number is not taken, we assign it to the corresponding "sense"
    # Else, we try to assign it to the next cluster number
    cluster_dict = {}
    for index, row in df_grouped.iterrows():
        if row["sense"] not in cluster_dict:
            if row["BIC_cluster_sub"] not in cluster_dict.values():
                cluster_dict[row["sense"]] = row["BIC_cluster_sub"]

        # We add "sense" values that have no entry in cluster_dict and set value to 0 (always seen as wrong)
    for sense in df["sense"].unique():
        if sense not in cluster_dict:
            cluster_dict[sense] = 0

    # Add a new column "default" to the original dataframe
    df["BIC_default_sub"] = df.apply(lambda x: x["BIC_cluster_sub"] == cluster_dict[x["sense"]], axis=1)
    
    # We calculate the percentage of default clusters
    percentage_default = (df["BIC_default_sub"].sum() / len(df)) * 100
    # We also calculate this separately for each "sense"
    percentage_default_mean = df.groupby("sense")["BIC_default_sub"].mean() * 100

    # We want the mean score across all senses, as it does not mean a lot if a program can correctly define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted = percentage_default_mean.mean()
    print("BIC:")
    print("Score for each", percentage_default_mean)
    print("Overall score", percentage_default)    
    
    
    
    df.to_csv(f"{model_name}/Experiment_2/Curated/{keyword}.csv", sep=";", encoding="utf-8", index=False)

Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 3
Number of clusters: 3
agg:
Score for each sense
avocado     3.703704
lawyer     96.428571
Name: agg_default_sub, dtype: float64
Overall score 66.26506024096386
BIC:
Score for each sense
avocado    25.925926
lawyer     57.142857
Name: BIC_default_sub, dtype: float64
Overall score 46.98795180722892
Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 3
Number of clusters: 3
agg:
Score for each sense
good          6.896552
property     97.959184
wellbeing     5.000000
Name: agg_default_sub, dtype: float64
Overa