In [1]:
!git clone https://github.com/jvladika/Lexical-Substitution.git

Cloning into 'Lexical-Substitution'...


In [1]:
from transformers import AutoTokenizer, CamembertModel, CamembertForMaskedLM
import pandas as pd
import os
import re
from sklearn.mixture import GaussianMixture
import warnings
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer as fll
import torch
import string
import nltk
import time
import numpy as np
# Cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('camembert-base')
lm_model = CamembertForMaskedLM.from_pretrained('camembert-base').to(device)
raw_model = CamembertModel.from_pretrained('camembert-base', output_hidden_states=True, output_attentions=True).to(device)
def load_transformers():
    return tokenizer, lm_model, raw_model

  torch.utils._pytree._register_pytree_node(
Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# imports from filter
def filter_substitutions(substitutions):
    dels = list()
    for sub in substitutions:
        if sub.lower() in substitutions and sub.capitalize() in substitutions:
            dels.append(sub.capitalize())
        if sub.lower() in substitutions and sub.upper() in substitutions:
            dels.append(sub.upper())
        if sub in nltk.corpus.stopwords.words('french') or sub in string.punctuation:
            dels.append(sub)
    dels = list(set(dels))
    for d in dels:
        substitutions.remove(d)
    return substitutions

def filter_words(target, words, scores, tokens):
    # lets time
    time_filter = time.time()
    dels = list()
    toks = tokens.tolist()
    blacklist = [target, target.capitalize()]
    
    for w in words:
        if w.lower() in words and w.capitalize() in words:
            dels.append(w.capitalize())
        if w.lower() in words and w.upper() in words:
            dels.append(w.upper())
        if w in nltk.corpus.stopwords.words('french') or w in string.punctuation:
            dels.append(w)
        if w in blacklist:
            dels.append(w)
    

    dels = list(set(dels))
    for d in dels:
        del scores[words.index(d)]
        del toks[words.index(d)]
        words.remove(d)
    

    return words, scores, torch.tensor(toks)
# imports from scores

#Calculates the similarity score
def similarity_score(original_output, subst_output, k):
    mask_idx = k
    cos_sim = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
    weights = torch.div(torch.stack(list(original_output[3])).squeeze().sum(0).sum(0), (12 * 12.0))

    #Calculate the similarittimey score 
    #SIM(x, x'; k) = sum_i^L [ w_{i,k} * cos(h(x_i|x), h(x_i'|x')) ]

    #subst_output = raw_model(sent.reshape(1, sent.shape[0]))
    suma = 0.0
    sent_len = original_output[2][2].shape[1]

    for token_idx in range(sent_len):     
        original_hidden = original_output[2]
        subst_hidden = subst_output[2]

        #Calculate the contextualized representation of the i-th word as a concatenation of RoBERTa's values in its last four layers
        context_original = torch.cat( tuple( [original_hidden[hs_idx][:, token_idx, :] for hs_idx in [1, 2, 3, 4]] ), dim=1)
        context_subst = torch.cat( tuple( [subst_hidden[hs_idx][:, token_idx, :] for hs_idx in [1, 2, 3, 4]] ), dim=1)
        suma += weights[mask_idx][token_idx] * cos_sim(context_original, context_subst)

    substitute_validation = suma
    return substitute_validation


#Calculates the proposal score
def proposal_score(original_score, subst_scores):
    subst_scores = torch.tensor(subst_scores)
    # we have to revert original_score to cpu
    original_score = original_score.cpu()
    return np.log(torch.div(subst_scores , (1.0 - original_score)) )


#finals, props, subval = calc_scores(scores, input_ids[i], input_embeds[i], original_score, mask_position)
def calc_scores(scr, sentences, original_output, original_score, mask_index):
    #Get representations of all substitute sentences
    sentences= torch.tensor(sentences).to(device)
    subst_output = raw_model(sentences)

    prop_score = proposal_score(original_score, scr) # this is cpu
    substitute_validation = similarity_score(original_output, subst_output, mask_index)

    alpha = 0.003
    # Move prop_score to the same device as substitute_validation before the operation
    prop_score = prop_score.to(substitute_validation.device)

    final_score = substitute_validation + alpha*prop_score
    
    '''
    print("Proposal score: " + str(prop_score))
    print("Subst. validation: " + str(substitute_validation))
    print("Final score for " + str(final_score) + "\n")
    '''
    return final_score, prop_score, substitute_validation

WSD_PATTERN = r' (\w+)/\w+\.\w\.\d+' # (Woord captured), letterlijke slash, woord, letterlijke punt, letter, cijfer(s)
# Needed for the "Tour" keyword: remnant of older dataset.
def preproc_sentence(sentence):
    sent_preproc = re.sub(WSD_PATTERN, r'\1', sentence) # Alleen woord blijft over
    return sent_preproc

In [3]:
def lexsub_dropout(sentence, target, topk=15):
    sentence = sentence.replace('-', ' ')
    table = str.maketrans(dict.fromkeys(string.punctuation)) 

    #Remove unnecessary punctuation from the sentence (such as: "GET *free food *coupons!!")
    split_sent = nltk.word_tokenize(sentence)
    split_sent = list(map(lambda wrd : wrd.translate(table) if wrd not in string.punctuation else wrd, split_sent))
    original_sent = ' '.join(split_sent)

    #Get raw model word embeddings for words in the sentence
    original_token_input = tokenizer.encode(" "+original_sent, return_tensors="pt").to(device)
    original_output = raw_model(original_token_input)
    inputs_embeds = original_output[2][1]

    #The target word to substitute
    target_token_id = tokenizer.encode(" "+target)[1]
    input_ids = tokenizer.encode(" " + original_sent)
    
    mask_position = input_ids.index(target_token_id)
    #input_ids = torch.tensor(input_ids).to(device)
    #Set a percentage of randomly selected embedding weights of the target word to 0.
    embedding_dim = 768
    dropout_percent = 0.3
    dropout_amount = round(dropout_percent*embedding_dim)

    #Start timing the experiment.
    start = time.time()

    #Run multiple experiments and then take average because of stochastic nature of choosing indices to dropout (sometimes the predictions are gibberish)
    all_scores = dict()
    all_counts = dict()
    num_iterations = 5
    for it in range(num_iterations):
        #Choose the weight indices to drop out.
        dropout_indices = np.random.choice(embedding_dim, dropout_amount, replace=False)
        #Apply dropout to the embeddings
        inputs_embeds[0, mask_position, dropout_indices] = 0
        #Pass the embeddings where masked word's embedding is partially droppped out to the model 
        with torch.no_grad():
                output = lm_model(inputs_embeds=inputs_embeds)
        logits = output[0].squeeze()
        #Get top guesses
        mask_logits = logits[mask_position]
        top_tokens = torch.topk(mask_logits, k=16, dim=0)[1]
        scores = torch.softmax(mask_logits, dim=0)[top_tokens].tolist()
        words = [tokenizer.decode(i.item()).strip() for i in top_tokens]
        words, scores, top_tokens = filter_words(target, words, scores, top_tokens) # THIS IS WHAT MAKES IT SLOW
        assert len(words) == len(scores)
        if len(words) == 0: 
            continue

        #Calculate proposal scores, substitute validation scores, and final scores
        original_score = torch.softmax(mask_logits, dim=0)[target_token_id]
        sentences = list()
        for i in range(len(words)):
            subst_word = top_tokens[i]
            input_ids[mask_position] = int(subst_word)
            sentences.append(list(input_ids))
       
        finals, props, subval = calc_scores(scores, sentences, original_output, original_score, mask_position)
        finals = map(lambda f : float(f), finals)
        props = map(lambda f : float(f), props)
        subval = map(lambda f : float(f), subval)

        if target in words:
            words.remove(target)

        #Update total scores and counts in the dictionary
        res = dict(zip(words, finals))
        for w, s in res.items():
            all_scores[w] = all_scores[w] + s if w in all_scores.keys() else s
            all_counts[w] = all_counts[w] + 1 if w in all_counts.keys() else 1
    #Get the average of accumulated scores.
    for w, s in all_scores.items():
        all_scores[w] = s / all_counts[w]
    words, finals = list(all_scores.keys()), list(all_scores.values())


    #Sort the found substitutes by scores and print them out.
    x = dict(zip(words, finals)) # list of words and final scores in dict form
    sorted_list = list(sorted(x.items(), key=lambda item: item[1], reverse=True))[:topk] # take the "topk" best substitutes
    print(["({0}: {1:0.8f})".format(k, v) for k,v in sorted_list])
    print("Elapsed time: ", time.time() - start, "\n")
    return sorted_list


In [4]:
test = lexsub_dropout("Le chat est sur le tapis", "chat")

['(chaton: 0.97169087)', '(félin: 0.96996006)', '(chien: 0.96967189)', '(chiot: 0.96169713)', '(minet: 0.96091932)', '(garde: 0.96065670)', '(chatte: 0.96041462)', '(serpent: 0.95942074)', '(roi: 0.95649716)', '(renard: 0.95592791)', '(poil: 0.95367508)', '(mâle: 0.95355189)', '(matin: 0.95318750)', '(nuage: 0.95109892)', '(tchat: 0.95043768)']
Elapsed time:  0.3506128787994385 


# We apply this method to the dataset

In [4]:
model_name = "Camembert"
all_predictions = {}

In [6]:
lemmatizer = fll()
for filename in os.listdir(f"{model_name}/Experiment_1/Curated/"):
    if filename.endswith("Tour.csv"):
        df = pd.read_csv(f"{model_name}/Experiment_1/Curated/" + filename, sep=";", encoding="utf-8", header=0)
        # We calculate predictions
        predictions = []
        keyword = df["source"][1].lower()
        sentences = df["match"].values
        if keyword == "tour":
            sentences = [preproc_sentence(sentence) for sentence in sentences]
        for sentence in sentences:
            print(sentence)
            substitutions = lexsub_dropout(sentence, keyword)
            sentence_tuple = (sentence, substitutions)
            predictions.append(sentence_tuple)
        all_predictions[keyword] = predictions
        

 J'ai refais plusieurs fois le tour du magasin en vain et j'étais, ma foi, trèèèès triste (T___T)! 
['(tours: 0.98619188)', '(coup: 0.96607646)', '(parcours: 0.96326862)', '(détour: 0.96256906)', '(chemin: 0.96212000)', '(mur: 0.96083177)', '(visite: 0.96078038)', '(trajet: 0.95991299)', '(centre: 0.95966133)', '(portrait: 0.95885819)', '(voyage: 0.95871965)', '(plein: 0.95853382)', '(panneau: 0.95797357)', '(temps: 0.95768526)', '(reste: 0.95750542)']
Elapsed time:  0.7320029735565186 

 Bonjour, J'ai acheté ce tour ... et pour tout dire, je suis déçu. 
['(tours: 0.97625327)', '(panneau: 0.95627072)', '(mur: 0.95604030)', '(groupe: 0.95528859)', '(fond: 0.95498383)', '(collier: 0.95372981)', '(ventilateur: 0.95323434)', '(guide: 0.95305356)', '(jeu: 0.95195243)', '(film: 0.95190018)', '(instrument: 0.95180967)', '(mécanisme: 0.95175789)', '(ensemble: 0.95165843)', '(robot: 0.95161256)', '(masque: 0.95152220)']
Elapsed time:  0.38699793815612793 

 Ce tour ne vaut pas 45 €... Pour ce p

In [12]:
# We save the predictions to a pandas dataframe (columns "keyword", 'sentence', 'predictions'
for match, predictions in all_predictions.items():
    df = pd.DataFrame(predictions, columns=["match", "predictions"])
    df["source"] = match
    # we add a 'match'
    df.to_csv(f"{model_name}/Experiment_3/Curated/predictions/{match}.csv", sep=";", encoding="utf-8", index=False)

In [13]:
df = pd.DataFrame(columns=["source", "match", "predictions"])
for match, predictions in all_predictions.items():
    for sentence, subs in predictions:
        new_row = pd.DataFrame({"source": [match], "match": [sentence], "predictions": [subs]})
        df = pd.concat([df, new_row], ignore_index=True)

In [14]:
# We then create a list of all the different words in "forward" and "backward"
all_words = {}
for match, predictions in all_predictions.items():
    all_words[match] = []
    for sentence, subs in predictions:
        all_words[match].extend([prediction[0] for prediction in subs])
    all_words[match] = list(set(all_words[match]))



In [15]:
# Represent the sentences as sparse vectors
# We do this by making a dataframe for each word: the index is the sentence, the columns are the words, and the values are the probabilities
# We transform the dataframe
vectors = {}
for keyword, predictions in all_predictions.items():
    df_sparse = pd.DataFrame(columns=["source", "match"] + all_words[keyword])
    print("created dataframe for ", keyword)
    rows = []
    for sentence, subs in predictions:
        row = {"source": keyword, "match": sentence}
        for word in all_words[keyword]:
            row[word] = 0
        for prediction in subs:
            word, prob = prediction
            row[word] = + prob  # We want to add to it if it already exists (forward and backward)
        rows.append(row)
    df_sparse = pd.DataFrame(rows)
    vector = df_sparse.drop(columns=["source", "match"]).values
    # We now have a numpy ndarray, which we can transform using the TfidfTransformer
    #transformer = TfidfTransformer()
    #vector_Tfidf = transformer.fit_transform(vector)
    svd = TruncatedSVD(n_components=100)
    vector_SVD = svd.fit_transform(vector)
    vectors[keyword] = vector_SVD
    print(f"converted {keyword} into {type(vector)}")


created dataframe for  tour
converted tour into <class 'numpy.ndarray'>


In [16]:
# Initialize the clustering algorithm
warnings.filterwarnings("ignore")
# 'n_clusters' is the number of clusters we want to form (and also the number of clusters to be found)
# 'linkage' is the linkage criterion (can be 'ward', 'complete', 'average', 'single')
clustermin = 3

agg_clustering = AgglomerativeClustering(n_clusters=clustermin, metric='cosine', linkage='average')

# Range of potential cluster numbers to test
cluster_range = range(clustermin, 11)

for keyword, vector_SVD in vectors.items():
    df = pd.read_csv(f"{model_name}/{keyword}.csv", sep=";", encoding="utf-8", header=0)
    # Apply the clustering algorithm to the SVD-transformed vectors
    agg_clusters = agg_clustering.fit_predict(vector_SVD)

    # 'agg_clusters' is now an array where the i-th element is the cluster label of the i-th instance
    # We add "1" to all clusters to start counting at 1 instead of 0
    agg_clusters += 1
    # We add the cluster label to the original dataframe, corresponding to the correct index

    df["agg_cluster_sub"] = agg_clusters

    # List to hold BIC values
    bic_values = []

    # Fit Gaussian Mixture Models for each number of clusters
    for i in cluster_range:
        print(f"Fitting model with {i} clusters")
        gmm = GaussianMixture(n_components=i, random_state=0).fit(vector_SVD)
        bic_values.append(gmm.bic(vector_SVD))

    # Find the number of clusters that gives the minimum BIC
    optimal_clusters = cluster_range[np.argmin(bic_values)]
    print(f"Optimal number of clusters: {optimal_clusters}")

    # Fit the optimal model
    gmm_optimal = GaussianMixture(n_components=optimal_clusters).fit(vector_SVD)

    # Predict the cluster for each data point
    BIC_clusters = gmm_optimal.predict(vector_SVD)
    # We want them to start counting at "1" instead of "0"
    BIC_clusters += 1
    print(f"Number of clusters: {len(np.unique(BIC_clusters))}")
    df["BIC_cluster_sub"] = BIC_clusters

    # We calculate a score for the clustering scores (starting before the outliers are removed)
    # Group the dataframe by "sense" and "cluster", and calculate the size of each group
    df_grouped = df.groupby(["sense", "agg_cluster_sub"]).size().reset_index(name="count")

    # Sort these clusters by size in descending order
    df_grouped = df_grouped.sort_values(by="count", ascending=False)

    # Initialize an empty dictionary to store the cluster numbers that have been assigned as default clusters
    # If the cluster number is not taken, we assign it to the corresponding "sense"
    # Else, we try to assign it to the next cluster number
    cluster_dict = {}
    for index, row in df_grouped.iterrows():
        if row["sense"] not in cluster_dict:
            if row["agg_cluster_sub"] not in cluster_dict.values():
                cluster_dict[row["sense"]] = row["agg_cluster_sub"]

        # We add "sense" values that have no entry in cluster_dict and set value to 0 (always seen as wrong)
    for sense in df["sense"].unique():
        if sense not in cluster_dict:
            cluster_dict[sense] = 0

    # Add a new column "default" to the original dataframe
    df["agg_default_sub"] = df.apply(lambda x: x["agg_cluster_sub"] == cluster_dict[x["sense"]], axis=1)

    # We calculate the percentage of default clusters
    percentage_default = (df["agg_default_sub"].sum() / len(df)) * 100
    # We also calculate this separately for each "sense"
    percentage_default_mean = df.groupby("sense")["agg_default_sub"].mean() * 100

    # We want the mean score across all senses, as it does not mean a lot if a program can correctly define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted = percentage_default_mean.mean()
    print("agg:")
    print("Score for each", percentage_default_mean)
    print("Overall score", percentage_default)

    #----------------------------------------------------------------------------------------------------
    # We do this again for BIC clusters
    #----------------------------------------------------------------------------------------------------

    # We calculate a score for the clustering scores (starting before the outliers are removed)
    # Group the dataframe by "sense" and "cluster", and calculate the size of each group
    df_grouped = df.groupby(["sense", "BIC_cluster_sub"]).size().reset_index(name="count")

    # Sort these clusters by size in descending order
    df_grouped = df_grouped.sort_values(by="count", ascending=False)

    # Initialize an empty dictionary to store the cluster numbers that have been assigned as default clusters
    # If the cluster number is not taken, we assign it to the corresponding "sense"
    # Else, we try to assign it to the next cluster number
    cluster_dict = {}
    for index, row in df_grouped.iterrows():
        if row["sense"] not in cluster_dict:
            if row["BIC_cluster_sub"] not in cluster_dict.values():
                cluster_dict[row["sense"]] = row["BIC_cluster_sub"]

        # We add "sense" values that have no entry in cluster_dict and set value to 0 (always seen as wrong)
    for sense in df["sense"].unique():
        if sense not in cluster_dict:
            cluster_dict[sense] = 0

    # Add a new column "default" to the original dataframe
    df["BIC_default_sub"] = df.apply(lambda x: x["BIC_cluster_sub"] == cluster_dict[x["sense"]], axis=1)

    # We calculate the percentage of default clusters
    percentage_default = (df["BIC_default_sub"].sum() / len(df)) * 100
    # We also calculate this separately for each "sense"
    percentage_default_mean = df.groupby("sense")["BIC_default_sub"].mean() * 100

    # We want the mean score across all senses, as it does not mean a lot if a program can correctly define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted = percentage_default_mean.mean()
    print("BIC:")
    print("Score for each", percentage_default_mean)
    print("Overall score", percentage_default)

    df.to_csv(f"{model_name}/Experiment_3/Curated/{keyword}.csv", sep=";", encoding="utf-8", index=False)

Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 3
Number of clusters: 3
agg:
Score for each sense
lap       0.000000
round     0.000000
tower     0.000000
trick    21.428571
turn     95.833333
Name: agg_default_sub, dtype: float64
Overall score 35.13513513513514
BIC:
Score for each sense
lap        0.000000
round    100.000000
tower      0.000000
trick     42.857143
turn     100.000000
Name: BIC_default_sub, dtype: float64
Overall score 59.45945945945946


In [17]:
import plotly.express as px

In [20]:
# We load in the dataframe for "Bureau"
df = pd.read_csv(f"{model_name}/Experiment_3/Curated/Tour.csv", sep=";", encoding="utf-8", header=0)

In [21]:
fig_2d_cluster = px.scatter(df, x="x", y="y", color="sense", symbol="BIC_cluster_sub",
                 hover_data='match',
                 template="plotly_white")
fig_2d_cluster.update_layout(
    title={
        'text': f"Clusters Bureau<br><sub>Calculated with Lexical substitution following Zhou 2019</sub>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig_2d_cluster.show()


In [22]:
fig_3d_cluster = px.scatter_3d(df, x="x", y="y", z="z", color="sense", symbol="BIC_cluster_sub",
                 hover_data='match',
                 template="plotly_white")
fig_3d_cluster.update_layout(
    title={
        'text': f"Clusters Bureau<br><sub>Calculated with Lexical substitution following Zhou 2019</sub>",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig_3d_cluster.show()

In [76]:
# We save these files straight into our appendix
if not os.path.exists("Results/Appendix/Experiment_3/Curated"):
    os.makedirs("Results/Appendix/Experiment_3/Curated")
#fig_2d_cluster.write_html(f"Results/Appendix/Experiment_3/Curated/Bureau_2d_cluster.html")
