# Experiment 3: Clustering on substitute word vectors

In [1]:
import torch
import pandas as pd
import os
import re
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import v_measure_score, adjusted_rand_score
import numpy as np
import statistics
import openpyxl

The second part of this notebook only works if the first part has been run with both 'Flaubert' and 'Camembert' as model names.

In [30]:
# Name of the model whose results are analysed. It is used as the root folder of the
# input CSV files and as a prefix of the saved result files.
model_name = "Flaubert"

# AGG-clusters

In [31]:
# Result stores for the AGG clustering, all keyed by target word.
weighted_scores_agg = {} # our original metric: the unweighted mean of the per-sense scores
overall_scores_agg = {} # the fraction of correct predictions across all senses
all_in_one_cluster_overall_scores_agg = {} # the fraction of correct predictions if no clustering would take place
amount_senses = {} # the number of different senses (does not differ)
v_measure_scores_agg = {} # the v-measure score
rand_scores_agg = {} # the adjusted rand score
# We also calculate a F1 score in a later code block

curated_dir = f"{model_name}/Experiment_2/Curated/"

# os.listdir gives no ordering guarantee; sorting keeps the printed report and the
# insertion order of the dictionaries above reproducible from run to run.
for filename in sorted(os.listdir(curated_dir)):
    if not filename.endswith(".csv"):
        continue
    df = pd.read_csv(curated_dir + filename, sep=";", encoding="utf-8", header=0)
    # The target word is read from the row labelled 1 of the "source" column
    # (assumes every row of a file holds the same word - TODO confirm)
    target_word = df["source"][1]
    print(target_word)
    # We calculate the number of different senses
    senses = len(df["sense"].unique())
    print("Number of senses:", senses)
    # Fraction of rows flagged in "agg_default_sub" over the whole file
    percentage_default_agg = (df["agg_default_sub"].sum() / len(df))
    # The same fraction, computed separately for each "sense"
    percentage_default_mean_agg = df.groupby("sense")["agg_default_sub"].mean()
    # Harmonic mean of the per-sense scores (reusing the series computed above);
    # it drops to 0 as soon as one sense scores 0
    harmonic_mean_agg = statistics.harmonic_mean(percentage_default_mean_agg)
    # We want the mean score across all senses, as it does not mean a lot if a program can
    # correctly define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted_agg = percentage_default_mean_agg.mean()
    v_measure_agg = v_measure_score(df["sense"], df["agg_cluster_sub"])
    rand_agg = adjusted_rand_score(df["sense"], df["agg_cluster_sub"])

    print("Score for each", percentage_default_mean_agg*100)
    print("Overall score", percentage_default_agg*100)
    print("Weighted score", percentage_weighted_agg*100)
    print("Harmonic mean", harmonic_mean_agg*100)
    print("V-measure score", v_measure_agg*100)
    print("Adjusted Rand score", rand_agg*100)
    weighted_scores_agg[target_word] = percentage_weighted_agg
    overall_scores_agg[target_word] = percentage_default_agg
    amount_senses[target_word] = senses
    v_measure_scores_agg[target_word] = v_measure_agg
    rand_scores_agg[target_word] = rand_agg

    # We also calculate the score if no clustering would take place:
    # every instance then receives the most frequent sense
    default = df["sense"].value_counts().idxmax()
    # The overall score is the share of this sense in the total dataframe
    all_in_one_cluster_score_agg = (df["sense"] == default).sum() / len(df)
    all_in_one_cluster_overall_scores_agg[target_word] = all_in_one_cluster_score_agg
    print("All-in-one score", all_in_one_cluster_score_agg*100, "\n\n")

Avocat
Number of senses: 2
Score for each sense
avocado     3.703704
lawyer     96.428571
Name: agg_default_sub, dtype: float64
Overall score 66.26506024096386
Weighted score 50.06613756613757
Harmonic mean 7.133421400264201
V-measure score 1.559295869676433
Adjusted Rand score -0.06809171721353265
All-in-one score 67.46987951807229 


Bien
Number of senses: 3
Score for each sense
good          6.896552
property     97.959184
wellbeing     5.000000
Name: agg_default_sub, dtype: float64
Overall score 52.04081632653062
Weighted score 36.618578465869106
Harmonic mean 8.44574780058651
V-measure score 4.718652672395608
Adjusted Rand score 1.4073174086804534
All-in-one score 50.0 


Bureau
Number of senses: 3
Score for each sense
cabinet     0.000000
desk        2.857143
office     98.148148
Name: agg_default_sub, dtype: float64
Overall score 52.42718446601942
Weighted score 33.66843033509701
Harmonic mean 0
V-measure score 3.0925518897676674
Adjusted Rand score -0.3351663176785294
All-in-on

In [32]:
# For every target word we keep two dictionaries:
#   all_scores_agg[word][sense]  -> {"precision", "recall", "F1"} of that sense
#   all_lengths_agg[word][sense] -> number of instances of that sense ("total" = all rows)
all_scores_agg = {}
all_lengths_agg = {}
for filename in os.listdir(f"{model_name}/Experiment_2/Curated/"):
    if not filename.endswith(".csv"):
        continue
    df_F1 = pd.read_csv(f"{model_name}/Experiment_2/Curated/" + filename, sep=";", encoding="utf-8", header=0)

    target_word = df_F1["source"][1]
    print("\n\n", target_word)
    total_df_length = len(df_F1)
    # Number of different senses in this file
    senses = len(df_F1["sense"].unique())
    print("Number of senses:", senses)
    target_word_dictionary = {}
    target_word_lengths = {"total": total_df_length}
    for sense in df_F1["sense"].unique():
        sense_df = df_F1[df_F1["sense"] == sense]
        sense_len = len(sense_df)

        # Instances of this sense that are flagged in "agg_default_sub"
        correct_df = sense_df[sense_df["agg_default_sub"] == True]
        correct_len = len(correct_df)
        # The cluster of the first flagged instance is taken as the cluster of this sense;
        # without any flagged instance a placeholder is used that matches no cluster label
        correct_cluster = correct_df["agg_cluster_sub"].iloc[0] if correct_len > 0 else "No cluster"

        cluster_len = len(df_F1[df_F1["agg_cluster_sub"] == correct_cluster])
        print("\nsense:", sense)
        print("instances of sense:", sense_len)
        print("amount of correct instances:", correct_len)
        print("correct cluster number:", correct_cluster)
        print("total amount of points in biggest cluster:", cluster_len)

        # precision: correct instances / size of the cluster they belong to
        if cluster_len == 0:
            precision = 0
            print("precision zero")
        else:
            precision = correct_len / cluster_len
        # recall: correct instances / all instances of the sense
        recall = correct_len / sense_len
        print("precision:", precision)
        print("recall", recall)

        # Per-sense F1 is the harmonic mean of precision and recall;
        # the weighted total per word is calculated in a later cell
        F1 = statistics.harmonic_mean((precision, recall))
        print("F1", F1)

        scores = {"precision": precision, "recall": recall, "F1": F1}
        target_word_dictionary[sense] = scores
        target_word_lengths[sense] = sense_len
    all_scores_agg[target_word] = target_word_dictionary
    all_lengths_agg[target_word] = target_word_lengths
            



 Avocat
Number of senses: 2

sense: lawyer
instances of sense: 56
amount of correct instances: 54
correct cluster number: 1
total amount of points in biggest cluster: 80
precision: 0.675
recall 0.9642857142857143
F1 0.7941176470588236

sense: avocado
instances of sense: 27
amount of correct instances: 1
correct cluster number: 3
total amount of points in biggest cluster: 2
precision: 0.5
recall 0.037037037037037035
F1 0.06896551724137931


 Bien
Number of senses: 3

sense: property
instances of sense: 49
amount of correct instances: 48
correct cluster number: 1
total amount of points in biggest cluster: 94
precision: 0.5106382978723404
recall 0.9795918367346939
F1 0.6713286713286714

sense: wellbeing
instances of sense: 20
amount of correct instances: 1
correct cluster number: 2
total amount of points in biggest cluster: 1
precision: 1.0
recall 0.05
F1 0.09523809523809523

sense: good
instances of sense: 29
amount of correct instances: 2
correct cluster number: 3
total amount of poin

In [33]:
# Total F1 per target word: the average of the per-sense F1 scores, each weighted by the
# share of that sense in the file (instances of the sense / total number of instances).
# F1_scores_agg maps each target word to this weighted F1.
F1_scores_agg = {}
for target_word, sense_scores in all_scores_agg.items():
    F1 = 0
    weight = 0  # running sum of the weights, only printed as a sanity check
    print("\n\n", target_word)
    for sense, sense_result in sense_scores.items():
        score = sense_result["F1"]
        print(sense)
        print("score:", score)

        len_sense = all_lengths_agg[target_word][sense]
        print("length of sense:", len_sense)

        len_total = all_lengths_agg[target_word]["total"]
        print("total length:", len_total)

        added_weight = len_sense / len_total
        weight += added_weight
        print("added weight:", added_weight, "for a total of", weight)

        F1 += score * added_weight
        print("F1 score after calc:", F1*100)
    F1_scores_agg[target_word] = F1





 Avocat
lawyer
score: 0.7941176470588236
length of sense: 56
total length: 83
added weight: 0.6746987951807228 for a total of 0.6746987951807228
F1 score after calc: 53.579021970233875
avocado
score: 0.06896551724137931
length of sense: 27
total length: 83
added weight: 0.3253012048192771 for a total of 1.0
F1 score after calc: 55.82247855519441


 Bien
property
score: 0.6713286713286714
length of sense: 49
total length: 98
added weight: 0.5 for a total of 0.5
F1 score after calc: 33.56643356643357
wellbeing
score: 0.09523809523809523
length of sense: 20
total length: 98
added weight: 0.20408163265306123 for a total of 0.7040816326530612
F1 score after calc: 35.51006816312939
good
score: 0.125
length of sense: 29
total length: 98
added weight: 0.29591836734693877 for a total of 1.0
F1 score after calc: 39.20904775496612


 Bureau
desk
score: 0.05555555555555555
length of sense: 35
total length: 103
added weight: 0.33980582524271846 for a total of 0.33980582524271846
F1 score after ca

In [34]:
# Baseline: the F1 score of each word when no clustering takes place.
# Every row is put in one single cluster (0) and only the most frequent sense counts as correct.
# As before, we store per-sense precision/recall/F1 and the per-sense lengths in two dictionaries.
all_scores_no_cluster = {}
all_lengths_no_cluster = {}

# In all further experiments, we keep working with the same dataset as in Experiment 1.
# NOTE(review): the other cells of this notebook read "Experiment_2/Curated" - confirm that
# comparing against the Experiment_1 files is intended.
for filename in os.listdir(f"{model_name}/Experiment_1/Curated"):
    if not filename.endswith(".csv"):
        continue
    df_F1 = pd.read_csv(f"{model_name}/Experiment_1/Curated/" + filename, sep=";", encoding="utf-8", header=0)

    target_word = df_F1["source"][1]
    # We undo clustering: all rows go to cluster 0,
    # and the "default" (i.e. "correct") category is the most frequent sense
    df_F1["cluster"] = 0
    most_freq_sense = df_F1["sense"].value_counts().idxmax()
    df_F1["default"] = (df_F1["sense"] == most_freq_sense).astype(int)
    print("\n\n", target_word)
    total_df_length = len(df_F1)
    # Number of different senses in this file
    senses = len(df_F1["sense"].unique())
    print("Number of senses:", senses)
    target_word_dictionary = {}
    target_word_lengths = {"total": total_df_length}
    for sense in df_F1["sense"].unique():
        sense_df = df_F1[df_F1["sense"] == sense]
        sense_len = len(sense_df)

        # Instances of this sense that count as correct
        correct_df = sense_df[sense_df["default"] == True]
        correct_len = len(correct_df)
        # Without any correct instance a placeholder is used that matches no cluster label
        correct_cluster = correct_df["cluster"].iloc[0] if correct_len > 0 else "No cluster"

        cluster_len = len(df_F1[df_F1["cluster"] == correct_cluster])
        print("\nsense:", sense)
        print("instances of sense:", sense_len)
        print("amount of correct instances:", correct_len)
        print("correct cluster number:", correct_cluster)
        print("total amount of points in biggest cluster:", cluster_len)

        # precision: correct instances / size of the cluster they belong to
        if cluster_len == 0:
            precision = 0
            print("precision zero")
        else:
            precision = correct_len / cluster_len
        # recall: correct instances / all instances of the sense
        recall = correct_len / sense_len
        print("precision:", precision)
        print("recall", recall)

        F1 = statistics.harmonic_mean((precision, recall))
        print("F1", F1)

        scores = {"precision": precision, "recall": recall, "F1": F1}
        target_word_dictionary[sense] = scores
        target_word_lengths[sense] = sense_len
    all_scores_no_cluster[target_word] = target_word_dictionary
    all_lengths_no_cluster[target_word] = target_word_lengths

# Total F1 per target word: the per-sense F1 scores averaged with the share of each sense
# (instances of the sense / total number of instances) as weight.
# F1_scores_no_cluster maps each target word to this weighted F1.
F1_scores_no_cluster = {}
for target_word, sense_scores in all_scores_no_cluster.items():
    F1 = 0
    weight = 0  # running sum of the weights, only printed as a sanity check
    print("\n\n", target_word)
    for sense, sense_result in sense_scores.items():
        score = sense_result["F1"]
        print(sense)
        print("score:", score)

        len_sense = all_lengths_no_cluster[target_word][sense]
        print("length of sense:", len_sense)

        len_total = all_lengths_no_cluster[target_word]["total"]
        print("total length:", len_total)

        added_weight = len_sense / len_total
        weight += added_weight
        print("added weight:", added_weight, "for a total of", weight)

        F1 += score * added_weight
        print("F1 score after calc:", F1)
    F1_scores_no_cluster[target_word] = F1





 Avocat
Number of senses: 2

sense: lawyer
instances of sense: 56
amount of correct instances: 56
correct cluster number: 0
total amount of points in biggest cluster: 84
precision: 0.6666666666666666
recall 1.0
F1 0.8

sense: avocado
instances of sense: 28
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precision: 0
recall 0.0
F1 0


 Bien
Number of senses: 3

sense: property
instances of sense: 49
amount of correct instances: 49
correct cluster number: 0
total amount of points in biggest cluster: 98
precision: 0.5
recall 1.0
F1 0.6666666666666666

sense: wellbeing
instances of sense: 20
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precision: 0
recall 0.0
F1 0

sense: good
instances of sense: 29
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precisio

In [35]:
# The target words, split by part of speech
Nouns = ["Avocat", "Bien", "Bureau", "Faculté", "Glace", "Souris", "Tour", "Vol"]
Verbs = ["Filer", "Supporter", "Tirer"]


def _average_over(scores, words):
    """Return the mean of ``scores[word]`` over all given words."""
    return np.mean([scores[word] for word in words])


# Averages of every metric over the nouns
nouns_average_weighted_agg = _average_over(weighted_scores_agg, Nouns)
nouns_average_overall_agg = _average_over(overall_scores_agg, Nouns)
nouns_average_all_in_one_agg = _average_over(all_in_one_cluster_overall_scores_agg, Nouns)
nouns_average_v_measure_agg = _average_over(v_measure_scores_agg, Nouns)
nouns_average_rand_agg = _average_over(rand_scores_agg, Nouns)
nouns_average_F1_agg = _average_over(F1_scores_agg, Nouns)
nouns_average_F1_no_cluster = _average_over(F1_scores_no_cluster, Nouns)

# Averages of every metric over the verbs
verbs_average_weighted_agg = _average_over(weighted_scores_agg, Verbs)
verbs_average_overall_agg = _average_over(overall_scores_agg, Verbs)
verbs_average_all_in_one_agg = _average_over(all_in_one_cluster_overall_scores_agg, Verbs)
verbs_average_v_measure_agg = _average_over(v_measure_scores_agg, Verbs)
verbs_average_rand_agg = _average_over(rand_scores_agg, Verbs)
verbs_average_F1_agg = _average_over(F1_scores_agg, Verbs)
verbs_average_F1_no_cluster = _average_over(F1_scores_no_cluster, Verbs)

# Report: first all noun averages, then all verb averages
noun_report = [
    ("Nouns average of weighted scores:", nouns_average_weighted_agg),
    ("Nouns average of overall scores:", nouns_average_overall_agg),
    ("Nouns average of all in one cluster scores:", nouns_average_all_in_one_agg),
    ("Nouns average of v-measure cluster score:", nouns_average_v_measure_agg),
    ("Nouns average of rand score:", nouns_average_rand_agg),
    ("Nouns average of F1 scores:", nouns_average_F1_agg),
    ("Nouns average of F1 scores when no clustering takes place:", nouns_average_F1_no_cluster),
]
verb_report = [
    ("Verbs average of weighted scores:", verbs_average_weighted_agg),
    ("Verbs average of overall scores:", verbs_average_overall_agg),
    ("Verbs average of all in one cluster scores:", verbs_average_all_in_one_agg),
    ("Verbs average of v-measure cluster score:", verbs_average_v_measure_agg),
    ("Verbs average of rand score:", verbs_average_rand_agg),
    ("Verbs average of F1 scores:", verbs_average_F1_agg),
    ("Verbs average of F1 scores when no clustering takes place:", verbs_average_F1_no_cluster),
]
for label, value in noun_report:
    print(label, value)
print("\n\n")
for label, value in verb_report:
    print(label, value)


Nouns average of weighted scores: 0.4448430820594663
Nouns average of overall scores: 0.5886544329361538
Nouns average of all in one cluster scores: 0.5147473294527403
Nouns average of v-measure cluster score: 0.2228025652098367
Nouns average of rand score: 0.19676685501966798
Nouns average of F1 scores: 0.47712131595320795
Nouns average of F1 scores when no clustering takes place: 0.3574206110459456



Verbs average of weighted scores: 0.436291064032158
Verbs average of overall scores: 0.555051044634378
Verbs average of all in one cluster scores: 0.5670405982905983
Verbs average of v-measure cluster score: 0.24564319539678547
Verbs average of rand score: 0.1056622184706341
Verbs average of F1 scores: 0.4657311758030552
Verbs average of F1 scores when no clustering takes place: 0.41897442708157473


In [36]:
# We create a dataframe from the dictionaries to eventually save the results to a csv file.
# Every column is looked up by target word (the dataframe index) instead of being assigned
# positionally from dict.values(): this way a value can never end up on the wrong row when
# the dictionaries were filled in a different order (amount_senses, for instance, is filled
# again by the BIC section of this notebook).
df_agg = pd.DataFrame.from_dict(weighted_scores_agg, orient="index", columns=["Weighted"])
df_agg["Overall"] = [overall_scores_agg[word] for word in df_agg.index]
df_agg["V_measure"] = [v_measure_scores_agg[word] for word in df_agg.index]
df_agg["Rand"] = [rand_scores_agg[word] for word in df_agg.index]
df_agg["F1"] = [F1_scores_agg[word] for word in df_agg.index]

df_agg["Type"] = ["Noun" if word in Nouns else "Verb" for word in df_agg.index]
df_agg["Amount_senses"] = [amount_senses[word] for word in df_agg.index]

# We also create a separate dataframe for the all in one cluster (no clustering) scores
df_all_in_one_agg = pd.DataFrame.from_dict(all_in_one_cluster_overall_scores_agg, orient="index", columns=["Overall_no_cluster"])
df_all_in_one_agg["F1_no_cluster"] = [F1_scores_no_cluster[word] for word in df_all_in_one_agg.index]
df_all_in_one_agg["Type"] = ["Noun" if word in Nouns else "Verb" for word in df_all_in_one_agg.index]

# We add the average rows to a copy, so that df_agg itself keeps one row per word.
# The "Type" and "Amount_senses" cells of the average rows are left empty.
df_with_average_agg = df_agg.copy()
df_with_average_agg.loc["Average_nouns"] = [nouns_average_weighted_agg, nouns_average_overall_agg, nouns_average_v_measure_agg, nouns_average_rand_agg, nouns_average_F1_agg, "", ""]
df_with_average_agg.loc["Average_verbs"] = [verbs_average_weighted_agg, verbs_average_overall_agg, verbs_average_v_measure_agg, verbs_average_rand_agg, verbs_average_F1_agg, "", ""]
# The global average is taken from df_agg, so the two average rows above are not included
df_with_average_agg.loc["Global_average"] = [df_agg["Weighted"].mean(), df_agg["Overall"].mean(), df_agg["V_measure"].mean(), df_agg["Rand"].mean(), df_agg["F1"].mean(), "", ""]

# As we don't make a copy here, we have to calculate the mean before adding the other averages!
df_all_in_one_agg.loc["Global_average"] = [df_all_in_one_agg["Overall_no_cluster"].mean(), df_all_in_one_agg["F1_no_cluster"].mean(), ""]
df_all_in_one_agg.loc["Average_nouns"] = [nouns_average_all_in_one_agg, nouns_average_F1_no_cluster, ""]
df_all_in_one_agg.loc["Average_verbs"] = [verbs_average_all_in_one_agg, verbs_average_F1_no_cluster, ""]

In [37]:
# Scatter plot of the weighted score against the overall score of each word.
# (The figure was built a second time after being shown; that duplicate has been removed.)
fig_zegt_weinig = px.scatter(df_agg, x="Weighted", y="Overall", text=df_agg.index, color="Type", title="Scores for each word, op een manier die weinig zegt eigenlijk",template="plotly_white")
fig_zegt_weinig.show()

# Scatter plot of the V-measure against the F1 score of each word;
# this is the figure that is written to an HTML file when the results are saved
fig = px.scatter(df_agg, x="V_measure", y="F1", text=df_agg.index, color="Type", title="Scores for each word",template="plotly_white")
fig.show()

# Estimating the number of clusters
We will now calculate if the model correctly predicts the number of clusters.
We will count the number of different manually annotated senses and compare this to the amount of generated clusters.

In [38]:
# Words sorted by how well the number of clusters matches the number of annotated senses
correct = []
close = [] # One cluster off, often the result of different granularity. TODO: is it ok to report like this or do we have to be more strict?
incorrect = []
# Signed difference per word: clusters - senses (negative = too few clusters, positive = too many)
differences_agg = {}
for filename in os.listdir(f"{model_name}/Experiment_2/Curated/"):
    if not filename.endswith(".csv"):
        continue
    df = pd.read_csv(f"{model_name}/Experiment_2/Curated/" + filename, sep=";", encoding="utf-8", header=0)
    target_word = df["source"][1]
    print(target_word)
    # Number of manually annotated senses versus number of generated clusters
    senses = len(df["sense"].unique())
    clusters = len(df["agg_cluster_sub"].unique())
    print("Number of senses:", senses)
    print("Number of clusters:", clusters)

    difference = clusters - senses
    if difference == 0:
        print("Correct number of clusters")
        correct.append(target_word)
    else:
        off_by = abs(difference)
        print(off_by, "too few clusters" if difference < 0 else "too many clusters")
        # Exactly one cluster off still counts as "close"
        (close if off_by == 1 else incorrect).append(target_word)

    differences_agg[target_word] = difference

print("Correct:", correct)
print("Close:", close)
print("Incorrect:", incorrect)
# Share of the words for which the estimate is correct or one cluster off
print("Percentage approach:", (len(correct) + len(close)) / ((len(correct) + len(close) + len(incorrect))))

Avocat
Number of senses: 2
Number of clusters: 3
1 too many clusters
Bien
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Bureau
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Faculté
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Filer
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Glace
Number of senses: 5
Number of clusters: 3
2 too few clusters
Souris
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Supporter
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Tirer
Number of senses: 2
Number of clusters: 3
1 too many clusters
Tour
Number of senses: 5
Number of clusters: 3
2 too few clusters
Vol
Number of senses: 2
Number of clusters: 3
1 too many clusters
Correct: ['Bien', 'Bureau', 'Faculté', 'Filer', 'Souris', 'Supporter']
Close: ['Avocat', 'Tirer', 'Vol']
Incorrect: ['Glace', 'Tour']
Percentage approach: 0.8181818181818182


**Results for Camembert:**
Correct: ['Filer', 'Glace']
Close: ['Avocat', 'Bien', 'Bureau', 'Faculté', 'Tirer', 'Tour', 'Vol']
Incorrect: ['Souris', 'Supporter']
Percentage approach: 0.8181818181818182

**Results for Flaubert:**
Correct: ['Bien']
Close: ['Avocat', 'Bureau', 'Faculté', 'Filer', 'Glace', 'Tirer', 'Vol']
Incorrect: ['Souris', 'Supporter', 'Tour']
Percentage approach: 0.7272727272727273

In [39]:
# We add the signed cluster-count difference of each word as a new column;
# rows without an entry in differences_agg (the average rows) get an empty string
df_with_average_agg["Cluster_Estimation"] = [differences_agg.get(word, "") for word in df_with_average_agg.index]

In [40]:
# We save the dataframes as csv files and the V-measure/F1 figure as an HTML file
results_dir = "Results/Experiment_2/Curated/AGG"
# exist_ok=True creates the folder only when it is missing, without a separate existence check
os.makedirs(results_dir, exist_ok=True)
df_with_average_agg.to_csv(f"{results_dir}/{model_name}_scores.csv", sep=";", encoding="utf-8")
fig.write_html(f"{results_dir}/{model_name}_scores.html")
# NOTE(review): this file name does not contain the model name, so running the notebook for a
# second model overwrites it - confirm that this is intended before relying on the file.
df_all_in_one_agg.to_csv(f"{results_dir}/all_in_one_scores.csv", sep=";", encoding="utf-8")

# BIC

In [41]:
# Result stores for the BIC clustering, all keyed by target word.
weighted_scores_BIC = {} # our original metric: the unweighted mean of the per-sense scores
overall_scores_BIC = {} # the fraction of correct predictions across all senses
all_in_one_cluster_overall_scores_BIC = {} # the fraction of correct predictions if no clustering would take place
amount_senses = {} # the number of different senses (does not differ)
v_measure_scores_BIC = {} # the v-measure score
rand_scores_BIC = {} # the adjusted rand score
# We also calculate a F1 score in a later code block

curated_dir = f"{model_name}/Experiment_2/Curated/"

# os.listdir gives no ordering guarantee; sorting keeps the printed report and the
# insertion order of the dictionaries above reproducible from run to run.
for filename in sorted(os.listdir(curated_dir)):
    if not filename.endswith(".csv"):
        continue
    df = pd.read_csv(curated_dir + filename, sep=";", encoding="utf-8", header=0)
    # The target word is read from the row labelled 1 of the "source" column
    # (assumes every row of a file holds the same word - TODO confirm)
    target_word = df["source"][1]
    print(target_word)
    # We calculate the number of different senses
    senses = len(df["sense"].unique())
    print("Number of senses:", senses)
    # Fraction of rows flagged in "BIC_default_sub" over the whole file
    percentage_default_BIC = (df["BIC_default_sub"].sum() / len(df))
    # The same fraction, computed separately for each "sense"
    percentage_default_mean_BIC = df.groupby("sense")["BIC_default_sub"].mean()
    # Harmonic mean of the per-sense scores (reusing the series computed above);
    # it drops to 0 as soon as one sense scores 0
    harmonic_mean_BIC = statistics.harmonic_mean(percentage_default_mean_BIC)
    # We want the mean score across all senses, as it does not mean a lot if a program can
    # correctly define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted_BIC = percentage_default_mean_BIC.mean()
    v_measure_BIC = v_measure_score(df["sense"], df["BIC_cluster_sub"])
    rand_BIC = adjusted_rand_score(df["sense"], df["BIC_cluster_sub"])

    print("Score for each", percentage_default_mean_BIC*100)
    print("Overall score", percentage_default_BIC*100)
    print("Weighted score", percentage_weighted_BIC*100)
    print("Harmonic mean", harmonic_mean_BIC*100)
    print("V-measure score", v_measure_BIC*100)
    print("Adjusted Rand score", rand_BIC*100)
    weighted_scores_BIC[target_word] = percentage_weighted_BIC
    overall_scores_BIC[target_word] = percentage_default_BIC
    amount_senses[target_word] = senses
    v_measure_scores_BIC[target_word] = v_measure_BIC
    rand_scores_BIC[target_word] = rand_BIC

    # We also calculate the score if no clustering would take place:
    # every instance then receives the most frequent sense
    default = df["sense"].value_counts().idxmax()
    # The overall score is the share of this sense in the total dataframe
    all_in_one_cluster_score_BIC = (df["sense"] == default).sum() / len(df)
    all_in_one_cluster_overall_scores_BIC[target_word] = all_in_one_cluster_score_BIC
    print("All-in-one score", all_in_one_cluster_score_BIC*100, "\n\n")

Avocat
Number of senses: 2
Score for each sense
avocado    25.925926
lawyer     57.142857
Name: BIC_default_sub, dtype: float64
Overall score 46.98795180722892
Weighted score 41.53439153439153
Harmonic mean 35.6687898089172
V-measure score 28.619700251607465
Adjusted Rand score 9.486532200587083
All-in-one score 67.46987951807229 


Bien
Number of senses: 3
Score for each sense
good          0.000000
property     93.877551
wellbeing    10.000000
Name: BIC_default_sub, dtype: float64
Overall score 48.97959183673469
Weighted score 34.625850340136054
Harmonic mean 0
V-measure score 4.140460145469759
Adjusted Rand score -0.08028424740723052
All-in-one score 50.0 


Bureau
Number of senses: 3
Score for each sense
cabinet     0.000000
desk       25.714286
office     98.148148
Name: BIC_default_sub, dtype: float64
Overall score 60.19417475728155
Weighted score 41.28747795414462
Harmonic mean 0
V-measure score 16.30196741030239
Adjusted Rand score 6.190183692049923
All-in-one score 52.42718446

In [42]:
# For every target word we keep two dictionaries:
#   all_scores_BIC[word][sense]  -> {"precision", "recall", "F1"} of that sense
#   all_lengths_BIC[word][sense] -> number of instances of that sense ("total" = all rows)
all_scores_BIC = {}
all_lengths_BIC = {}
for filename in os.listdir(f"{model_name}/Experiment_2/Curated/"):
    if not filename.endswith(".csv"):
        continue
    df_F1 = pd.read_csv(f"{model_name}/Experiment_2/Curated/" + filename, sep=";", encoding="utf-8", header=0)

    target_word = df_F1["source"][1]
    print("\n\n", target_word)
    total_df_length = len(df_F1)
    # Number of different senses in this file
    senses = len(df_F1["sense"].unique())
    print("Number of senses:", senses)
    target_word_dictionary = {}
    target_word_lengths = {"total": total_df_length}
    for sense in df_F1["sense"].unique():
        sense_df = df_F1[df_F1["sense"] == sense]
        sense_len = len(sense_df)

        # Instances of this sense that are flagged in "BIC_default_sub"
        correct_df = sense_df[sense_df["BIC_default_sub"] == True]
        correct_len = len(correct_df)
        # The cluster of the first flagged instance is taken as the cluster of this sense;
        # without any flagged instance a placeholder is used that matches no cluster label
        correct_cluster = correct_df["BIC_cluster_sub"].iloc[0] if correct_len > 0 else "No cluster"

        cluster_len = len(df_F1[df_F1["BIC_cluster_sub"] == correct_cluster])
        print("\nsense:", sense)
        print("instances of sense:", sense_len)
        print("amount of correct instances:", correct_len)
        print("correct cluster number:", correct_cluster)
        print("total amount of points in biggest cluster:", cluster_len)

        # precision: correct instances / size of the cluster they belong to
        if cluster_len == 0:
            precision = 0
            print("precision zero")
        else:
            precision = correct_len / cluster_len
        # recall: correct instances / all instances of the sense
        recall = correct_len / sense_len
        print("precision:", precision)
        print("recall", recall)

        # Per-sense F1 is the harmonic mean of precision and recall
        F1 = statistics.harmonic_mean((precision, recall))
        print("F1", F1)

        scores = {"precision": precision, "recall": recall, "F1": F1}
        target_word_dictionary[sense] = scores
        target_word_lengths[sense] = sense_len
    all_scores_BIC[target_word] = target_word_dictionary
    all_lengths_BIC[target_word] = target_word_lengths
            



 Avocat
Number of senses: 2

sense: lawyer
instances of sense: 56
amount of correct instances: 32
correct cluster number: 3
total amount of points in biggest cluster: 52
precision: 0.6153846153846154
recall 0.5714285714285714
F1 0.5925925925925926

sense: avocado
instances of sense: 27
amount of correct instances: 7
correct cluster number: 1
total amount of points in biggest cluster: 7
precision: 1.0
recall 0.25925925925925924
F1 0.4117647058823529


 Bien
Number of senses: 3

sense: property
instances of sense: 49
amount of correct instances: 46
correct cluster number: 1
total amount of points in biggest cluster: 93
precision: 0.4946236559139785
recall 0.9387755102040817
F1 0.647887323943662

sense: wellbeing
instances of sense: 20
amount of correct instances: 2
correct cluster number: 2
total amount of points in biggest cluster: 4
precision: 0.5
recall 0.1
F1 0.16666666666666666

sense: good
instances of sense: 29
amount of correct instances: 0
correct cluster number: No cluster
to

In [43]:
# Word-level F1 for the BIC clustering.
# The F1 of a word is the weighted average of the F1 scores of its gold-standard senses, where the
# weight of a sense is its share of the instances (instances of the sense / all instances of the word).
# F1_scores_BIC maps each target word to that weighted F1.
F1_scores_BIC = {}
for target_word, scores_per_sense in all_scores_BIC.items():
    F1 = 0
    weight = 0  # running sum of the weights, printed as a sanity check (should end at 1.0)
    print("\n\n", target_word)
    for sense, sense_metrics in scores_per_sense.items():
        score = sense_metrics["F1"]
        print(sense)
        print("score:", score)

        sense_lengths = all_lengths_BIC[target_word]
        len_sense = sense_lengths[sense]
        print("length of sense:", len_sense)

        len_total = sense_lengths["total"]
        print("total length:", len_total)

        added_weight = len_sense / len_total
        weight = weight + added_weight
        print("added weight:", added_weight, "for a total of", weight)

        F1 = F1 + score * added_weight
        # The running total is printed as a percentage; the stored value stays a fraction
        print("F1 score after calc:", F1 * 100)
    F1_scores_BIC[target_word] = F1





 Avocat
lawyer
score: 0.5925925925925926
length of sense: 56
total length: 83
added weight: 0.6746987951807228 for a total of 0.6746987951807228
F1 score after calc: 39.982150825524315
avocado
score: 0.4117647058823529
length of sense: 27
total length: 83
added weight: 0.3253012048192771 for a total of 1.0
F1 score after calc: 53.37690631808278


 Bien
property
score: 0.647887323943662
length of sense: 49
total length: 98
added weight: 0.5 for a total of 0.5
F1 score after calc: 32.3943661971831
wellbeing
score: 0.16666666666666666
length of sense: 20
total length: 98
added weight: 0.20408163265306123 for a total of 0.7040816326530612
F1 score after calc: 35.795726741400784
good
score: 0
length of sense: 29
total length: 98
added weight: 0.29591836734693877 for a total of 1.0
F1 score after calc: 35.795726741400784


 Bureau
desk
score: 0.40909090909090906
length of sense: 35
total length: 103
added weight: 0.33980582524271846 for a total of 0.33980582524271846
F1 score after calc: 1

In [44]:
# Baseline: the F1 score every word would obtain if no clustering took place.
# Each dataframe is loaded again, all rows are put into one single cluster (0), and only the most
# frequent sense counts as the "default" (i.e. correct) category.
# For every sense we keep its precision, recall and F1 in a nested dictionary, plus the sense sizes.
all_scores_no_cluster = {}
all_lengths_no_cluster = {}
# In all further experiments, we keep working with the same dataset as in Experiment 1.
# NOTE(review): the other cells of this part read from Experiment_2/Curated - confirm that using the
# Experiment_1 files for this baseline is intended.
curated_dir = f"{model_name}/Experiment_1/Curated"
for filename in os.listdir(curated_dir):
    if not filename.endswith(".csv"):
        continue
    df_F1 = pd.read_csv(curated_dir + "/" + filename, sep=";", encoding="utf-8", header=0)

    target_word = df_F1["source"][1]
    # Undo clustering: one cluster for everything, majority sense is the only "correct" one
    df_F1["cluster"] = 0
    most_freq_sense = df_F1["sense"].value_counts().idxmax()
    df_F1["default"] = [int(sense == most_freq_sense) for sense in df_F1["sense"]]
    print("\n\n", target_word)
    total_df_length = len(df_F1)
    # Number of distinct gold-standard senses
    senses = len(df_F1["sense"].unique())
    print("Number of senses:", senses)
    target_word_dictionary = {}
    target_word_lengths = {"total": total_df_length}
    for sense in df_F1["sense"].unique():
        sense_df = df_F1[df_F1["sense"] == sense]
        sense_len = len(sense_df)

        # Instances of this sense that are flagged as "default"
        correct_df = sense_df[sense_df["default"] == True]
        correct_len = len(correct_df)
        # Without any correct instance there is no cluster to attribute to this sense
        correct_cluster = correct_df["cluster"].iloc[0] if correct_len > 0 else "No cluster"

        cluster_len = len(df_F1[df_F1["cluster"] == correct_cluster])
        print("\nsense:", sense)
        print("instances of sense:", sense_len)
        print("amount of correct instances:", correct_len)
        print("correct cluster number:", correct_cluster)
        print("total amount of points in biggest cluster:", cluster_len)

        # precision: correct instances in the cluster / total cluster size
        precision = 0
        if cluster_len > 0:
            precision = correct_len / cluster_len
        else:
            print("precision zero")
        # recall: correct instances / all instances of the sense
        recall = correct_len / sense_len
        print("precision:", precision)
        print("recall", recall)

        F1 = statistics.harmonic_mean((precision, recall))
        print("F1", F1)

        scores = {"precision": precision, "recall": recall, "F1": F1}
        target_word_dictionary[sense] = scores
        target_word_lengths[sense] = sense_len
    all_scores_no_cluster[target_word] = target_word_dictionary
    all_lengths_no_cluster[target_word] = target_word_lengths

# Word-level baseline F1: weighted average of the per-sense F1 scores, each sense weighted by its
# share of the instances. F1_scores_no_cluster maps each target word to that value.
F1_scores_no_cluster = {}
for target_word, scores_per_sense in all_scores_no_cluster.items():
    F1 = 0
    weight = 0  # running sum of the weights, printed as a sanity check
    print("\n\n", target_word)
    for sense, sense_metrics in scores_per_sense.items():
        score = sense_metrics["F1"]
        print(sense)
        print("score:", score)

        sense_lengths = all_lengths_no_cluster[target_word]
        len_sense = sense_lengths[sense]
        print("length of sense:", len_sense)

        len_total = sense_lengths["total"]
        print("total length:", len_total)

        added_weight = len_sense / len_total
        weight = weight + added_weight
        print("added weight:", added_weight, "for a total of", weight)

        F1 = F1 + score * added_weight
        print("F1 score after calc:", F1)
    F1_scores_no_cluster[target_word] = F1





 Avocat
Number of senses: 2

sense: lawyer
instances of sense: 56
amount of correct instances: 56
correct cluster number: 0
total amount of points in biggest cluster: 84
precision: 0.6666666666666666
recall 1.0
F1 0.8

sense: avocado
instances of sense: 28
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precision: 0
recall 0.0
F1 0


 Bien
Number of senses: 3

sense: property
instances of sense: 49
amount of correct instances: 49
correct cluster number: 0
total amount of points in biggest cluster: 98
precision: 0.5
recall 1.0
F1 0.6666666666666666

sense: wellbeing
instances of sense: 20
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precision: 0
recall 0.0
F1 0

sense: good
instances of sense: 29
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precisio

In [45]:
# Target words grouped by part of speech
Nouns = ["Avocat", "Bien", "Bureau", "Faculté", "Glace", "Souris", "Tour", "Vol"]
Verbs = ["Filer", "Supporter", "Tirer"]


def _group_mean(scores, words):
    """Return the mean of the per-word values in `scores` over the given `words`."""
    return np.mean([scores[word] for word in words])


# Averages over the nouns, one per metric
nouns_average_weighted_BIC = _group_mean(weighted_scores_BIC, Nouns)
nouns_average_overall_BIC = _group_mean(overall_scores_BIC, Nouns)
nouns_average_all_in_one_BIC = _group_mean(all_in_one_cluster_overall_scores_BIC, Nouns)
nouns_average_v_measure_BIC = _group_mean(v_measure_scores_BIC, Nouns)
nouns_average_rand_BIC = _group_mean(rand_scores_BIC, Nouns)
nouns_average_F1_BIC = _group_mean(F1_scores_BIC, Nouns)
nouns_average_F1_no_cluster = _group_mean(F1_scores_no_cluster, Nouns)

# Averages over the verbs, one per metric
verbs_average_weighted_BIC = _group_mean(weighted_scores_BIC, Verbs)
verbs_average_overall_BIC = _group_mean(overall_scores_BIC, Verbs)
verbs_average_all_in_one_BIC = _group_mean(all_in_one_cluster_overall_scores_BIC, Verbs)
verbs_average_v_measure_BIC = _group_mean(v_measure_scores_BIC, Verbs)
verbs_average_rand_BIC = _group_mean(rand_scores_BIC, Verbs)
verbs_average_F1_BIC = _group_mean(F1_scores_BIC, Verbs)
verbs_average_F1_no_cluster = _group_mean(F1_scores_no_cluster, Verbs)

# Report: nouns first, then verbs
print("Nouns average of weighted scores:", nouns_average_weighted_BIC)
print("Nouns average of overall scores:", nouns_average_overall_BIC)
print("Nouns average of all in one cluster scores:", nouns_average_all_in_one_BIC)
print("Nouns average of v-measure cluster score:", nouns_average_v_measure_BIC)
print("Nouns average of rand score:", nouns_average_rand_BIC)
print("Nouns average of F1 scores:", nouns_average_F1_BIC)
print("Nouns average of F1 scores when no clustering takes place:", nouns_average_F1_no_cluster)
print("\n\n")
print("Verbs average of weighted scores:", verbs_average_weighted_BIC)
print("Verbs average of overall scores:", verbs_average_overall_BIC)
print("Verbs average of all in one cluster scores:", verbs_average_all_in_one_BIC)
print("Verbs average of v-measure cluster score:", verbs_average_v_measure_BIC)
print("Verbs average of rand score:", verbs_average_rand_BIC)
print("Verbs average of F1 scores:", verbs_average_F1_BIC)
print("Verbs average of F1 scores when no clustering takes place:", verbs_average_F1_no_cluster)


Nouns average of weighted scores: 0.38176513132096457
Nouns average of overall scores: 0.5032613633766473
Nouns average of all in one cluster scores: 0.5147473294527403
Nouns average of v-measure cluster score: 0.15203072593166256
Nouns average of rand score: 0.07069385580387882
Nouns average of F1 scores: 0.42146940886226164
Nouns average of F1 scores when no clustering takes place: 0.3574206110459456



Verbs average of weighted scores: 0.5746393903482027
Verbs average of overall scores: 0.6701388888888888
Verbs average of all in one cluster scores: 0.5670405982905983
Verbs average of v-measure cluster score: 0.30323259201636793
Verbs average of rand score: 0.22137464429113674
Verbs average of F1 scores: 0.6240837807293503
Verbs average of F1 scores when no clustering takes place: 0.41897442708157473


In [46]:
# We create a dataframe from the dictionaries to eventually save the results to a csv file.
# The index (one row per target word) comes from weighted_scores_BIC. Every other metric is looked
# up by word instead of being assigned positionally from dict.values(): a positional assignment
# silently attaches scores to the wrong word if the dictionaries were filled in a different order.
df_BIC = pd.DataFrame.from_dict(weighted_scores_BIC, orient="index", columns=["Weighted"])
df_BIC["Overall"] = [overall_scores_BIC[word] for word in df_BIC.index]
df_BIC["V_measure"] = [v_measure_scores_BIC[word] for word in df_BIC.index]
df_BIC["Rand"] = [rand_scores_BIC[word] for word in df_BIC.index]
df_BIC["F1"] = [F1_scores_BIC[word] for word in df_BIC.index]

df_BIC["Type"] = ["Noun" if word in Nouns else "Verb" for word in df_BIC.index]
# NOTE(review): amount_senses is still aligned by position because its keys are not visible in this
# cell - confirm they are the target words and switch to a keyed lookup as well.
df_BIC["Amount_senses"] = list(amount_senses.values())

# We also create a separate table for the all in one cluster (no clustering) scores
df_all_in_one_BIC = pd.DataFrame.from_dict(all_in_one_cluster_overall_scores_BIC, orient="index", columns=["Overall_no_cluster"])
df_all_in_one_BIC["F1_no_cluster"] = [F1_scores_no_cluster[word] for word in df_all_in_one_BIC.index]
df_all_in_one_BIC["Type"] = ["Noun" if word in Nouns else "Verb" for word in df_all_in_one_BIC.index]

# We add the average rows to a copy, so that df_BIC itself keeps only the per-word rows.
# The value order follows the column order: Weighted, Overall, V_measure, Rand, F1, Type, Amount_senses.
df_with_average_BIC = df_BIC.copy()
df_with_average_BIC.loc["Average_nouns"] = [nouns_average_weighted_BIC, nouns_average_overall_BIC, nouns_average_v_measure_BIC, nouns_average_rand_BIC, nouns_average_F1_BIC, "", ""]
df_with_average_BIC.loc["Average_verbs"] = [verbs_average_weighted_BIC, verbs_average_overall_BIC, verbs_average_v_measure_BIC, verbs_average_rand_BIC, verbs_average_F1_BIC, "", ""]
# The global average is taken from df_BIC (per-word rows only), so the average rows are not counted
df_with_average_BIC.loc["Global_average"] = [df_BIC["Weighted"].mean(), df_BIC["Overall"].mean(), df_BIC["V_measure"].mean(), df_BIC["Rand"].mean(), df_BIC["F1"].mean(), "", ""]


# As we don't make a copy here, the global mean has to be calculated and added before the other
# average rows, otherwise those rows would be included in the mean!
df_all_in_one_BIC.loc["Global_average"] = [df_all_in_one_BIC["Overall_no_cluster"].mean(), df_all_in_one_BIC["F1_no_cluster"].mean(), "",]
df_all_in_one_BIC.loc["Average_nouns"] = [nouns_average_all_in_one_BIC, nouns_average_F1_no_cluster,""]
df_all_in_one_BIC.loc["Average_verbs"] = [verbs_average_all_in_one_BIC, verbs_average_F1_no_cluster,""]


In [47]:
# Weighted vs. overall score per word.
# (The Dutch part of the title reads: "in a way that does not actually say much".)
# The figure is built once; an identical second construction after show() was redundant and has been removed.
fig_zegt_weinig = px.scatter(df_BIC, x="Weighted", y="Overall", text=df_BIC.index, color="Type", title="Scores for each word, op een manier die weinig zegt eigenlijk",template="plotly_white")
fig_zegt_weinig.show()
# V-measure vs. F1 per word, coloured by word type
fig = px.scatter(df_BIC, x="V_measure", y="F1", text=df_BIC.index, color="Type", title="Scores for each word",template="plotly_white")
fig.show()

In [48]:
# How well does the BIC-based clustering estimate the number of senses?
# Words are sorted into three buckets according to |clusters - senses|.
correct = []
close = [] # One cluster off, often the result of different granularity. Is it ok to report like this or do we have to be more strict?
incorrect = []
# Signed estimation error per word: negative = too few clusters, positive = too many, 0 = exact
differences_BIC = {}
experiment_dir = f"{model_name}/Experiment_2/Curated/"
for filename in os.listdir(experiment_dir):
    if not filename.endswith(".csv"):
        continue
    df = pd.read_csv(experiment_dir + filename, sep=";", encoding="utf-8", header=0)
    target_word = df["source"][1]
    print(target_word)
    # Distinct gold-standard senses and distinct predicted clusters (missing values count as a value)
    senses = df["sense"].nunique(dropna=False)
    clusters = df["BIC_cluster_sub"].nunique(dropna=False)
    print("Number of senses:", senses)
    print("Number of clusters:", clusters)

    difference = clusters - senses
    if difference == 0:
        print("Correct number of clusters")
        correct.append(target_word)
    else:
        off_by = abs(difference)
        print(off_by, "too few clusters" if difference < 0 else "too many clusters")
        bucket = close if off_by == 1 else incorrect
        bucket.append(target_word)

    differences_BIC[target_word] = difference

print("Correct:", correct)
print("Close:", close)
print("Incorrect:", incorrect)
# Share of words whose cluster count is exact or off by one
approximately_right = len(correct) + len(close)
print("Percentage approach:", approximately_right / (approximately_right + len(incorrect)))

Avocat
Number of senses: 2
Number of clusters: 3
1 too many clusters
Bien
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Bureau
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Faculté
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Filer
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Glace
Number of senses: 5
Number of clusters: 3
2 too few clusters
Souris
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Supporter
Number of senses: 3
Number of clusters: 3
Correct number of clusters
Tirer
Number of senses: 2
Number of clusters: 3
1 too many clusters
Tour
Number of senses: 5
Number of clusters: 3
2 too few clusters
Vol
Number of senses: 2
Number of clusters: 3
1 too many clusters
Correct: ['Bien', 'Bureau', 'Faculté', 'Filer', 'Souris', 'Supporter']
Close: ['Avocat', 'Tirer', 'Vol']
Incorrect: ['Glace', 'Tour']
Percentage approach: 0.8181818181818182


In [49]:
# Attach the signed cluster-count error to every word; rows without an entry in differences_BIC
# get an empty string instead.
df_with_average_BIC["Cluster_Estimation"] = [differences_BIC.get(word, "") for word in df_with_average_BIC.index]

In [50]:
# We save the BIC result tables and the V-measure/F1 scatter plot.
# exist_ok=True creates the folder when needed and is a no-op when it already exists, which avoids
# the separate "check, then create" step.
os.makedirs("Results/Experiment_2/Curated/BIC/", exist_ok=True)
df_with_average_BIC.to_csv(f"Results/Experiment_2/Curated/BIC/{model_name}_scores.csv", sep=";", encoding="utf-8")
fig.write_html(f"Results/Experiment_2/Curated/BIC/{model_name}_scores.html")
# NOTE(review): unlike the two files above, this file name does not contain the model name, so it
# is overwritten each time the notebook is run for another model - confirm this is intended.
df_all_in_one_BIC.to_csv("Results/Experiment_2/Curated/BIC/all_in_one_scores.csv", sep=";", encoding="utf-8")

# Comparing the Models

In [59]:
# Clustering method whose saved score files are compared below; the value selects the
# Results/Experiment_2/Curated/<Method>/ folder that is read from and written to.
Method = "AGG" # "AGG" or "BIC"

In [60]:
# Load the saved score tables of both models from the "Results" folder and tag every row with the
# model it belongs to.
results_dir = f"Results/Experiment_2/Curated/{Method}"
df_camembert = pd.read_csv(f"{results_dir}/Camembert_scores.csv", sep=";", encoding="utf-8", index_col=0).assign(Model="Camembert")
df_flaubert = pd.read_csv(f"{results_dir}/Flaubert_scores.csv", sep=";", encoding="utf-8", index_col=0).assign(Model="Flaubert")

# Stack both tables; the words stay in the index, so each word appears once per model
df = pd.concat([df_camembert, df_flaubert])

In [61]:
# Split the combined table into one narrow dataframe per metric, each keeping the "Model" column
df_weighted = df.loc[:, ["Weighted", "Model"]]
df_overall = df.loc[:, ["Overall", "Model"]]
df_v_measure = df.loc[:, ["V_measure", "Model"]]
df_rand = df.loc[:, ["Rand", "Model"]]
df_f1 = df.loc[:, ["F1", "Model"]]

In [62]:
# We transform the dataframes:
# - We make two columns, one for each model
# - We regroup the instances of the same word (index) in one row
# Each table is pivoted straight from the combined dataframe `df` rather than from its own previous
# value, so this cell can be re-run safely: pivoting an already pivoted table would fail because the
# "Model" column no longer exists.
df_weighted = df[["Weighted", "Model"]].pivot(columns="Model")
df_overall = df[["Overall", "Model"]].pivot(columns="Model")
df_v_measure = df[["V_measure", "Model"]].pivot(columns="Model")
df_rand = df[["Rand", "Model"]].pivot(columns="Model")
df_f1 = df[["F1", "Model"]].pivot(columns="Model")

In [63]:
# Write every per-metric comparison table to disk, first as csv and then as excel.
# The key is the file name stem: <stem>_scores.csv / <stem>_scores.xlsx
comparison_tables = {
    "weighted": df_weighted,
    "overall": df_overall,
    "v_measure": df_v_measure,
    "rand": df_rand,
    "F1": df_f1,
}

for stem, table in comparison_tables.items():
    table.to_csv(f"Results/Experiment_2/Curated/{Method}/{stem}_scores.csv", sep=";", encoding="utf-8")

for stem, table in comparison_tables.items():
    table.to_excel(f"Results/Experiment_2/Curated/{Method}/{stem}_scores.xlsx")


In [64]:
def _bold_row_maxima(xlsx_path, first_data_row=4, first_col=2, last_col=3):
    """Put the highest value of each data row of an excel file in bold and save the file in place.

    Parameters:
        xlsx_path: path of the workbook; its active sheet is modified and the file is overwritten.
        first_data_row: first row (1-based) that is compared; the rows above it are left untouched.
        first_col, last_col: inclusive range of columns (1-based) that are compared within each row.

    Empty cells are ignored, so a row in which one model has no value does not raise a TypeError
    when the maximum is taken; a row without any value is skipped entirely.
    """
    wb = openpyxl.load_workbook(xlsx_path)
    ws = wb.active
    bold = openpyxl.styles.Font(bold=True)
    for row in ws.iter_rows(min_row=first_data_row, max_row=ws.max_row, min_col=first_col, max_col=last_col):
        values = [cell.value for cell in row if cell.value is not None]
        if not values:
            continue
        max_value = max(values)
        for cell in row:
            # Ties are all highlighted
            if cell.value == max_value:
                cell.font = bold
    wb.save(xlsx_path)


# Using openpyxl, we open each of the saved score workbooks (weighted, overall, v-measure, Rand and
# F1) and put the highest value of each row in bold.
for score_name in ("weighted", "overall", "v_measure", "rand", "F1"):
    _bold_row_maxima(f"Results/Experiment_2/Curated/{Method}/{score_name}_scores.xlsx")

In [65]:
# Save the dataframe `df` as a single semicolon-separated csv file in the folder of the selected Method
df.to_csv(f"Results/Experiment_2/Curated/{Method}/results.csv", sep=";", encoding="utf-8")

In [66]:
def _model_boxplot(score_column, title):
    """Draw one box per model for `score_column` of `df`, hide the legend, show and return the figure."""
    figure = px.box(df, x="Model", y=score_column, color="Model", title=title, template="plotly_white")
    figure.update(layout_showlegend=False)
    figure.show()
    return figure


# One boxplot per metric, comparing the distribution of the scores of both models
fig_weighted = _model_boxplot("Weighted", "Weighted scores for each model")
fig_overall = _model_boxplot("Overall", "Overall scores for each model")
fig_v_measure = _model_boxplot("V_measure", "V measure clustering scores for each model")
fig_rand = _model_boxplot("Rand", "Rand scores for each model")
fig_F1 = _model_boxplot("F1", "F1 scores for each model")

As we are more familiar with data visualisation in R, we will use the saved files to create the visualisations in R.

