# Experiment 2: Clustering op de dataset verrijkt met échte automatische downloads

In [5]:
import torch
import pandas as pd
import os
import re
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import v_measure_score, adjusted_rand_score
import numpy as np
import statistics
import openpyxl

In [2]:
#from Experiment import run
#run("CamemBERT")
#run("FlauBERT")

The second part of this notebook only works if the first part has been run with both 'Flaubert' and 'Camembert' as model names.

In [18]:
# Name of the model whose clustering results are analysed in the first part of the notebook.
# It is used as the top-level directory of the input CSV files and as the prefix of the saved score files.
model_name = "Camembert"

# Question for this notebook
I calculate a "weighted percentage" averaging the percentage of correct predictions for each sense.
The "overall score" is the total percentage of correct predictions across all senses.
As the output of this code shows, the "harmonic mean" of the senses often amounts to 0 due to the presence of 0s.

Is the "weighted percentage" a good metric to evaluate the performance of the model?

**We also add an overall "all in one cluster" score, which is the score if no clustering would take place.**


In [19]:
# Per-word results; every dictionary below is keyed by the target word
weighted_scores = {}  # our original metric: the unweighted mean of the per-sense scores
overall_scores = {}  # the percentage of correct predictions across all senses
all_in_one_cluster_overall_scores = {}  # the percentage of correct predictions if no clustering would take place
amount_senses = {}  # the amount of different senses
v_measure_scores = {}  # the v-measure score
rand_scores = {}  # the adjusted rand score
# We also calculate a F1 score in a later code block

input_dir = f"{model_name}/Experiment_1/Automatic/"
# os.listdir returns the entries in arbitrary order; sorting keeps the printed output
# and the row order of the result tables reproducible between runs
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".csv"):
        continue
    df = pd.read_csv(input_dir + filename, sep=";", encoding="utf-8", header=0)
    # Positional access to the first row, so that a file with a single row also works.
    # Assumes the "source" column holds the same target word on every row of a file - TODO confirm
    target_word = df["source"].iloc[0]
    print(target_word)
    # We calculate the amount of different senses
    senses = len(df["sense"].unique())
    print("Number of senses:", senses)
    # Overall score: the share of rows flagged in the "default" column
    percentage_default = df["default"].sum() / len(df)
    # The same share, calculated separately for each "sense" (computed once and reused below)
    percentage_default_mean = df.groupby("sense")["default"].mean()
    # Harmonic mean of the per-sense scores; it is 0 as soon as one sense scores 0
    harmonic_mean = statistics.harmonic_mean(percentage_default_mean)
    # We want the mean score across all senses, as it does not mean a lot if a program can correctly
    # define one big cluster containing most of the data and fail at all other senses.
    percentage_weighted = percentage_default_mean.mean()
    v_measure = v_measure_score(df["sense"], df["cluster"])
    rand = adjusted_rand_score(df["sense"], df["cluster"])

    print("Score for each", percentage_default_mean*100)
    print("Overall score", percentage_default*100)
    print("Weighted score", percentage_weighted*100)
    print("Harmonic mean", harmonic_mean*100)
    print("V-measure score", v_measure)
    print("Adjusted Rand score", rand)
    weighted_scores[target_word] = percentage_weighted
    overall_scores[target_word] = percentage_default
    amount_senses[target_word] = senses
    v_measure_scores[target_word] = v_measure
    rand_scores[target_word] = rand

    # Score if no clustering would take place: every row is assigned the most frequent sense,
    # so the score is the share of that sense in the whole dataframe
    default = df["sense"].value_counts().idxmax()
    all_in_one_cluster_score = (df["sense"] == default).sum() / len(df)
    all_in_one_cluster_overall_scores[target_word] = all_in_one_cluster_score
    print("All-in-one score", all_in_one_cluster_score*100, "\n\n")

Avocat
Number of senses: 2
Score for each sense
avocado    100.000000
lawyer      60.714286
Name: default, dtype: float64
Overall score 73.80952380952381
Weighted score 80.35714285714286
Harmonic mean 75.55555555555556
V-measure score 0.7402614432011363
Adjusted Rand score 0.5845667942506892
All-in-one score 66.66666666666666 


Bien
Number of senses: 3
Score for each sense
good         68.965517
property     77.551020
wellbeing    95.000000
Name: default, dtype: float64
Overall score 78.57142857142857
Weighted score 80.50551254984754
Harmonic mean 79.1117279666898
V-measure score 0.645563121172742
Adjusted Rand score 0.6046510838359593
All-in-one score 50.0 


Bureau
Number of senses: 3
Score for each sense
cabinet    57.142857
desk       74.285714
office     33.333333
Name: default, dtype: float64
Overall score 50.48543689320388
Weighted score 54.92063492063492
Harmonic mean 49.21135646687697
V-measure score 0.26754571539523003
Adjusted Rand score 0.15068512891001715
All-in-one score

# Calculating true F1 score

To have a comparable measure of cluster quality, we will calculate F1 scores.
An F1 score combines a precision score and a recall score and is calculated by taking their harmonic mean.

Precision is the amount of correct instances of a sense in its biggest cluster, divided by the number of points in that cluster
Recall is the amount of correct instances of a sense in its biggest cluster, divided by the number of points in the sense

we save these values in a dictionary of dictionaries for each target word, which in turn contain precision and recall scores for each sense


Using the F1 function from scikit-learn's metrics module does not work here (it gives values of 0 where it should not), so we will calculate the score ourselves.

In [20]:
# We create a dictionary of dictionaries to save, for each sense, the ratings of its precision and recall
all_scores = {}   # target word -> sense -> {"precision": ..., "recall": ..., "F1": ...}
all_lengths = {}  # target word -> {"total": number of rows, <sense>: number of rows of that sense}
input_dir = f"{model_name}/Experiment_1/Automatic/"
# os.listdir returns the entries in arbitrary order; sorting keeps the output reproducible
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".csv"):
        continue
    df_F1 = pd.read_csv(input_dir + filename, sep=";", encoding="utf-8", header=0)

    # Positional access to the first row, so that a file with a single row also works.
    # Assumes the "source" column holds the same target word on every row of a file - TODO confirm
    target_word = df_F1["source"].iloc[0]
    print("\n\n", target_word)
    total_df_length = len(df_F1)
    # We calculate the amount of different senses
    sense_labels = df_F1["sense"].unique()
    senses = len(sense_labels)
    print("Number of senses:", senses)
    target_word_dictionary = {}
    target_word_lengths = {"total": total_df_length}
    for sense in sense_labels:
        sense_df = df_F1[df_F1["sense"] == sense]
        sense_len = len(sense_df)

        # Rows of this sense that are flagged as "default" count as its correct instances
        correct_df = sense_df[sense_df["default"] == True]
        correct_len = len(correct_df)
        if correct_len > 0:
            # The cluster of the first correct row is taken as the cluster of this sense
            # NOTE(review): assumes all "default" rows of one sense share a cluster - confirm
            correct_cluster = correct_df["cluster"].iloc[0]
            cluster_len = int((df_F1["cluster"] == correct_cluster).sum())
        else:
            # No correct instance: there is no cluster to measure, so its size is 0
            # (set directly instead of comparing the placeholder string with the cluster column)
            correct_cluster = "No cluster"
            cluster_len = 0

        print("\nsense:", sense)
        print("instances of sense:", sense_len)
        print("amount of correct instances:", correct_len)
        print("correct cluster number:", correct_cluster)
        print("total amount of points in biggest cluster:", cluster_len)

        # Precision: number of correct instances in the cluster / total cluster size
        if cluster_len > 0:
            precision = correct_len / cluster_len
        else:
            precision = 0
            print("precision zero")
        # Recall: number of correct instances / number of instances of the sense
        recall = correct_len / sense_len
        print("precision:", precision)
        print("recall", recall)

        # F1 is the harmonic mean of precision and recall
        F1 = statistics.harmonic_mean((precision, recall))
        print("F1", F1)

        target_word_dictionary[sense] = {"precision": precision, "recall": recall, "F1": F1}
        target_word_lengths[sense] = sense_len
    all_scores[target_word] = target_word_dictionary
    all_lengths[target_word] = target_word_lengths
            



 Avocat
Number of senses: 2

sense: lawyer
instances of sense: 56
amount of correct instances: 34
correct cluster number: 2
total amount of points in biggest cluster: 34
precision: 1.0
recall 0.6071428571428571
F1 0.7555555555555555

sense: avocado
instances of sense: 28
amount of correct instances: 28
correct cluster number: 1
total amount of points in biggest cluster: 28
precision: 1.0
recall 1.0
F1 1.0


 Bien
Number of senses: 3

sense: property
instances of sense: 49
amount of correct instances: 38
correct cluster number: 2
total amount of points in biggest cluster: 38
precision: 1.0
recall 0.7755102040816326
F1 0.8735632183908046

sense: wellbeing
instances of sense: 20
amount of correct instances: 19
correct cluster number: 4
total amount of points in biggest cluster: 28
precision: 0.6785714285714286
recall 0.95
F1 0.7916666666666667

sense: good
instances of sense: 29
amount of correct instances: 20
correct cluster number: 1
total amount of points in biggest cluster: 20
preci

In [21]:
# Total F1 per target word: the F1 scores of its gold standard senses are averaged,
# each sense being weighted by its share of the word's instances
# (number of instances of the sense / total number of instances).
# The result is stored in F1_scores, which maps each target word to its weighted F1 score.
F1_scores = {}
for target_word, sense_scores in all_scores.items():
    F1 = 0
    weight = 0  # running sum of the weights, only printed as a sanity check
    print("\n\n", target_word)
    for sense, metrics in sense_scores.items():
        score = metrics["F1"]
        print(sense)
        print("score:", score)

        len_sense = all_lengths[target_word][sense]
        print("length of sense:", len_sense)

        len_total = all_lengths[target_word]["total"]
        print("total length:", len_total)

        added_weight = len_sense / len_total
        weight = weight + added_weight
        print("added weight:", added_weight, "for a total of", weight)

        F1 = F1 + score * added_weight
        print("F1 score after calc:", F1)
    F1_scores[target_word] = F1





 Avocat
lawyer
score: 0.7555555555555555
length of sense: 56
total length: 84
added weight: 0.6666666666666666 for a total of 0.6666666666666666
F1 score after calc: 0.5037037037037037
avocado
score: 1.0
length of sense: 28
total length: 84
added weight: 0.3333333333333333 for a total of 1.0
F1 score after calc: 0.837037037037037


 Bien
property
score: 0.8735632183908046
length of sense: 49
total length: 98
added weight: 0.5 for a total of 0.5
F1 score after calc: 0.4367816091954023
wellbeing
score: 0.7916666666666667
length of sense: 20
total length: 98
added weight: 0.20408163265306123 for a total of 0.7040816326530612
F1 score after calc: 0.5983462350457425
good
score: 0.8163265306122449
length of sense: 29
total length: 98
added weight: 0.29591836734693877 for a total of 1.0
F1 score after calc: 0.8399122492065088


 Bureau
desk
score: 0.6753246753246753
length of sense: 35
total length: 103
added weight: 0.33980582524271846 for a total of 0.33980582524271846
F1 score after calc

In [22]:
# Baseline: the F1 score of each word when no clustering takes place.
# Every file is read again, all rows are put into one single cluster (0) and only the
# most frequent sense is flagged as "default" (i.e. correct).
# Precision, recall and F1 are stored per sense in a dictionary of dictionaries.
all_scores_no_cluster = {}
all_lengths_no_cluster = {}
csv_filenames = [name for name in os.listdir(f"{model_name}/Experiment_1/Automatic/") if name.endswith(".csv")]
for filename in csv_filenames:
    df_F1 = pd.read_csv(f"{model_name}/Experiment_1/Automatic/" + filename, sep=";", encoding="utf-8", header=0)
    target_word = df_F1["source"][1]

    # Undo the clustering: one cluster for everything, most frequent sense is the correct one
    df_F1["cluster"] = 0
    most_freq_sense = df_F1["sense"].value_counts().idxmax()
    df_F1["default"] = [int(label == most_freq_sense) for label in df_F1["sense"]]

    print("\n\n", target_word)
    total_df_length = len(df_F1)
    # The amount of different senses
    sense_labels = df_F1["sense"].unique()
    senses = len(sense_labels)
    print("Number of senses:", senses)

    target_word_dictionary = {}
    target_word_lengths = {"total": total_df_length}
    for sense in sense_labels:
        sense_df = df_F1[df_F1["sense"] == sense]
        sense_len = len(sense_df)

        # Rows of this sense flagged as "default" are its correct instances
        correct_df = sense_df[sense_df["default"] == True]
        correct_len = len(correct_df)
        # Without any correct instance a placeholder is used, which matches no cluster
        correct_cluster = correct_df["cluster"].iloc[0] if correct_len > 0 else "No cluster"
        cluster_len = len(df_F1[df_F1["cluster"] == correct_cluster])

        print("\nsense:", sense)
        print("instances of sense:", sense_len)
        print("amount of correct instances:", correct_len)
        print("correct cluster number:", correct_cluster)
        print("total amount of points in biggest cluster:", cluster_len)

        # Precision: correct instances / cluster size (0 when there is no cluster)
        if cluster_len > 0:
            precision = correct_len / cluster_len
        else:
            precision = 0
            print("precision zero")
        # Recall: correct instances / instances of the sense
        recall = correct_len / sense_len
        print("precision:", precision)
        print("recall", recall)

        F1 = statistics.harmonic_mean((precision, recall))
        print("F1", F1)

        target_word_dictionary[sense] = {"precision": precision, "recall": recall, "F1": F1}
        target_word_lengths[sense] = sense_len
    all_scores_no_cluster[target_word] = target_word_dictionary
    all_lengths_no_cluster[target_word] = target_word_lengths

# Total F1 per target word: the per-sense F1 scores averaged with each sense weighted by
# its share of the word's instances; stored in F1_scores_no_cluster (target word -> F1)
F1_scores_no_cluster = {}
for target_word, sense_scores in all_scores_no_cluster.items():
    F1 = 0
    weight = 0  # running sum of the weights, only printed as a sanity check
    print("\n\n", target_word)
    for sense, metrics in sense_scores.items():
        score = metrics["F1"]
        print(sense)
        print("score:", score)

        len_sense = all_lengths_no_cluster[target_word][sense]
        print("length of sense:", len_sense)

        len_total = all_lengths_no_cluster[target_word]["total"]
        print("total length:", len_total)

        added_weight = len_sense / len_total
        weight = weight + added_weight
        print("added weight:", added_weight, "for a total of", weight)

        F1 = F1 + score * added_weight
        print("F1 score after calc:", F1)
    F1_scores_no_cluster[target_word] = F1





 Avocat
Number of senses: 2

sense: lawyer
instances of sense: 56
amount of correct instances: 56
correct cluster number: 0
total amount of points in biggest cluster: 84
precision: 0.6666666666666666
recall 1.0
F1 0.8

sense: avocado
instances of sense: 28
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precision: 0
recall 0.0
F1 0


 Bien
Number of senses: 3

sense: property
instances of sense: 49
amount of correct instances: 49
correct cluster number: 0
total amount of points in biggest cluster: 98
precision: 0.5
recall 1.0
F1 0.6666666666666666

sense: wellbeing
instances of sense: 20
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precision: 0
recall 0.0
F1 0

sense: good
instances of sense: 29
amount of correct instances: 0
correct cluster number: No cluster
total amount of points in biggest cluster: 0
precision zero
precisio

In [23]:
# The target words, split by part of speech
Nouns = ["Avocat", "Bien", "Bureau", "Faculté", "Glace", "Souris", "Tour", "Vol"]
Verbs = ["Filer", "Supporter", "Tirer"]


def _mean_over(scores, words):
    """Return the mean of the values stored in `scores` for the given `words`."""
    return np.mean([scores[word] for word in words])


# Averages over the nouns, one per metric
nouns_average_weighted = _mean_over(weighted_scores, Nouns)
nouns_average_overall = _mean_over(overall_scores, Nouns)
nouns_average_all_in_one = _mean_over(all_in_one_cluster_overall_scores, Nouns)
nouns_average_v_measure = _mean_over(v_measure_scores, Nouns)
nouns_average_rand = _mean_over(rand_scores, Nouns)
nouns_average_F1 = _mean_over(F1_scores, Nouns)
nouns_average_F1_no_cluster = _mean_over(F1_scores_no_cluster, Nouns)

# Averages over the verbs, one per metric
verbs_average_weighted = _mean_over(weighted_scores, Verbs)
verbs_average_overall = _mean_over(overall_scores, Verbs)
verbs_average_all_in_one = _mean_over(all_in_one_cluster_overall_scores, Verbs)
verbs_average_v_measure = _mean_over(v_measure_scores, Verbs)
verbs_average_rand = _mean_over(rand_scores, Verbs)
verbs_average_F1 = _mean_over(F1_scores, Verbs)
verbs_average_F1_no_cluster = _mean_over(F1_scores_no_cluster, Verbs)

# Report: first the nouns, then (after some blank lines) the verbs
noun_report = [
    ("Nouns average of weighted scores:", nouns_average_weighted),
    ("Nouns average of overall scores:", nouns_average_overall),
    ("Nouns average of all in one cluster scores:", nouns_average_all_in_one),
    ("Nouns average of v-measure cluster score:", nouns_average_v_measure),
    ("Nouns average of adjusted rand score:", nouns_average_rand),
    ("Nouns average of F1 scores:", nouns_average_F1),
    ("Nouns average of F1 scores when no clustering takes place:", nouns_average_F1_no_cluster),
]
verb_report = [
    ("Verbs average of weighted scores:", verbs_average_weighted),
    ("Verbs average of overall scores:", verbs_average_overall),
    ("Verbs average of all in one cluster scores:", verbs_average_all_in_one),
    ("Verbs average of v-measure cluster score:", verbs_average_v_measure),
    ("Verbs average of adjusted rand score:", verbs_average_rand),
    ("Verbs average of F1 scores:", verbs_average_F1),
    ("Verbs average of F1 scores when no clustering takes place:", verbs_average_F1_no_cluster),
]
for label, value in noun_report:
    print(label, value)
print("\n\n")
for label, value in verb_report:
    print(label, value)


Nouns average of weighted scores: 0.6848634455301387
Nouns average of overall scores: 0.6641909694859185
Nouns average of all in one cluster scores: 0.5149002922851882
Nouns average of v-measure cluster score: 0.6027714751898909
Nouns average of adjusted rand score: 0.47025112930933577
Nouns average of F1 scores: 0.7256925291133266
Nouns average of F1 scores when no clustering takes place: 0.3574206110459456



Verbs average of weighted scores: 0.6198262039421247
Verbs average of overall scores: 0.6647630147630148
Verbs average of all in one cluster scores: 0.5700709013209013
Verbs average of v-measure cluster score: 0.4665299416210773
Verbs average of adjusted rand score: 0.339278419353348
Verbs average of F1 scores: 0.69574331049751
Verbs average of F1 scores when no clustering takes place: 0.41897442708157473


In [24]:
# We create a dataframe from the dictionaries to eventually save the results to a csv file.
# The rows are the target words. Every column is filled by looking each word up in its
# dictionary, so the values are matched to the rows by key and not by dictionary order.
df = pd.DataFrame.from_dict(weighted_scores, orient="index", columns=["Weighted"])
df["Overall"] = [overall_scores[word] for word in df.index]
df["V_measure"] = [v_measure_scores[word] for word in df.index]
df["Rand"] = [rand_scores[word] for word in df.index]
df["F1"] = [F1_scores[word] for word in df.index]

# Every word that is not listed in Nouns is labelled as a verb
df["Type"] = ["Noun" if word in Nouns else "Verb" for word in df.index]
df["Amount_senses"] = [amount_senses[word] for word in df.index]

# We also create a separate dataframe for the all in one cluster (no clustering) scores
df_all_in_one = pd.DataFrame.from_dict(all_in_one_cluster_overall_scores, orient="index", columns=["Overall_no_cluster"])
df_all_in_one["F1_no_cluster"] = [F1_scores_no_cluster[word] for word in df_all_in_one.index]
df_all_in_one["Type"] = ["Noun" if word in Nouns else "Verb" for word in df_all_in_one.index]

In [25]:
# We add the average rows to the dataframes.
# The last two columns ("Type" and "Amount_senses") have no meaningful average and are left empty.
df_with_average = df.copy()
df_with_average.loc["Average_nouns"] = [nouns_average_weighted, nouns_average_overall, nouns_average_v_measure, nouns_average_rand, nouns_average_F1, "", ""]
df_with_average.loc["Average_verbs"] = [verbs_average_weighted, verbs_average_overall, verbs_average_v_measure, verbs_average_rand, verbs_average_F1, "", ""]
# The global average is taken from the original df, so the average rows above are not counted
df_with_average.loc["Global_average"] = [df["Weighted"].mean(), df["Overall"].mean(), df["V_measure"].mean(), df["Rand"].mean(), df["F1"].mean(), "", ""]

# df_all_in_one is extended in place (no copy is made). The global average is therefore
# calculated from the per-word rows only: average rows left over from an earlier run of
# this cell are dropped first, so running the cell again gives the same result.
average_labels = ["Global_average", "Average_nouns", "Average_verbs"]
per_word_rows = df_all_in_one.drop(index=average_labels, errors="ignore")
df_all_in_one.loc["Global_average"] = [per_word_rows["Overall_no_cluster"].mean(), per_word_rows["F1_no_cluster"].mean(), ""]
df_all_in_one.loc["Average_nouns"] = [nouns_average_all_in_one, nouns_average_F1_no_cluster, ""]
df_all_in_one.loc["Average_verbs"] = [verbs_average_all_in_one, verbs_average_F1_no_cluster, ""]


In [26]:
# Weighted score against overall score for each word (shown only; per its title not very informative).
# The figure used to be built a second time without being shown; that redundant call is removed.
fig_zegt_weinig = px.scatter(df, x="Weighted", y="Overall", text=df.index, color="Type", title="Scores for each word, op een manier die weinig zegt eigenlijk", template="plotly_white")
fig_zegt_weinig.show()

# V-measure against F1 for each word; `fig` is the figure that is written to HTML when the results are saved
fig = px.scatter(df, x="V_measure", y="F1", text=df.index, color="Type", title="Scores for each word", template="plotly_white")
fig.show()

# Estimating the number of clusters
We will now calculate if the model correctly predicts the number of clusters.
We will count the number of different manually annotated senses and compare this to the amount of generated clusters.

In [27]:
correct = []
close = []  # One cluster off, often the result of different granularity. Is it ok to report like this or do we have to be more strict?
incorrect = []
differences = {}  # target word -> clusters - senses (negative: too few clusters, positive: too many)
input_dir = f"{model_name}/Experiment_1/Automatic/"
# os.listdir returns the entries in arbitrary order; sorting keeps the output reproducible
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".csv"):
        continue
    # A dedicated name is used here so that the summary dataframe `df` is not overwritten
    df_clusters = pd.read_csv(input_dir + filename, sep=";", encoding="utf-8", header=0)
    # Positional access to the first row, so that a file with a single row also works.
    # Assumes the "source" column holds the same target word on every row of a file - TODO confirm
    target_word = df_clusters["source"].iloc[0]
    print(target_word)
    # We count the number of different senses
    senses = len(df_clusters["sense"].unique())
    # We count the number of clusters
    clusters = len(df_clusters["cluster"].unique())
    print("Number of senses:", senses)
    print("Number of clusters:", clusters)

    # Signed difference: its sign tells the direction, its size how far off the estimate is
    difference = clusters - senses
    if difference == 0:
        print("Correct number of clusters")
        correct.append(target_word)
    else:
        print(abs(difference), "too few clusters" if difference < 0 else "too many clusters")
        if abs(difference) == 1:
            close.append(target_word)
        else:
            incorrect.append(target_word)

    differences[target_word] = difference

print("Correct:", correct)
print("Close:", close)
print("Incorrect:", incorrect)
# Share of the words for which the number of clusters is correct or one off
total_words = len(correct) + len(close) + len(incorrect)
if total_words > 0:
    print("Percentage approach:", (len(correct) + len(close)) / total_words)
else:
    # Without any CSV file there is nothing to divide by
    print("Percentage approach: no CSV files found in", input_dir)

Avocat
Number of senses: 2
Number of clusters: 3
1 too many clusters
Bien
Number of senses: 3
Number of clusters: 5
2 too many clusters
Bureau
Number of senses: 3
Number of clusters: 4
1 too many clusters
Faculté
Number of senses: 3
Number of clusters: 5
2 too many clusters
Filer
Number of senses: 3
Number of clusters: 5
2 too many clusters
Glace
Number of senses: 5
Number of clusters: 4
1 too few clusters
Souris
Number of senses: 3
Number of clusters: 6
3 too many clusters
Supporter
Number of senses: 3
Number of clusters: 5
2 too many clusters
Tirer
Number of senses: 2
Number of clusters: 2
Correct number of clusters
Tour
Number of senses: 5
Number of clusters: 7
2 too many clusters
Vol
Number of senses: 2
Number of clusters: 5
3 too many clusters
Correct: ['Tirer']
Close: ['Avocat', 'Bureau', 'Glace']
Incorrect: ['Bien', 'Faculté', 'Filer', 'Souris', 'Supporter', 'Tour', 'Vol']
Percentage approach: 0.36363636363636365


**Results for Camembert:**
Poor: 1 word gets the correct number of clusters, 3 are close (one cluster off) and the remaining 7 are incorrect.

**Results for Flaubert:**
Much worse than experiment 1 (only .45 close, no exact matches)

In [28]:
# New column "Cluster_Estimation": the cluster-count difference of each word.
# Rows without an entry in `differences` get an empty string.
df_with_average["Cluster_Estimation"] = [differences.get(word, "") for word in df_with_average.index]

In [29]:
# We save these dataframes to csv files (and the scatter plot to an html file)
results_dir = "Results/Experiment_1/Automatic/"
# exist_ok=True creates the folder when needed and does nothing when it already exists
os.makedirs(results_dir, exist_ok=True)
df_with_average.to_csv(f"{results_dir}{model_name}_scores.csv", sep=";", encoding="utf-8")
fig.write_html(f"{results_dir}{model_name}_scores.html")
# NOTE(review): this file name does not contain the model name, so every run overwrites it.
# Presumably fine because the no-clustering scores do not use the model's clusters - confirm
# that both models are evaluated on the same data.
df_all_in_one.to_csv(results_dir + "all_in_one_scores.csv", sep=";", encoding="utf-8")

# Comparing the Models

In [30]:
# Load the saved score tables of both models from the "Results" folder (the first column
# is the index) and tag each table with a "Model" column that names the model
results_dir = "Results/Experiment_1/Automatic/"
df_camembert = pd.read_csv(results_dir + "Camembert_scores.csv", sep=";", encoding="utf-8", index_col=0).assign(Model="Camembert")
df_flaubert = pd.read_csv(results_dir + "Flaubert_scores.csv", sep=";", encoding="utf-8", index_col=0).assign(Model="Flaubert")

# Stack both tables into one dataframe
df = pd.concat([df_camembert, df_flaubert])

In [31]:
# One dataframe per metric, each holding the metric column together with the "Model" column
df_weighted, df_overall, df_v_measure, df_rand, df_f1 = (
    df[[metric, "Model"]]
    for metric in ("Weighted", "Overall", "V_measure", "Rand", "F1")
)

In [32]:
# Reshape every metric dataframe:
# - the values of the "Model" column become the columns (one per model)
# - the rows of the same word (index) are regrouped in one row
df_weighted, df_overall, df_v_measure, df_rand, df_f1 = (
    metric_frame.pivot(columns="Model")
    for metric_frame in (df_weighted, df_overall, df_v_measure, df_rand, df_f1)
)

In [33]:
# Every metric table is saved twice: as a csv file and as an excel file
results_dir = "Results/Experiment_1/Automatic/"
metric_tables = [
    (df_weighted, "weighted_scores"),
    (df_overall, "overall_scores"),
    (df_v_measure, "v_measure_scores"),
    (df_rand, "rand_scores"),
    (df_f1, "F1_scores"),
]

# First all csv files ...
for table, stem in metric_tables:
    table.to_csv(results_dir + stem + ".csv", sep=";", encoding="utf-8")

# ... then all excel files
for table, stem in metric_tables:
    table.to_excel(results_dir + stem + ".xlsx")


In [34]:
def bold_row_maxima(xlsx_path, first_data_row=4, first_col=2, last_col=3):
    """Put the highest value of each row of an excel file in bold and save the file in place.

    Parameters:
        xlsx_path: path of the workbook; its active sheet is edited.
        first_data_row: first row that is inspected. The default of 4 presumably skips
            the header rows written for the pivoted dataframes - TODO confirm.
        first_col, last_col: range of inspected columns (inclusive). The default of 2-3
            presumably covers one column per model - TODO confirm.

    Empty cells are ignored; a row without any value is left untouched.
    """
    wb = openpyxl.load_workbook(xlsx_path)
    ws = wb.active
    for row in ws.iter_rows(min_row=first_data_row, max_row=ws.max_row, min_col=first_col, max_col=last_col):
        # Empty cells have the value None, which max() cannot compare with numbers
        values = [cell.value for cell in row if cell.value is not None]
        if not values:
            continue
        max_value = max(values)
        for cell in row:
            # On a tie, every cell holding the maximum is put in bold
            if cell.value == max_value:
                cell.font = openpyxl.styles.Font(bold=True)
    wb.save(xlsx_path)


# Using openpyxl, we put the highest value of each row in bold in the excel files of the
# weighted, overall, v-measure and F1 scores.
# NOTE(review): rand_scores.xlsx is written as well but is not highlighted - confirm this is intended.
for stem in ("weighted_scores", "overall_scores", "v_measure_scores", "F1_scores"):
    bold_row_maxima(f"Results/Experiment_1/Automatic/{stem}.xlsx")

In [35]:
# Save the current `df` (at this point in the notebook: the score tables of both models
# stacked together, with a "Model" column) to a single semicolon-separated csv file
df.to_csv("Results/Experiment_1/Automatic/results.csv", sep=";", encoding="utf-8")

In [36]:
def _model_boxplot(metric, title):
    """Show a boxplot of the `metric` column of `df` for each model and return the figure.

    There is one box per value of the "Model" column; the legend is hidden.
    """
    figure = px.box(df, x="Model", y=metric, color="Model", title=title, template="plotly_white")
    figure.update(layout_showlegend=False)
    figure.show()
    return figure


# One boxplot per metric, comparing the models
fig_weighted = _model_boxplot("Weighted", "Weighted scores for each model")
fig_overall = _model_boxplot("Overall", "Overall scores for each model")
fig_v_measure = _model_boxplot("V_measure", "V measure clustering scores for each model")
fig_rand = _model_boxplot("Rand", "Adjusted Rand scores for each model")
fig_F1 = _model_boxplot("F1", "F1 scores for each model")

As we are more familiar with data visualisation in R, we will use the saved files to create the visualisations in R.

