In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, FlaubertWithLMHeadModel
import pandas as pd
import os
import re
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics.cluster import v_measure_score, adjusted_rand_score
from sklearn.cluster import KMeans
import numpy as np
import warnings

  torch.utils._pytree._register_pytree_node(


In [2]:
!pip install sacremoses



In [3]:
tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
model = AutoModel.from_pretrained("flaubert/flaubert_base_cased")
BERT_model= "Flaubert"
vocab = tokenizer.get_vocab()

In [4]:
cuda = torch.device('cuda')
model.cuda()

FlaubertModel(
  (position_embeddings): Embedding(512, 768)
  (embeddings): Embedding(68729, 768, padding_idx=2)
  (layer_norm_emb): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0-11): 12 x MultiHeadAttention(
      (q_lin): Linear(in_features=768, out_features=768, bias=True)
      (k_lin): Linear(in_features=768, out_features=768, bias=True)
      (v_lin): Linear(in_features=768, out_features=768, bias=True)
      (out_lin): Linear(in_features=768, out_features=768, bias=True)
    )
  )
  (layer_norm1): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
  (ffns): ModuleList(
    (0-11): 12 x TransformerFFN(
      (lin1): Linear(in_features=768, out_features=3072, bias=True)
      (lin2): Linear(in_features=3072, out_features=768, bias=True)
      (act): GELUActivation()
    )
  )
  (layer_norm2): ModuleList(
    (0-11): 12 x LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)

In [5]:
WSD_PATTERN = r' (\w+)/\w+\.\w\.\d+' # (Woord), letterlijke slash, woord, letterlijke punt, letter, cijfer(s)

def get_target_word(sentence, target_word):
    word_match = re.search(r'\b{}\w*\b'.format(target_word), sentence, re.IGNORECASE)
    if word_match is None:
        return None
    return word_match.group(0)

def preproc_sentence(sentence):
    sent_preproc = re.sub(WSD_PATTERN, r'\1', sentence) # Alleen woord blijft over
    return sent_preproc

def find_position_word(sentence, word):
    ids_word = tokenizer.encode(word)
    tokens_word = tokenizer.convert_ids_to_tokens(ids_word)[1:-1]
    ids_sentence = tokenizer.encode(sentence)
    tokens_sentence = tokenizer.convert_ids_to_tokens(ids_sentence)
    #print(tokens_sentence) # if the code breaks, enabling this helps to see if tokenization of the sentence is what breaks it
    if len(tokens_word) == 1:
        position_word_in_sentence = tokens_sentence.index(tokens_word[0])
    else:
        position_word_in_sentence = [tokens_sentence.index(tokens_word[0]),tokens_sentence.index(tokens_word[-1])+1]
    return position_word_in_sentence

@torch.no_grad()
def encode_sentence_and_extract_position(sentence, position):
    ids = tokenizer.encode(sentence)
    bert_output = model.forward(torch.tensor(ids, device=cuda).unsqueeze(0))
    final_layer_embeddings = bert_output['last_hidden_state'].squeeze()
    if type(position) == int:
        return final_layer_embeddings[position].unsqueeze(0)
    elif type(position) == list:
        return torch.mean(
            final_layer_embeddings[position[0]:position[1]], 0
            ).unsqueeze(0)

def get_embeddings_from_dataframe(dataframe):
    word = dataframe['source'].iloc[0].lower()
    embeddings = []
    rows_to_keep = []
    for index, sentence in dataframe.iterrows():
        matched_word_form = get_target_word(sentence['match'], sentence['source'])
            # print(sentence["match"], matched_word_form, "\n")
        if matched_word_form == word:

            prep_sent = preproc_sentence(sentence['match'])
            try:
                position = find_position_word(prep_sent, matched_word_form)
                embeddings.append(encode_sentence_and_extract_position(prep_sent, position))
                rows_to_keep.append(sentence)
            except:
                print(f"Error with sentence: {sentence['match']}")
    new_df = pd.DataFrame(rows_to_keep)
    return new_df, embeddings

# functie om dataframe uit te breiden met drie PCA-waarden
def extend_df_with_pca(df, m_np):
    df_new = df.copy()

    pca = PCA(n_components=3)
    components = pca.fit_transform(m_np)

    df_new.insert(1, 'x', components[:,0])
    df_new.insert(2, 'y', components[:,1])
    df_new.insert(3, 'z', components[:,2])

    return df_new


In [71]:
Word = "Vol"
df = pd.read_csv(f'Corpus/Final/Manual/{Word}.csv', sep=";", encoding="utf-8", header=0)

new_df, embeddings = get_embeddings_from_dataframe(df)
emb_matrix = torch.cat(embeddings, dim=0)
print(f"{len(embeddings)} left of {len(df)}")
# constructie van numpy-matrix; die matrix kunnen we gebruiken voor PCA
matrix_np = emb_matrix.cpu().detach().numpy()

df_pca_og = extend_df_with_pca(new_df, matrix_np)

85 left of 90


# Clustering
We will calculate the optimal number of clusters using a Bayesian Information Criterion


In [72]:

warnings.filterwarnings("ignore")

df_pca_numerical_og = df_pca_og.select_dtypes(include=[np.number])
# Assuming data_og is your data
data_og = df_pca_numerical_og.to_numpy()

# List to hold BIC values
bic_values = []

# Range of potential cluster numbers to test
cluster_range = range(1,11)

# Fit Gaussian Mixture Models for each number of clusters
for i in cluster_range:
    print(f"Fitting model with {i} clusters")
    gmm = GaussianMixture(n_components=i, random_state=0).fit(data_og)
    bic_values.append(gmm.bic(data_og))

# Find the number of clusters that gives the minimum BIC
optimal_clusters = cluster_range[np.argmin(bic_values)]
print(f"Optimal number of clusters: {optimal_clusters}")

# Fit the optimal model
gmm_optimal = GaussianMixture(n_components=optimal_clusters).fit(data_og)

# Predict the cluster for each data point
clusters_og = gmm_optimal.predict(data_og)
# We want them to start counting at "1" instead of "0" (to assign cluster number "0" to wrong clustering later)
clusters_og += 1
print(f"Number of clusters: {len(np.unique(clusters_og))}")

# Add the clusters to the dataframe
df_pca_og["cluster"] = clusters_og

Fitting model with 1 clusters
Fitting model with 2 clusters
Fitting model with 3 clusters
Fitting model with 4 clusters
Fitting model with 5 clusters
Fitting model with 6 clusters
Fitting model with 7 clusters
Fitting model with 8 clusters
Fitting model with 9 clusters
Fitting model with 10 clusters
Optimal number of clusters: 3
Number of clusters: 3


# Baselines
We will make a copy were 1 cluster is made, and a copy where 3 clusters are always made


In [74]:

warnings.filterwarnings("ignore")

# Fit the optimal model
gmm_no_cluster = GaussianMixture(n_components=1).fit(data_og)
gmm_three_clusters = GaussianMixture(n_components=3).fit(data_og)
gmm_seven_clusters = GaussianMixture(n_components=7).fit(data_og)
kmeans_three_clusters = KMeans(n_clusters=3, random_state=0).fit(data_og)
kmeans_seven_clusters = KMeans(n_clusters=7, random_state=0).fit(data_og)

# Predict the cluster for each data point
clusters_no_cluster = gmm_no_cluster.predict(data_og)
clusters_three_cluster = gmm_three_clusters.predict(data_og)
clusters_seven_cluster = gmm_seven_clusters.predict(data_og)
clusters_kmeans_three_cluster = kmeans_three_clusters.predict(data_og)
clusters_kmeans_seven_cluster = kmeans_seven_clusters.predict(data_og)

# We want them to start counting at "1" instead of "0" (to assign cluster number "0" to wrong clustering later)
clusters_no_cluster += 1
clusters_three_cluster += 1
clusters_seven_cluster += 1
clusters_kmeans_three_cluster += 1
clusters_kmeans_seven_cluster += 1



# Add the clusters to the dataframe
df_pca_og["no_cluster"] = clusters_no_cluster
df_pca_og["three_cluster"] = clusters_three_cluster
df_pca_og["seven_cluster"] = clusters_seven_cluster
df_pca_og["kmeans_three_cluster"] = clusters_kmeans_three_cluster
df_pca_og["kmeans_seven_cluster"] = clusters_kmeans_seven_cluster


# Calculating the clustering score
As explained in the *methode* section of the report, we calculate a score for the clustering
 based on the ratio of datapoints falling into a "default" cluster for each manually annotated sense.

we use a Gaussian Mixture model like we do with BIC, but also KMeans as it is the most widely used and simple cluster algorithm


In [75]:
# we will compare the BIC with baseline values using v measure score
v_measure = v_measure_score(df_pca_og["sense"], df_pca_og["cluster"])
v_measure_no_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["no_cluster"])
v_measure_three_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["three_cluster"])
v_measure_seven_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["seven_cluster"])
v_measure_kmeans_three_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["kmeans_three_cluster"])
v_measure_kmeans_seven_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["kmeans_seven_cluster"])

print(f"score with optimal amount of clusters ({optimal_clusters})", v_measure)
print("v-measure no cluster score", v_measure_no_cluster)
print("v-measure three cluster score", v_measure_three_cluster)
print("v-measure seven cluster score", v_measure_seven_cluster)
print("v-measure kmeans three cluster score", v_measure_kmeans_three_cluster)
print("v-measure kmeans seven cluster score", v_measure_kmeans_seven_cluster)


# We also use adjusted rand index scores

adjusted_rand = adjusted_rand_score(df_pca_og["sense"], df_pca_og["cluster"])
adjusted_rand_no_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["no_cluster"])
adjusted_rand_three_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["three_cluster"])
adjusted_rand_seven_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["seven_cluster"])
adjusted_rand_km_three_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["kmeans_three_cluster"])
adjusted_rand_km_seven_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["kmeans_seven_cluster"])

print(f"adjusted rand index score with optimal ({optimal_clusters}) clusters:", adjusted_rand)
print("adjusted rand index no cluster score", adjusted_rand_no_cluster)
print("adjusted rand index three cluster score", adjusted_rand_three_cluster)
print("adjusted rand index seven cluster score", adjusted_rand_seven_cluster)
print("adjusted rand index kmeans three cluster score", adjusted_rand_km_three_cluster)
print("adjusted rand index kmeans seven cluster score", adjusted_rand_km_seven_cluster)

score with optimal amount of clusters (3) 0.8963335202563367
v-measure no cluster score 0.0
v-measure three cluster score 0.8963335202563368
v-measure seven cluster score 0.5082382530662843
v-measure kmeans three cluster score 0.8963335202563367
v-measure kmeans seven cluster score 0.4646915030414075
adjusted rand index score with optimal (3) clusters: 0.9032147596421386
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.9032147596421386
adjusted rand index seven cluster score 0.27825150345570415
adjusted rand index kmeans three cluster score 0.9032147596421386
adjusted rand index kmeans seven cluster score 0.24453136317543098


In [6]:
# We do this automatically for each word
Word_list = ["Avocat", "Bien", "Bureau", "Faculté", "Filer", "Glace", "Souris", "Supporter", "Tirer", "Tour", "Vol"]

v_measure_results = {}
adjusted_rand_results = {}

for Word in Word_list:
    print(f"{Word}")
    df = pd.read_csv(f'Corpus/Final/Manual/{Word}.csv', sep=";", encoding="utf-8", header=0)
    
    new_df, embeddings = get_embeddings_from_dataframe(df)
    emb_matrix = torch.cat(embeddings, dim=0)
    print(f"{len(embeddings)} left of {len(df)}")
    # constructie van numpy-matrix; die matrix kunnen we gebruiken voor PCA
    matrix_np = emb_matrix.cpu().detach().numpy()
    
    df_pca_og = extend_df_with_pca(new_df, matrix_np)
        
        
    #warnings.filterwarnings("ignore")
    
    df_pca_numerical_og = df_pca_og.select_dtypes(include=[np.number])
    # Assuming data_og is your data
    data_og = df_pca_numerical_og.to_numpy()
    
    # List to hold BIC values
    bic_values = []
    
    # Range of potential cluster numbers to test
    cluster_range = range(1,11)
    
    # Fit Gaussian Mixture Models for each number of clusters
    for i in cluster_range:
        #print(f"Fitting model with {i} clusters")
        gmm = GaussianMixture(n_components=i, random_state=0).fit(data_og)
        bic_values.append(gmm.bic(data_og))
    
    # Find the number of clusters that gives the minimum BIC
    optimal_clusters = cluster_range[np.argmin(bic_values)]
    correct_clusters = len(df_pca_og["sense"].unique())
    print(f"Estimation optimal number of clusters: {optimal_clusters} (correct: {correct_clusters})")
    
    # Fit the optimal model
    # We use fixed random_state seeds so this does not cause unwanted variation in the results for comparison
    gmm_optimal = GaussianMixture(n_components=optimal_clusters, random_state=0).fit(data_og)
    gmm_correct_clusters = GaussianMixture(n_components=correct_clusters, random_state=0).fit(data_og)
    gmm_three_clusters = GaussianMixture(n_components=3, random_state=0).fit(data_og)
    gm_seven_clusters = GaussianMixture(n_components=7, random_state=0).fit(data_og)
    gmm_no_cluster = GaussianMixture(n_components=1, random_state=0).fit(data_og)
    
    
    # Predict the cluster for each data point
    clusters_og = gmm_optimal.predict(data_og)
    clusters_correct_clusters = gmm_correct_clusters.predict(data_og)
    clusters_no_cluster = gmm_no_cluster.predict(data_og)
    clusters_three_cluster = gmm_three_clusters.predict(data_og)
    clusters_seven_cluster = gm_seven_clusters.predict(data_og)
    # We want them to start counting at "1" instead of "0" (to assign cluster number "0" to wrong clustering later)
    clusters_og += 1
    clusters_correct_clusters += 1
    clusters_no_cluster += 1
    clusters_three_cluster += 1
    clusters_seven_cluster += 1

    # Add the clusters to the dataframe
    df_pca_og["cluster"] = clusters_og
    df_pca_og["correct_cluster"] = clusters_correct_clusters
    df_pca_og["no_cluster"] = clusters_no_cluster
    df_pca_og["three_cluster"] = clusters_three_cluster
    df_pca_og["seven_cluster"] = clusters_seven_cluster
    # we will comparing the BIC with baseline values using v measure score
    
    v_measure = v_measure_score(df_pca_og["sense"], df_pca_og["cluster"])
    v_measure_correct_clusters = v_measure_score(df_pca_og["sense"], df_pca_og["correct_cluster"])
    v_measure_no_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["no_cluster"])
    v_measure_three_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["three_cluster"])
    v_measure_seven_cluster = v_measure_score(df_pca_og["sense"], df_pca_og["seven_cluster"])
    
    print(f"score with optimal amount of clusters ({optimal_clusters})", v_measure)
    print(f"score with correct amount of clusters ({correct_clusters})", v_measure_correct_clusters)
    print("v-measure no cluster score", v_measure_no_cluster)
    print("v-measure three cluster score", v_measure_three_cluster)
    print("v-measure seven cluster score", v_measure_seven_cluster)
    
    # We also use adjusted rand index scores
    adjusted_rand = adjusted_rand_score(df_pca_og["sense"], df_pca_og["cluster"])
    adjusted_rand_correct_clusters = adjusted_rand_score(df_pca_og["sense"], df_pca_og["correct_cluster"])
    adjusted_rand_no_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["no_cluster"])
    adjusted_rand_three_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["three_cluster"])
    adjusted_rand_seven_cluster = adjusted_rand_score(df_pca_og["sense"], df_pca_og["seven_cluster"])
    
    print(f"adjusted rand index score with optimal ({optimal_clusters}) clusters:", adjusted_rand)
    print(f"adjusted rand index score with correct ({correct_clusters}) clusters:", adjusted_rand_correct_clusters)
    print("adjusted rand index no cluster score", adjusted_rand_no_cluster)
    print("adjusted rand index three cluster score", adjusted_rand_three_cluster)
    print("adjusted rand index seven cluster score", adjusted_rand_seven_cluster)
    
       # Save the v-measure results
    v_measure_results[Word] = {
        "optimal_clusters": optimal_clusters,
        "correct_clusters": correct_clusters,
        "score": v_measure,
        "correct_cluster_score": v_measure_correct_clusters,
        "no_cluster_baseline": v_measure_no_cluster,
        "three_cluster_baseline": v_measure_three_cluster,
        "seven_cluster_baseline": v_measure_seven_cluster
    }

    # Save the adjusted rand index results
    adjusted_rand_results[Word] = {
        "optimal_clusters": optimal_clusters,
        "correct_clusters": correct_clusters,
        "score":   adjusted_rand,
        "correct_cluster_score": adjusted_rand_correct_clusters,
        "no_cluster_baseline": adjusted_rand_no_cluster,
        "three_cluster_baseline": adjusted_rand_three_cluster,
        "seven_cluster_baseline": adjusted_rand_seven_cluster
    } 
    print("\n\n")

Avocat
84 left of 107




Estimation optimal number of clusters: 3 (correct: 2)
score with optimal amount of clusters (3) 0.7020502716093676
score with correct amount of clusters (2) 0.7158920010774074
v-measure no cluster score 0.0
v-measure three cluster score 0.7020502716093676
v-measure seven cluster score 0.4549075121905148
adjusted rand index score with optimal (3) clusters: 0.8036426589295034
adjusted rand index score with correct (2) clusters: 0.8150984378045724
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.8036426589295034
adjusted rand index seven cluster score 0.26117861836355516



Bien
98 left of 99




Estimation optimal number of clusters: 3 (correct: 3)
score with optimal amount of clusters (3) 0.9541014187403705
score with correct amount of clusters (3) 0.9541014187403705
v-measure no cluster score 0.0
v-measure three cluster score 0.9541014187403705
v-measure seven cluster score 0.6193532561843749
adjusted rand index score with optimal (3) clusters: 0.9654410605877043
adjusted rand index score with correct (3) clusters: 0.9654410605877043
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.9654410605877043
adjusted rand index seven cluster score 0.44985935368469426



Bureau




103 left of 123




Estimation optimal number of clusters: 2 (correct: 3)
score with optimal amount of clusters (2) 0.3185719610891792
score with correct amount of clusters (3) 0.27203746491149194
v-measure no cluster score 0.0
v-measure three cluster score 0.27203746491149194
v-measure seven cluster score 0.2846532004741612
adjusted rand index score with optimal (2) clusters: 0.19385283320417607
adjusted rand index score with correct (3) clusters: 0.16733117687402116
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.16733117687402116
adjusted rand index seven cluster score 0.12884464129290873



Faculté




80 left of 106




Estimation optimal number of clusters: 2 (correct: 3)
score with optimal amount of clusters (2) 0.7988668250065365
score with correct amount of clusters (3) 0.6752659110733857
v-measure no cluster score 0.0
v-measure three cluster score 0.6752659110733857
v-measure seven cluster score 0.7195925940286167
adjusted rand index score with optimal (2) clusters: 0.7336714213301259
adjusted rand index score with correct (3) clusters: 0.49102688164483094
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.49102688164483094
adjusted rand index seven cluster score 0.48800279362674304



Filer
96 left of 96




Estimation optimal number of clusters: 4 (correct: 3)
score with optimal amount of clusters (4) 0.8818594500715287
score with correct amount of clusters (3) 0.8788495222931244
v-measure no cluster score 0.0
v-measure three cluster score 0.8788495222931244
v-measure seven cluster score 0.7038029206215771
adjusted rand index score with optimal (4) clusters: 0.7909868993574418
adjusted rand index score with correct (3) clusters: 0.8797819352773357
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.8797819352773357
adjusted rand index seven cluster score 0.4476363183814248



Glace




123 left of 152




Estimation optimal number of clusters: 4 (correct: 5)
score with optimal amount of clusters (4) 0.4643790540203865
score with correct amount of clusters (5) 0.5671114709758175
v-measure no cluster score 0.0
v-measure three cluster score 0.34766134646155905
v-measure seven cluster score 0.5038918686197962
adjusted rand index score with optimal (4) clusters: 0.4259438760334223
adjusted rand index score with correct (5) clusters: 0.5353303160667116
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.360389886666572
adjusted rand index seven cluster score 0.3910921343997244



Souris




68 left of 68




Estimation optimal number of clusters: 9 (correct: 3)
score with optimal amount of clusters (9) 0.5032115129253564
score with correct amount of clusters (3) 0.4425301335934341
v-measure no cluster score 0.0
v-measure three cluster score 0.4425301335934341
v-measure seven cluster score 0.5653970427251596
adjusted rand index score with optimal (9) clusters: 0.30354728942339065
adjusted rand index score with correct (3) clusters: 0.33552305472006466
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.33552305472006466
adjusted rand index seven cluster score 0.34202766818979513



Supporter




55 left of 74




Estimation optimal number of clusters: 10 (correct: 3)
score with optimal amount of clusters (10) 0.36798138559774607
score with correct amount of clusters (3) 0.4276795776950087
v-measure no cluster score 0.0
v-measure three cluster score 0.4276795776950087
v-measure seven cluster score 0.39267357546694237
adjusted rand index score with optimal (10) clusters: 0.1416918893961935
adjusted rand index score with correct (3) clusters: 0.304319041064864
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.304319041064864
adjusted rand index seven cluster score 0.1757253616817201



Tirer
104 left of 106




Estimation optimal number of clusters: 3 (correct: 2)
score with optimal amount of clusters (3) 0.1665425464505238
score with correct amount of clusters (2) 0.12430999715752401
v-measure no cluster score 0.0
v-measure three cluster score 0.1665425464505238
v-measure seven cluster score 0.32186242620610556
adjusted rand index score with optimal (3) clusters: 0.04061766522544781
adjusted rand index score with correct (2) clusters: -0.08180165623106443
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.04061766522544781
adjusted rand index seven cluster score 0.17836425865594357



Tour




74 left of 97




Estimation optimal number of clusters: 8 (correct: 5)
score with optimal amount of clusters (8) 0.7290700908922257
score with correct amount of clusters (5) 0.6591649016645261
v-measure no cluster score 0.0
v-measure three cluster score 0.587961128712006
v-measure seven cluster score 0.7373873043571656
adjusted rand index score with optimal (8) clusters: 0.5711883211463235
adjusted rand index score with correct (5) clusters: 0.525775593420053
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.31411662203400753
adjusted rand index seven cluster score 0.6086454852748627



Vol
85 left of 90




Estimation optimal number of clusters: 3 (correct: 2)
score with optimal amount of clusters (3) 0.8963335202563367
score with correct amount of clusters (2) 1.0
v-measure no cluster score 0.0
v-measure three cluster score 0.8963335202563367
v-measure seven cluster score 0.5087587500347632
adjusted rand index score with optimal (3) clusters: 0.9032147596421386
adjusted rand index score with correct (2) clusters: 1.0
adjusted rand index no cluster score 0.0
adjusted rand index three cluster score 0.9032147596421386
adjusted rand index seven cluster score 0.27870155850595446


In [7]:
# Save the results from the dictionaries to csv files
v_measure_df = pd.DataFrame(v_measure_results).T
adjusted_rand_df = pd.DataFrame(adjusted_rand_results).T

if not os.path.exists("Results/Experiment_1/Curated/cluster_comparison"):
    os.makedirs("Results/Experiment_1/Curated/cluster_comparison")
v_measure_df.to_csv("Results/Experiment_1/Curated/cluster_comparison/v_measure_results.csv")
adjusted_rand_df.to_csv("Results/Experiment_1/Curated/cluster_comparison/adjusted_rand_results.csv")

 Using R, we calculated if the BIC scores are significantly higher than the baselines with 3 or 7 clusters.

We get significant results for: 
    v-measure / 3 clusters (p = 0.03)
    adjusted rand / 7 clusters (p = 0.02)
    all together (v-measure and adj rand) / 7 clusters = 0.005

(Almost but not quite significant:
    v-measure / 7 clusters (p = 0.06)
    all together (v-measure and adj rand) / 3 clusters = 0.06)
    
Not significant: 
    adjusted rand / 3 clusters (p = 0.24)


