# Ajustament de les diferents combinacions d'encoder-mètrica de similitud per determinar l'índex de similitud semàntica

### Importo llibreries i dependències

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import glob
import os
import pickle
from googletrans import Translator
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
import unidecode
import re
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import time
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Spanish Snowball stemmer, created once and used by the text pre-processing code below.
stemmer = SnowballStemmer('spanish')

### Càrrega dels documents de la base de dades d'una categoria específica

In [3]:
# Read every TFG stored under the fluid-mechanics category into memory.
# To study a different category, change the glob pattern below.
llista_text_documents_mecanica_fluids = []
for document_path in glob.glob('../00Data/dataset_txt/Mecanica_fluids/*'):
    with open(document_path, 'r') as handle:
        contents = handle.read()
    llista_text_documents_mecanica_fluids.append(contents)
        


### Càrrega del document plagiat a analitzar

In [4]:
# The document to analyse is selected by the path in the glob pattern below.
llista_text_documents_plagiats = []
for suspicious_path in glob.glob('../00Data/plagiados_por_categorias/txt/Mecanica_fluids/Turbinas dinámicas y golpe de ariete.txt'):
    with open(suspicious_path, 'r') as handle:
        contents = handle.read()
    llista_text_documents_plagiats.append(contents)

### Preprocessament del text

In [5]:
#Funció que permet pre-processar el text dins de qualsevol tfg i separar-lo en oracions
def clean_raw_text(documento):
    """Split a document into sentences and normalise each one.

    Every sentence is lower-cased, stripped of Spanish stop words, stemmed
    with the module-level ``stemmer`` and cleaned of punctuation, accents
    and digits. Tokens shorter than two characters are dropped.

    Args:
        documento: Raw document text; it is converted with ``str()`` first.

    Returns:
        A list with one cleaned string per sentence. Sentences that end up
        empty after cleaning are omitted.
    """
    # Load the stop-word list once per call; it used to be reloaded for
    # every single token, which dominated the running time.
    spanish_stopwords = set(stopwords.words('spanish'))

    # NOTE(review): sent_tokenize/word_tokenize are called with their default
    # language although the text is Spanish — confirm this is intended.
    text = str(documento).replace('\n', ' ')

    text_sentences = []
    for sentence in nltk.sent_tokenize(text):
        word_tokens = nltk.word_tokenize(sentence.lower())
        filtered_text = " ".join(w for w in word_tokens if w not in spanish_stopwords)
        stemmed_text = " ".join(stemmer.stem(word) for word in nltk.word_tokenize(filtered_text))
        no_punctuation_text = re.sub(r'[^\w\s]', '', stemmed_text)
        no_accents = unidecode.unidecode(no_punctuation_text)
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        no_digits_text = re.sub(r'\d', '', no_accents)
        words = [word for word in nltk.word_tokenize(no_digits_text) if len(word) >= 2]
        full_clean = re.sub(' +', ' ', " ".join(words)).strip()
        # Keep only sentences that still contain something after cleaning.
        if full_clean:
            text_sentences.append(full_clean)

    return text_sentences

In [6]:
# For every document, build the list of its pre-processed sentences
# (one inner list per document).
base_document_sentences = [
    clean_raw_text(raw_document)
    for raw_document in llista_text_documents_mecanica_fluids
]

plag_document_sentences = [
    clean_raw_text(raw_document)
    for raw_document in llista_text_documents_plagiats
]
    


### Generar vocabulari

In [7]:
# Build the Word2Vec training corpus from every word of the base documents.
# NOTE(review): each document is split on single spaces, so every entry of
# vocab_words is the token list of one whitespace-separated chunk rather than
# of a full sentence — confirm this is the intended training unit.

# Load the stop-word list once instead of once per token.
spanish_stopwords = set(stopwords.words('spanish'))

vocab_words = []
for tfg in llista_text_documents_mecanica_fluids:
    for chunk in tfg.replace('\n', ' ').split(' '):
        word_tokens = nltk.word_tokenize(chunk.lower())
        filtered_text = " ".join(w for w in word_tokens if w not in spanish_stopwords)
        stemmed_text = " ".join(stemmer.stem(word) for word in nltk.word_tokenize(filtered_text))
        no_punctuation_text = re.sub(r'[^\w\s]', '', stemmed_text)
        no_accents = unidecode.unidecode(no_punctuation_text)
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        no_digits_text = re.sub(r'\d', '', no_accents)
        kept_words = [word for word in nltk.word_tokenize(no_digits_text) if len(word) >= 2]
        full_clean = re.sub(' +', ' ', " ".join(kept_words)).strip()

        tokens = nltk.word_tokenize(full_clean)
        # Chunks that end up empty after cleaning are not part of the corpus.
        if tokens:
            vocab_words.append(tokens)

print(len(vocab_words))


92696


### TF-IDF + cos similarity

In [80]:
# Flatten the per-document sentence lists into one list of sentences.
all_sentences_in_one_list = [
    sentence
    for document_sentences in base_document_sentences
    for sentence in document_sentences
]

In [81]:
len(all_sentences_in_one_list)

6899

In [82]:
# Create the TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()


In [83]:
# Fit the vectorizer on all base sentences.
tfidf_vectorizer.fit(all_sentences_in_one_list)

# One TF-IDF matrix per base document (one row per sentence).
tf_idf_matrius_documents_base = [
    tfidf_vectorizer.transform(document_sentences)
    for document_sentences in base_document_sentences
]

# TF-IDF matrix of the suspicious document (the only one loaded).
tfidf_matrix_plag = tfidf_vectorizer.transform(plag_document_sentences[0])

In [84]:
%%time
# Cosine similarity between every base sentence (rows) and every sentence of
# the suspicious document (columns), one matrix per base document.
similituds_per_cada_document = [
    cosine_similarity(base_matrix, tfidf_matrix_plag)
    for base_matrix in tf_idf_matrius_documents_base
]

CPU times: user 13.2 ms, sys: 2.13 ms, total: 15.3 ms
Wall time: 15.9 ms


In [85]:
# Global similarity index per base document: for every column of the
# similarity matrix take the maximum over the rows, then average those maxima.
similitud_global_per_document = []
for similarity_matrix in similituds_per_cada_document:
    best_per_column = similarity_matrix.max(axis=0)
    similitud_global_per_document.append(sum(best_per_column) / len(best_per_column))
    
    

In [86]:
similitud_global_per_document

[0.2101534358616269,
 0.16400976859556846,
 0.07648039469877646,
 0.2097877427776879,
 0.2678788919295006,
 0.10316638994815484,
 0.2033653580661739,
 0.19758900534534019,
 0.20177721925011524,
 0.1782031863029215,
 0.15612839115754448,
 0.186245235899371,
 0.22758953708905946,
 0.2336213877690274,
 0.17369625792652288,
 0.6464136971379421]

In [87]:
# Index of the document with the highest similarity. Documents with a similarity above 35% could be printed instead.
similitud_global_per_document.index(max(similitud_global_per_document))

15

In [88]:
# Report the base document from which the analysed document was plagiarised,
# i.e. the one with the highest global similarity index.
best_similarity = max(similitud_global_per_document)
best_index = similitud_global_per_document.index(best_similarity)
best_path = glob.glob('../00Data/dataset_txt/Mecanica_fluids/*')[best_index]

# The previous literal 'S''ha' was two adjacent strings and printed "Sha".
print("S'ha detectat plagi en el document analitzat")
# os.path.basename extracts the file name whatever the path separator is.
print('El document amb títol : '+str(os.path.basename(best_path))+' té una semblança del '+str(best_similarity*100)+'%')


Sha detectat plagi en el document analitzat
El document amb títol : Estudio_fluidodinamico_de_un_agitador_de_turbina té una semblança del 64.6413697137942%


In [89]:
# Mean similarity over the base documents, leaving out the most similar one.
# The best match is located explicitly; dropping the last list element is only
# correct when the best match happens to be listed last.
idx_max_tfidf = similitud_global_per_document.index(max(similitud_global_per_document))
resta_similituds_tfidf = (similitud_global_per_document[:idx_max_tfidf]
                          + similitud_global_per_document[idx_max_tfidf + 1:])
sum(resta_similituds_tfidf)/len(resta_similituds_tfidf)

0.18597948017449273

### WORD2VEC 

In [8]:
from gensim.models import Word2Vec
%time
w2v_model = Word2Vec(vocab_words,
                     size=50,
                     min_count=1, #MIN COUNT EN 1. SINO TINC PROBLEMES A L'HORA DE COMPUTAR COSINE SIMILARITY
                     window=2,
                     workers=4) 

w2v_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


In [91]:
len(w2v_model.wv.vocab) #Vocabulari del word2vec

7534

# Dump the trained model to disk with pickle.
pkl_filename = "w2v_semantic_text_similarity_mecanica_fluids.pkl"
model_path = '../03Models/w2v_models_for_compute_similarities/' + pkl_filename
with open(model_path, 'wb') as model_file:
    pickle.dump(w2v_model, model_file)

### Word2vec + cosine similarity

In [92]:
# Tokenise every sentence of every base document: the result is a list of
# documents, each a list of sentences, each a list of words.
base_documents_frases_tokenitzades = [
    [nltk.word_tokenize(sentence) for sentence in document_sentences]
    for document_sentences in base_document_sentences
]

# Same for the sentences of the suspicious document.
plag_document_frases_tokenitzades = [
    nltk.word_tokenize(sentence) for sentence in plag_document_sentences[0]
]

In [93]:
# Keep, in every sentence of every base document, only the words that belong
# to the Word2Vec vocabulary.
base_filtered_document_sentence_global = [
    [
        [token for token in tokens if token in w2v_model.wv.vocab]
        for tokens in tokenized_document
    ]
    for tokenized_document in base_documents_frases_tokenitzades
]

# Same filter for the suspicious document.
plag_filtered_document_sentence = [
    [token for token in tokens if token in w2v_model.wv.vocab]
    for tokens in plag_document_frases_tokenitzades
]
    

In [94]:
# df_base has one row per base document; each row holds that document's list of tokenised sentences.
# df_plag has one row per sentence of the suspicious document; each row holds that sentence's tokens.
# NOTE(review): the column names differ ('Filtered_Sentence' vs 'Filtered Sentence') and neither
# frame is referenced again in the visible part of the notebook — confirm they are still needed.
df_base = pd.DataFrame({'Filtered_Sentence': base_filtered_document_sentence_global})
df_plag = pd.DataFrame({'Filtered Sentence': plag_filtered_document_sentence})

In [95]:
def _mean_sentence_vector(tokens, keyed_vectors):
    """Return the average Word2Vec embedding of the words in *tokens*.

    Args:
        tokens: List of words, all expected to be in the model vocabulary.
        keyed_vectors: The ``wv`` attribute of the trained Word2Vec model.

    Returns:
        A 1-D array with the element-wise mean of the word vectors. A sentence
        without any word yields a zero vector, because averaging an empty
        array gives NaN and breaks the later cosine-similarity step.
    """
    if not tokens:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.array([keyed_vectors.get_vector(token) for token in tokens]).mean(axis=0)


# Turn every (vocabulary-filtered) sentence of every base document into a
# single vector by averaging the Word2Vec vectors of its words.
base_document_sentence_vectors = [
    [_mean_sentence_vector(sentence, w2v_model.wv) for sentence in document]
    for document in base_filtered_document_sentence_global
]

# Same for the sentences of the suspicious document.
plag_document_sentence_vectors = [
    _mean_sentence_vector(sentence, w2v_model.wv)
    for sentence in plag_filtered_document_sentence
]

In [96]:
# Wrap every 1-D sentence vector in a list so it becomes a 2-D (single-row)
# input, as needed later by cosine_similarity.
base_document_sentence_vectors_good_format = [
    [[sentence_vector] for sentence_vector in document_vectors]
    for document_vectors in base_document_sentence_vectors
]

# Same for the suspicious document.
plag_document_sentence_vectors_good_format = [
    [sentence_vector] for sentence_vector in plag_document_sentence_vectors
]
    

In [97]:
%%time
# Nesting of the result: base document -> sentence of the suspicious document
# -> similarity with every sentence of that base document.
similarities_between_each_vector_plag_each_vector_base = [
    [
        [cosine_similarity(plag_vector, base_vector) for base_vector in base_vectors]
        for plag_vector in plag_document_sentence_vectors_good_format
    ]
    for base_vectors in base_document_sentence_vectors_good_format
]

CPU times: user 16.2 s, sys: 71.2 ms, total: 16.3 s
Wall time: 16.4 s


In [98]:
len(similarities_between_each_vector_plag_each_vector_base)

16

In [99]:
len(similarities_between_each_vector_plag_each_vector_base[0])

14

In [100]:
len(similarities_between_each_vector_plag_each_vector_base[0][0])

541

In [101]:
#similarities_between_each_vector_plag_each_vector_base is a list with 16 elements (one per base document).
#Each document holds a list of 14 elements,
#one for each sentence vector of the plagiarised document.
#Each of those 14 elements contains 541 similarities (for the first base document),
#because every vector of the plagiarised document has been compared with every vector of the base document.
#The goal is, for each document, to take the maximum similarity of each of the 14 elements
#and compute the global similarity index from those maxima.

In [102]:
# For every base document and every sentence of the suspicious document,
# keep the highest similarity found among the base sentences.
best_similarities_sentence_document = [
    [max(sentence_similarities) for sentence_similarities in document_similarities]
    for document_similarities in similarities_between_each_vector_plag_each_vector_base
]

In [103]:
# Similarity index per base document: mean of the best per-sentence similarities.
index_entre_documents_w2v = [
    sum(best_scores) / len(best_scores)
    for best_scores in best_similarities_sentence_document
]

index_entre_documents_w2v

[array([[0.48598108]], dtype=float32),
 array([[0.40634847]], dtype=float32),
 array([[0.314605]], dtype=float32),
 array([[0.43441653]], dtype=float32),
 array([[0.46903542]], dtype=float32),
 array([[0.35735634]], dtype=float32),
 array([[0.4866169]], dtype=float32),
 array([[0.4273612]], dtype=float32),
 array([[0.4642219]], dtype=float32),
 array([[0.4320094]], dtype=float32),
 array([[0.4183214]], dtype=float32),
 array([[0.4368577]], dtype=float32),
 array([[0.52176607]], dtype=float32),
 array([[0.4840319]], dtype=float32),
 array([[0.43751302]], dtype=float32),
 array([[0.7719879]], dtype=float32)]

In [104]:
# Highest similarity index among the base documents.
max(index_entre_documents_w2v)

array([[0.7719879]], dtype=float32)

In [105]:
# Last path component of the base document with the highest Word2Vec similarity index.
# The directory is listed again here, so the lookup relies on glob returning the files in the same order as when they were loaded.
glob.glob('../00Data/dataset_txt/Mecanica_fluids/*')[index_entre_documents_w2v.index(max(index_entre_documents_w2v))].split('/')[-1]

'Estudio_fluidodinamico_de_un_agitador_de_turbina'

In [106]:
# Mean similarity index over the base documents, leaving out the most similar one.
# The best match is located explicitly; dropping the last list element is only
# correct when the best match happens to be listed last.
idx_max_w2v = index_entre_documents_w2v.index(max(index_entre_documents_w2v))
resta_index_w2v = (index_entre_documents_w2v[:idx_max_w2v]
                   + index_entre_documents_w2v[idx_max_w2v + 1:])
sum(resta_index_w2v)/len(resta_index_w2v)

array([[0.43842947]], dtype=float32)

### Word2vec embeddings + word mover's distance

In [107]:
import pyemd

In [108]:
%%time
# Word mover's distance between every sentence of the suspicious document and
# every sentence of each base document.
# Nesting of the result: base document -> suspicious sentence -> distances.
wmdist_between_each_vector_plag_each_vector_base = [
    [
        [w2v_model.wmdistance(base_sentence, plag_sentence) for base_sentence in base_sentences]
        for plag_sentence in plag_filtered_document_sentence
    ]
    for base_sentences in base_filtered_document_sentence_global
]

  


CPU times: user 4min 23s, sys: 693 ms, total: 4min 23s
Wall time: 4min 25s


In [109]:
len(wmdist_between_each_vector_plag_each_vector_base)

16

In [110]:
len(wmdist_between_each_vector_plag_each_vector_base[0])

14

In [111]:
len(wmdist_between_each_vector_plag_each_vector_base[0][0])

541

In [112]:
# For every base document and every sentence of the suspicious document,
# keep the smallest distance found among the base sentences.
less_distance_sentence_document = [
    [min(sentence_distances) for sentence_distances in document_distances]
    for document_distances in wmdist_between_each_vector_plag_each_vector_base
]

In [113]:
less_distance_sentence_document[0]

[0.9428449303501133,
 1.0771786720228143,
 1.05127246715718,
 0.859252649579193,
 1.0986147916534428,
 1.0875597113012556,
 0.9730258332700894,
 1.0505522011919317,
 0.9993910348652768,
 0.9489173508866168,
 1.0092065774347254,
 1.046650517796031,
 0.9698651883140696,
 0.8989752608608694]

In [114]:
# Distance per base document: mean of the smallest per-sentence distances.
distance_entre_documents_w2v = [
    sum(smallest_distances) / len(smallest_distances)
    for smallest_distances in less_distance_sentence_document
]

distance_entre_documents_w2v

[1.0009505133345435,
 1.055623165602435,
 1.1427019837248673,
 1.0356003308711226,
 1.011058408232542,
 1.1044433624207233,
 1.0394452348479715,
 1.0289421062446322,
 1.020307430692151,
 1.031916893890839,
 1.080745489271955,
 1.0182708095978548,
 0.9915994734672594,
 1.0369422084411264,
 1.0481899604027833,
 0.44136889358612336]

In [115]:
# Smallest distance among the base documents.
min(distance_entre_documents_w2v)

0.44136889358612336

In [116]:
# Last path component of the base document with the smallest word mover's distance.
# The directory is listed again here, so the lookup relies on glob returning the files in the same order as when they were loaded.
glob.glob('../00Data/dataset_txt/Mecanica_fluids/*')[distance_entre_documents_w2v.index(min(distance_entre_documents_w2v))].split('/')[-1]

'Estudio_fluidodinamico_de_un_agitador_de_turbina'

In [117]:
# Mean distance over the base documents, leaving out the closest one.
# The closest document is located explicitly; dropping the last list element is
# only correct when the closest document happens to be listed last.
idx_min_wmd = distance_entre_documents_w2v.index(min(distance_entre_documents_w2v))
resta_distancies_wmd = (distance_entre_documents_w2v[:idx_min_wmd]
                        + distance_entre_documents_w2v[idx_min_wmd + 1:])
sum(resta_distancies_wmd)/len(resta_distancies_wmd)

1.0431158247361871

### Word2vec + Smooth inverse frequency + cosine similarity

In [118]:
# Join the filtered tokens of each suspicious sentence back into one string.
plag_filtered_document_sentence_no_tokenized = [
    ' '.join(tokens) for tokens in plag_filtered_document_sentence
]

In [119]:
# Join the filtered tokens of every base sentence back into one string,
# keeping one inner list per document.
base_filtered_document_sentence_no_tokenized = [
    [' '.join(tokens) for tokens in document_sentences]
    for document_sentences in base_filtered_document_sentence_global
]

In [120]:
def get_word_frequency(word_text):
    """Return the frequency used to weight *word_text*.

    The argument is ignored: the same constant is returned for every word.
    """
    constant_frequency = 0.0001
    return constant_frequency


In [121]:
# Weighted-average sentence vectors for the suspicious document: each word
# vector is scaled by a / (a + frequency(word)) before averaging.
plag_sif_sentences_vectorized = []
a: float = 1e-3
for sentence in plag_filtered_document_sentence_no_tokenized:
    tokens = nltk.word_tokenize(sentence)
    # Size the accumulator from the model instead of hard-coding 50.
    vs = np.zeros(w2v_model.wv.vector_size)
    for word in tokens:
        a_value = a / (a + get_word_frequency(word))
        vs = np.add(vs, np.multiply(a_value, w2v_model.wv.get_vector(word)))
    # Average over the number of words. Dividing by len(sentence) used the
    # character count of the string and gave NaN for an empty sentence; an
    # empty sentence now keeps the zero vector.
    if tokens:
        vs = np.divide(vs, len(tokens))
    plag_sif_sentences_vectorized.append(vs)

In [122]:
# Weighted-average sentence vectors for every base document: each word vector
# is scaled by a / (a + frequency(word)) before averaging.
base_sif_document_sentences_vectorized = []
a: float = 1e-3

for document in base_filtered_document_sentence_no_tokenized:
    lista_documents = []
    for sentence in document:
        tokens = nltk.word_tokenize(sentence)
        # Size the accumulator from the model instead of hard-coding 50.
        vs = np.zeros(w2v_model.wv.vector_size)
        for word in tokens:
            a_value = a / (a + get_word_frequency(word))
            vs = np.add(vs, np.multiply(a_value, w2v_model.wv.get_vector(word)))
        # Average over the number of words. Dividing by len(sentence) used the
        # character count of the string and gave NaN for an empty sentence; an
        # empty sentence now keeps the zero vector.
        if tokens:
            vs = np.divide(vs, len(tokens))
        lista_documents.append(vs)
    base_sif_document_sentences_vectorized.append(lista_documents)

In [125]:
# Wrap every sentence vector in a list so it becomes a 2-D (single-row) input
# for cosine_similarity.
plag_sif_sentences_vectorized_good_format = [
    [sentence_vector] for sentence_vector in plag_sif_sentences_vectorized
]

base_sif_document_sentences_vectorized_good_format = [
    [[sentence_vector] for sentence_vector in document_vectors]
    for document_vectors in base_sif_document_sentences_vectorized
]

In [128]:
#Computo similituds entre vectors
%%time
# Nesting of the result: base document -> sentence of the suspicious document
# -> similarity with every sentence of that base document.
similarities_between_vectors_base_plag_sif = [
    [
        [cosine_similarity(plag_vector, base_vector) for base_vector in base_vectors]
        for plag_vector in plag_sif_sentences_vectorized_good_format
    ]
    for base_vectors in base_sif_document_sentences_vectorized_good_format
]

CPU times: user 17.6 s, sys: 213 ms, total: 17.8 s
Wall time: 18.5 s


In [129]:
# best_similarities_sentence_document_sif holds, for every base document, the
# highest similarity reached by each sentence of the suspicious document.
best_similarities_sentence_document_sif = [
    [max(sentence_similarities) for sentence_similarities in document_similarities]
    for document_similarities in similarities_between_vectors_base_plag_sif
]

In [130]:
# Similarity index of every base document against the suspicious document:
# mean of the best per-sentence similarities.
index_entre_documents_w2v_sif = [
    sum(best_scores) / len(best_scores)
    for best_scores in best_similarities_sentence_document_sif
]

index_entre_documents_w2v_sif

[array([[0.48598111]]),
 array([[0.40634844]]),
 array([[0.31460501]]),
 array([[0.43441651]]),
 array([[0.46903542]]),
 array([[0.35735636]]),
 array([[0.48661694]]),
 array([[0.42736121]]),
 array([[0.46422189]]),
 array([[0.43200934]]),
 array([[0.41832141]]),
 array([[0.4368577]]),
 array([[0.52176605]]),
 array([[0.48403186]]),
 array([[0.43751297]]),
 array([[0.77198786]])]

In [131]:
# Document with the highest similarity: print the index value, then show the last path component of its file.
# The directory is listed again here, so the lookup relies on glob returning the files in the same order as when they were loaded.
print(max(index_entre_documents_w2v_sif))
glob.glob('../00Data/dataset_txt/Mecanica_fluids/*')[index_entre_documents_w2v_sif.index(max(index_entre_documents_w2v_sif))].split('/')[-1]


[[0.77198786]]


'Estudio_fluidodinamico_de_un_agitador_de_turbina'

In [132]:
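#With SIF weighting the results are practically the same as with the plain mean: get_word_frequency returns the same constant for every word, so every word gets the same weight.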

In [133]:
# Mean similarity index over the base documents, leaving out the most similar one.
# The best match is located explicitly; dropping the last list element is only
# correct when the best match happens to be listed last.
idx_max_sif = index_entre_documents_w2v_sif.index(max(index_entre_documents_w2v_sif))
resta_index_sif = (index_entre_documents_w2v_sif[:idx_max_sif]
                   + index_entre_documents_w2v_sif[idx_max_sif + 1:])
sum(resta_index_sif)/len(resta_index_sif)

array([[0.43842948]])

### UNIVERSAL SENTENCE ENCODER - MULTILINGUAL

In [134]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text #ERA EL QUE NECESSITAVA!!!!!

In [135]:
%%time
# Load the multilingual Universal Sentence Encoder (version 3) from TensorFlow Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

CPU times: user 2.21 s, sys: 723 ms, total: 2.93 s
Wall time: 3.51 s


In [136]:
# Embed the whole sentence list of each base document in a single call.
base_document_sentences_embeddings_use = [
    embed(document_sentences) for document_sentences in base_document_sentences
]
    

In [137]:
# Embed every sentence of the suspicious document at once; the result is a single matrix with one row per sentence (14x512 for this document).
plag_document_sentences_embeddings_use = embed(plag_document_sentences[0])

In [138]:
base_document_sentences_embeddings_use[0].shape

TensorShape([541, 512])

In [139]:
%%time
# Cosine similarity between the embedding matrix of each base document and the
# embedding matrix of the suspicious document; one result matrix per base
# document is collected in similarities_between_base_plag.
similarities_between_base_plag = [
    cosine_similarity(document_tensor, plag_document_sentences_embeddings_use)
    for document_tensor in base_document_sentences_embeddings_use
]
    

CPU times: user 37.7 ms, sys: 5.76 ms, total: 43.4 ms
Wall time: 23.1 ms


In [140]:
# Per base document: the column-wise maximum of its similarity matrix.
best_similarities_between_base_plag = [
    similarity_matrix.max(axis=0)
    for similarity_matrix in similarities_between_base_plag
]

In [141]:
# Similarity index per base document: each best similarity is divided by the
# number of values and the quotients are summed.
similarity_index_use_between_base_plag = [
    sum(best_scores / len(best_scores))
    for best_scores in best_similarities_between_base_plag
]

In [142]:
#Printar totes les similituds i la maxima tambe
similarity_index_use_between_base_plag

[0.5140185728669167,
 0.4290044233202934,
 0.36235099472105503,
 0.45256517082452774,
 0.5055232439190149,
 0.3982669096440077,
 0.4534240011125803,
 0.4882507435977459,
 0.503564853221178,
 0.4508955143392086,
 0.44092260859906673,
 0.48860074020922184,
 0.5004921369254589,
 0.49465914256870747,
 0.4727472700178623,
 0.7707100156694651]

In [143]:
# Print the highest similarity index, then show the last path component of the matching base document.
# The directory is listed again here, so the lookup relies on glob returning the files in the same order as when they were loaded.
print(max(similarity_index_use_between_base_plag))
glob.glob('../00Data/dataset_txt/Mecanica_fluids/*')[similarity_index_use_between_base_plag.index(max(similarity_index_use_between_base_plag))].split('/')[-1]

0.7707100156694651


'Estudio_fluidodinamico_de_un_agitador_de_turbina'

In [144]:
# Mean similarity index over the base documents, leaving out the most similar one.
# The best match is located explicitly; dropping the last list element is only
# correct when the best match happens to be listed last.
idx_max_use = similarity_index_use_between_base_plag.index(max(similarity_index_use_between_base_plag))
resta_index_use = (similarity_index_use_between_base_plag[:idx_max_use]
                   + similarity_index_use_between_base_plag[idx_max_use + 1:])
sum(resta_index_use)/len(resta_index_use)

0.463685755059123