In [1]:
%load_ext autoreload
%autoreload 2
import os
# Switch the working directory two levels up; presumably this is the
# repository root, so the project packages imported in later cells resolve
# (verify against the repo layout).
# The path is built with os.path.join/os.pardir because a hard-coded
# backslash separator is only understood on Windows.
# NOTE: not idempotent - re-running this cell climbs two more levels.
os.chdir(os.path.join(os.pardir, os.pardir))

In [2]:
from experiments.index_investigation.investigator import IndexInvestigator, IndexInvestigationResultDTO
from mirage.index.MirageIndex import MirageIndex
from faiss import IndexFlatL2


In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from sklearn.manifold import TSNE

def plot_vectors_with_labels(vectors, labels, method='tsne', random_state=42):
    """
    Project vectors to 2-D and draw a label-coloured scatter plot.

    The vectors are standardised, reduced to two components with t-SNE or
    PCA, and plotted with seaborn using the 'magma' palette. The figure is
    displayed with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    vectors : numpy.ndarray or array-like
        Matrix of vectors to visualise, shape (n_samples, n_features).
    labels : numpy.ndarray or array-like
        Label for each vector, shape (n_samples,).
    method : str, optional ('tsne' or 'pca')
        Dimensionality-reduction method (default 'tsne').
    random_state : int, optional
        Seed for reproducibility (default 42).

    Raises
    ------
    ValueError
        If ``vectors`` and ``labels`` differ in length, or if ``method`` is
        neither 'tsne' nor 'pca'.
    """
    # Validate the cheap preconditions first so a typo in `method` fails
    # immediately instead of after scaling the whole matrix.
    if len(vectors) != len(labels):
        raise ValueError("Количество векторов и меток должно совпадать")
    if method not in ('tsne', 'pca'):
        raise ValueError("Метод должен быть 'tsne' или 'pca'")

    # Local import keeps the function usable even when pandas has not been
    # imported at notebook level yet.
    import pandas as pd

    # Standardise the features before reducing dimensionality.
    n_samples = len(vectors)
    vectors_scaled = StandardScaler().fit_transform(vectors)

    # Dimensionality reduction.
    if method == 'tsne':
        # scikit-learn requires perplexity < n_samples, so cap it for small
        # inputs; with more than 30 samples the value stays at 30.
        perplexity = min(30, max(n_samples - 1, 1))
        # The iteration count is left at the library default (1000): the
        # keyword was renamed from n_iter to max_iter in scikit-learn 1.5,
        # so passing either name explicitly breaks on some versions.
        reducer = TSNE(
            n_components=2,
            random_state=random_state,
            perplexity=perplexity,
        )
    else:
        reducer = PCA(n_components=2, random_state=random_state)
    reduced_vectors = reducer.fit_transform(vectors_scaled)

    # Long-form frame for seaborn: one row per point.
    df = pd.DataFrame({
        'x': reduced_vectors[:, 0],
        'y': reduced_vectors[:, 1],
        'label': labels
    })

    # Style and figure setup.
    sns.set(style="whitegrid", font_scale=1.2)
    fig, ax = plt.subplots(figsize=(10, 8))

    # Scatter plot coloured by label.
    sns.scatterplot(
        data=df,
        x='x',
        y='y',
        hue='label',
        palette='magma',
        s=70,
        alpha=0.8,
        edgecolor='w',
        linewidth=0.5,
        ax=ax
    )

    # Title and axis labels.
    method_name = 't-SNE' if method == 'tsne' else 'PCA'
    ax.set_title(f'Визуализация меток с помощью {method_name}', pad=20)
    ax.set_xlabel('Компонента 1')
    ax.set_ylabel('Компонента 2')

    # Rebuild the legend and give it a readable title.
    legend = ax.legend()
    legend.set_title('Метки')

    # Final cosmetics, then render.
    sns.despine(fig=fig)
    fig.tight_layout()
    plt.show()

In [4]:
# plot_vectors_with_labels(
#     vectors=investigator.vector_matrix, 
#     labels=investigator.document_labels
# )

In [8]:
import pandas as pd
from tqdm import tqdm

# Directory holding the serialized index files to benchmark.
path = 'E:/indexes1'
# os.listdir returns entries in arbitrary order; sort them so the row order
# of `resdf` is reproducible across runs and machines.
files = [f'{path}/{name}' for name in sorted(os.listdir(path))]
results = []
for file in tqdm(files):
    # Load one index and run the investigator over it.
    mirage_index = MirageIndex.load(file)
    investigator = IndexInvestigator(mirage_index)
    result: IndexInvestigationResultDTO = investigator.process()
    # Record which index file produced this row.
    result.file = file
    results.append(result.model_dump())

# One row per index file, built once from the list of dumped results.
resdf = pd.DataFrame(results)

100%|██████████| 240/240 [40:41<00:00, 10.17s/it] 


In [9]:
# Display the collected per-index metrics; the rendered output below is
# truncated by pandas (first and last rows only).
resdf

Unnamed: 0,file,vector_variance,silhouette_score_euclidian,silhouette_score_cosine,EID,redundancy,LOFs,mean_LOF,std_LOF
0,E:/indexes1/S_16_0.1_BAAI_ip.mirage_index,0.404626,0.094661,0.168449,2.000000,0.001953,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.996198,0.087121
1,E:/indexes1/S_16_0.1_BAAI_l2.mirage_index,0.404626,0.094661,0.168449,2.000000,0.001953,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.996198,0.087121
2,E:/indexes1/S_16_0.1_DeepPavlov_ip.mirage_index,68.276460,0.126166,0.198874,1.996205,0.002599,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.969582,0.244768
3,E:/indexes1/S_16_0.1_DeepPavlov_l2.mirage_index,68.276460,0.126166,0.198874,1.996205,0.002599,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.969582,0.244768
4,E:/indexes1/S_16_0.1_intfloat_ip.mirage_index,0.081222,0.011460,0.017142,1.992410,0.005189,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.996190,0.087204
...,...,...,...,...,...,...,...,...,...
235,E:/indexes1/W_512_0.5_DeepPavlov_l2.mirage_index,67.057603,0.129361,0.201328,1.998325,0.002602,"[1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",0.979883,0.199575
236,E:/indexes1/W_512_0.5_intfloat_ip.mirage_index,0.078214,0.008574,0.014493,2.000000,0.005208,"[1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",0.991618,0.129206
237,E:/indexes1/W_512_0.5_intfloat_l2.mirage_index,0.078214,0.008574,0.014493,2.000000,0.005208,"[1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",0.991618,0.129206
238,E:/indexes1/W_512_0.5_thenlper_ip.mirage_index,0.125572,0.000081,-0.001615,2.000000,0.002604,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.000000,0.000000


In [10]:
# Persist the benchmark table. to_pickle does not create missing folders,
# so make sure the target directory exists first; otherwise the results of
# the long benchmark run above could be lost at save time.
os.makedirs('E:/indres', exist_ok=True)
resdf.to_pickle('E:/indres/bench1.pkl')