# Тестирование модулей

In [None]:
%load_ext autoreload
%autoreload 2

import os

In [None]:
# Make sure the working directory ends in `recs/recs`; if it does not,
# move one level up and print the new location.
# The path is split on os.sep instead of a hard-coded backslash so the check
# also works outside Windows, and the tail is compared with a slice so a path
# with fewer than two components cannot raise IndexError.
# NOTE: path_list keeps the components of the directory seen *before* any chdir.
path_list = os.path.normpath(os.getcwd()).split(os.sep)
if path_list[-2:] != ['recs', 'recs']:
    os.chdir('..')
    print(os.getcwd())

In [None]:
# Display the path components computed in the previous cell
# (the list is not refreshed after os.chdir).
path_list

In [None]:
# Shared random seed; passed as `seed=SEED` to the FastText wrapper and the augmentations below.
SEED = 1

## Модуль `dataset`

In [None]:
import dataset

Загрузка DataFrame

In [None]:
# Load the video-games data through the project's `dataset` module and display it.
df_video_games = dataset.load_video_games()
df_video_games

Загрузка кастомных датасетов

In [None]:
# Build a StandartDataset for 'load_video_games', print it followed by an
# empty line, then print its first five items.
dataset_video_games = dataset.StandartDataset('load_video_games')
print(dataset_video_games, end='\n\n')

for i in range(5):
    print(dataset_video_games[i])

In [None]:
# Rebind dataset_video_games to a SentenceTransformerDataset for
# 'load_video_games', print it followed by an empty line, then print its
# first five items.
dataset_video_games = dataset.SentenceTransformerDataset('load_video_games')
print(dataset_video_games, end='\n\n')

for i in range(5):
    print(dataset_video_games[i])

## Модуль `preprocessing`

In [None]:
import preprocessing

In [None]:
# Create the text cleaner. remove_number=False presumably keeps digits in the
# text — confirm in preprocessing.BaseCleaner.
cleaner_base = preprocessing.BaseCleaner(remove_number=False)

In [None]:
%%time
# Run the cleaner over the raw `target` values and display the last five results.
clear_text = cleaner_base.transform(df_video_games.target.values)
clear_text[-5:]

## Модуль `models`

In [None]:
import models

#### TFIDF:

In [None]:
# TF-IDF wrapper model, constructed with no arguments.
model_tfidf_wrapper = models.TfidfWrapperModel()

In [None]:
%%time
# Fit the TF-IDF wrapper on the cleaned texts and keep the resulting embeddings;
# later cells pass them to the similarity searchers.
embedding_database_tfidf = model_tfidf_wrapper.fit_transform(clear_text)

# Print the shape, then display the embeddings themselves.
print(embedding_database_tfidf.shape)
embedding_database_tfidf

In [None]:
# %%time
# search_faiss_tfidf = similarity_search.FaissSearch(model_tfidf_wrapper, embedding_database_tfidf, df_video_games.target.values)
# search_faiss_tfidf.search('mario 9', 5)

In [None]:
# %%time
# search_chromadb_tfidf = similarity_search.ChromaDBSearch(model_tfidf_wrapper, embedding_database_tfidf, df_video_games.target.values)
# search_chromadb_tfidf.search('mario 9', 5)

In [None]:
# %%time
# validate = models.Validate(
#     searcher=search_faiss_tfidf,
#     augmentation_transforms=[
#         augmentation.MisspellingAugmentation(
#             add_syms={'p': 0.01, 'language': 'eng'},
#             change_syms={'p': 0.01, 'language': 'eng'},
#             delete_syms={'p': 0.01},
#             multiply_syms={'p': 0.01},
#             swap_syms={'p': 0.01},
#             seed=SEED,
#         ),
#     ],
#     accuracy_top=[1, 5, 10],
# )

![image.png](attachment:1431c801-4897-45eb-b87e-65ae2e7eef40.png)

In [None]:
# model_tfidf_wrapper.save('testing')

In [None]:
# model_tfidf_wrapper.load('testing')

#### FastText

In [None]:
# FastText wrapper model, seeded with the notebook-wide SEED.
# The keyword names coincide with gensim's FastText hyperparameters; presumably
# they are forwarded to it — confirm in models.FastTextWrapperModel.
model_fasttext_wrapper = models.FastTextWrapperModel(
    min_count=1,
    vector_size=200,
    window=2,
    sg=1,
    hs=1,
    epochs=70,
    min_n=0,
    seed=SEED,
)

In [None]:
%%time
# Fit the FastText wrapper on the cleaned texts and keep the resulting embeddings;
# later cells pass them to the similarity searchers.
embedding_database_fasttext = model_fasttext_wrapper.fit_transform(clear_text)

# Print the shape, then display the embeddings themselves.
print(embedding_database_fasttext.shape)
embedding_database_fasttext

#### SentenceTransformers

In [None]:
# model_transformer_wrapper = models.SentenceTransformerWrapperModel()

In [None]:
# model_transformer_wrapper.fit(clear_text)

In [None]:
# embedding_database_transformer = model_transformer_wrapper.transform(clear_text)

# print(embedding_database_transformer.shape)
# embedding_database_transformer

## Модуль `similarity_search`

In [None]:
import similarity_search

### ForCycleSearch

Для TfIdf:

In [None]:
%%time
# Build a ForCycleSearch from the TF-IDF model, its embeddings and the raw
# target values, then run the sample query 'mario 9' with 5 as the second
# argument (presumably the number of results — confirm in similarity_search).
search_cycle_tfidf = similarity_search.ForCycleSearch(
    model_tfidf_wrapper,
    embedding_database_tfidf,
    df_video_games.target.values,
)
search_cycle_tfidf.search('mario 9', 5)

Для FastText:

In [None]:
%%time
# Build a ForCycleSearch from the FastText model, its embeddings and the raw
# target values, then run the sample query 'mario 9' with 5 as the second
# argument (presumably the number of results — confirm in similarity_search).
search_cycle_fasttext = similarity_search.ForCycleSearch(
    model_fasttext_wrapper,
    embedding_database_fasttext,
    df_video_games.target.values,
)
search_cycle_fasttext.search('mario 9', 5)

Для SentenceTransformer:

In [None]:
# %%time
# search_cycle_transformer = similarity_search.ForCycleSearch(model_transformer_wrapper, embedding_database_transformer, df_video_games.target.values)
# search_cycle_transformer.search('mario 9', 5)

### TheFuzzSearch

In [None]:
%%time
# TheFuzzSearch is built from the raw target values only; no model or
# embeddings are passed. The same sample query is then run against it.
search_fuzz = similarity_search.TheFuzzSearch(df_video_games.target.values)
search_fuzz.search('mario 9', 5)

### NearestNeighbors

Для Fasttext

In [None]:
%%time
# Build a NearestNeighborsSearch over the FastText embeddings and run the
# sample query. The commented keyword arguments are optional overrides that
# are currently disabled.
search_knn_fasttext = similarity_search.NearestNeighborsSearch(
    model_fasttext_wrapper,
    embedding_database_fasttext,
    df_video_games.target.values,

    # algorithm='auto',
    # metric='minkowski',
)
search_knn_fasttext.search('mario 9', 5)

Для TF-IDF

In [None]:
%%time
# Build a NearestNeighborsSearch over the TF-IDF embeddings and run the
# sample query. The commented keyword arguments are optional overrides that
# are currently disabled.
search_knn_tfidf = similarity_search.NearestNeighborsSearch(
    model_tfidf_wrapper,
    embedding_database_tfidf,
    df_video_games.target.values,

    # algorithm='auto',
    # metric='cosine',
)
search_knn_tfidf.search('mario 9', 5)

### FaissSearch

Для Fasttext

In [None]:
%%time
# Build a FaissSearch from the FastText model, its embeddings and the raw
# target values, then run the sample query 'mario 9' with 5 as the second
# argument (presumably the number of results — confirm in similarity_search).
search_faiss_fasttext = similarity_search.FaissSearch(
    model_fasttext_wrapper,
    embedding_database_fasttext,
    df_video_games.target.values,
)
search_faiss_fasttext.search('mario 9', 5)

Для TF-IDF

In [None]:
%%time
# Build a FaissSearch from the TF-IDF model, its embeddings and the raw
# target values, then run the sample query 'mario 9' with 5 as the second
# argument (presumably the number of results — confirm in similarity_search).
search_faiss_tfidf = similarity_search.FaissSearch(
    model_tfidf_wrapper,
    embedding_database_tfidf,
    df_video_games.target.values,
)
search_faiss_tfidf.search('mario 9', 5)

### ChromaDB

Для FastText:

In [None]:
%%time
# Build a ChromaDBSearch from the FastText model, its embeddings and the raw
# target values. Construction is timed in its own cell, separately from the query.
search_chromadb_fasttext = similarity_search.ChromaDBSearch(
    model_fasttext_wrapper,
    embedding_database_fasttext,
    df_video_games.target.values,
)

In [None]:
%%time
# Run the sample query against the FastText ChromaDBSearch; timed separately
# from its construction.
search_chromadb_fasttext.search('mario 9', 5)

Для Tf-Idf

In [None]:
%%time
# Build a ChromaDBSearch from the TF-IDF model, its embeddings and the raw
# target values. Construction is timed in its own cell, separately from the query.
search_chromadb_tfidf = similarity_search.ChromaDBSearch(
    model_tfidf_wrapper,
    embedding_database_tfidf,
    df_video_games.target.values,
)

In [None]:
%%time
# Run the sample query against the TF-IDF ChromaDBSearch; timed separately
# from its construction.
search_chromadb_tfidf.search('mario 9', 5)

## Модуль `augmentation`

In [None]:
import augmentation

Создание ошибок в словах

In [None]:
%%time
# Misspelling augmentation: each of the five symbol-level operations is
# configured with p=0.01 (presumably a per-symbol probability — confirm in
# augmentation.MisspellingAugmentation); add/change also name the language.
# Seeded with the notebook-wide SEED.
misspelling_augmentation = augmentation.MisspellingAugmentation(
    add_syms={'p': 0.01, 'language': 'eng'},
    change_syms={'p': 0.01, 'language': 'eng'},
    delete_syms={'p': 0.01},
    multiply_syms={'p': 0.01},
    swap_syms={'p': 0.01},
    seed=SEED,
)
# Apply it to the raw target values and display the result.
misspelling_augmentation.transform(df_video_games.target.values)

In [None]:
%%time
# Structure augmentation, first configuration: word deletion (p=0.1) and word
# shuffling enabled, abbreviation disabled. Seeded with the notebook-wide SEED.
structure_augmentation = augmentation.StructureAugmentation(
    delete_words={'p': 0.1},
    get_abbreviation=False,
    shuffle_words=True,
    seed=SEED,
)
# Apply it to the raw target values and display the result.
structure_augmentation.transform(df_video_games.target.values)

In [None]:
%%time
# Structure augmentation, second configuration: only abbreviation enabled.
# This rebinds `structure_augmentation`, replacing the object from the previous cell.
structure_augmentation = augmentation.StructureAugmentation(
    delete_words=False,
    get_abbreviation=True,
    shuffle_words=False,
    seed=SEED,
)
# Apply it to the raw target values and display the result.
structure_augmentation.transform(df_video_games.target.values)

Датасет с аугментацией

In [None]:
# Rebind dataset_video_games to a SentenceTransformerDataset that also
# receives the misspelling augmentation, print it followed by an empty line,
# then print its first five items.
dataset_video_games = dataset.SentenceTransformerDataset(
    'load_video_games',
    [misspelling_augmentation],
)
print(dataset_video_games, end='\n\n')

for i in range(5):
    print(dataset_video_games[i])

## Реализация `Validate` моделей

In [None]:
%%time
# Validate the TF-IDF ChromaDB searcher with a misspelling augmentation,
# requesting accuracy_top values 1, 5 and 10.
# A fresh MisspellingAugmentation is constructed here with the same arguments
# as `misspelling_augmentation` instead of reusing that object.
validate = models.Validate(
    searcher=search_chromadb_tfidf,
    augmentation_transforms=[
        augmentation.MisspellingAugmentation(
            add_syms={'p': 0.01, 'language': 'eng'},
            change_syms={'p': 0.01, 'language': 'eng'},
            delete_syms={'p': 0.01},
            multiply_syms={'p': 0.01},
            swap_syms={'p': 0.01},
            seed=SEED,
        ),
    ],
    accuracy_top=[1, 5, 10],
)

search_knn_fasttext

<img src='attachment:628a2d71-6f8d-4add-b36f-94a3bff4135e.png' width=600>

search_fuzz

<img src='attachment:727fe0d9-a83f-4346-b28d-caa53036e141.png' width=600>

search_cycle_fasttext

<img src='attachment:0633326a-a5ff-40d1-92f4-e6763e7a5f27.png' width=600>

search_faiss_fasttext

<img src='attachment:5c95c56c-d91f-474d-a49b-56c00d380b90.png' width=600>

search_faiss_tfidf

<img src='attachment:3e07db42-d618-4ea1-96bd-8ba76cb9f2bd.png' width=600>

search_chromadb_fasttext

<img src='attachment:6165b204-ca73-4cf1-b8f1-d6dcc99a0880.png' width=600>

search_chromadb_tfidf

<img src='attachment:cfe7b499-e847-4255-8f65-ef21874f1cc8.png' width=600>

## `TEMP` code: