# Тестирование модулей

In [None]:
cd ..

In [None]:
# Seed passed as `seed=` to the augmentation objects created further below.
SEED = 1

## Модуль `dataset`

In [None]:
import dataset

Загрузка DataFrame

In [None]:
# Load the video-games data; its `target` column is what the later cells
# clean, embed and search over.
df_video_games = dataset.load_video_games()
# Bare last expression: shown with the notebook's rich display.
df_video_games

Загрузка кастомных датасетов

In [None]:
# Build the dataset wrapper, addressing the loader by its name.
dataset_video_games = dataset.StandartDataset('load_video_games')

# Summary followed by exactly one empty line.
print(dataset_video_games, end='\n\n')

# Preview the items at indices 0..4.
for i in range(5):
    print(dataset_video_games[i])

In [None]:
# Same loader name, wrapped in the SentenceTransformer-oriented dataset class.
# NOTE: this rebinds `dataset_video_games` from the previous cell.
dataset_video_games = dataset.SentenceTransformerDataset('load_video_games')

# Summary followed by exactly one empty line.
print(dataset_video_games, end='\n\n')

# Preview the items at indices 0..4.
for i in range(5):
    print(dataset_video_games[i])

## Модуль `preprocessing`

In [None]:
import preprocessing

In [None]:
# Cleaner whose output (`clear_text`) feeds the embedding models below.
# NOTE(review): `remove_number=False` presumably keeps digits in the text (the
# demo query 'mario 9' contains one) — confirm against BaseCleaner.
cleaner_base = preprocessing.BaseCleaner(remove_number=False)

In [None]:
%%time
# Run the cleaner over every value of the `target` column (the cell is timed).
clear_text = cleaner_base.transform(df_video_games.target.values)
# Show only the last five cleaned entries as a quick sanity check.
clear_text[-5:]

## Модуль `models`

In [None]:
import models

#### TF-IDF

In [None]:
# TF-IDF model wrapper, constructed with its default arguments.
model_tfidf_wrapper = models.TfidfWrapperModel()

In [None]:
%%time
# Call fit_transform on the cleaned texts; the result is used below as the
# embedding database of the TF-IDF based searches.
embedding_database_tfidf = model_tfidf_wrapper.fit_transform(clear_text)

# Print the shape, then display the embeddings themselves.
print(embedding_database_tfidf.shape)
embedding_database_tfidf

In [None]:
# Disabled cell — uncomment to run.
# NOTE(review): presumably stores the fitted wrapper under the name 'testing' — confirm.
# model_tfidf_wrapper.save('testing')

In [None]:
# Disabled cell — uncomment to run.
# NOTE(review): presumably restores the wrapper stored under the name 'testing' — confirm.
# model_tfidf_wrapper.load('testing')

#### FastText

In [None]:
# FastText model wrapper; all hyper-parameters are passed by keyword.
model_fasttext_wrapper = models.FastTextWrapperModel(min_count=3, vector_size=100, window=3, sg=1)

In [None]:
%%time
# Call fit_transform on the cleaned texts; the result is used below as the
# embedding database of the FastText based searches.
embedding_database_fasttext = model_fasttext_wrapper.fit_transform(clear_text)

# Print the shape, then display the embeddings themselves.
print(embedding_database_fasttext.shape)
embedding_database_fasttext

#### SentenceTransformers

In [None]:
# Disabled cell — uncomment to create the SentenceTransformer wrapper.
# model_transformer_wrapper = models.SentenceTransformerWrapperModel()

In [None]:
# Disabled cell — requires `model_transformer_wrapper` from the disabled cell above.
# model_transformer_wrapper.fit(clear_text)

In [None]:
# Disabled cell — requires the fitted `model_transformer_wrapper`.
# embedding_database_transformer = model_transformer_wrapper.transform(clear_text)

# print(embedding_database_transformer.shape)
# embedding_database_transformer

## Модуль `similarity_search`

In [None]:
import similarity_search

### ForCycleSearch

Для TF-IDF:

In [None]:
%%time
search_cycle_tfidf = similarity_search.ForCycleSearch(model_tfidf_wrapper, embedding_database_tfidf, df_video_games.target.values)
search_cycle_tfidf.search('mario 9', 5)

Для FastText:

In [None]:
%%time
search_cycle_fasttext = similarity_search.ForCycleSearch(model_fasttext_wrapper, embedding_database_fasttext, df_video_games.target.values)
search_cycle_fasttext.search('mario 9', 5)

Для SentenceTransformer:

In [None]:
# Disabled cell — requires `model_transformer_wrapper` and
# `embedding_database_transformer` from the disabled SentenceTransformers cells.
# %%time
# search_cycle_transformer = similarity_search.ForCycleSearch(model_transformer_wrapper, embedding_database_transformer, df_video_games.target.values)
# search_cycle_transformer.search('mario 9', 5)

### TheFuzzSearch

In [None]:
%%time
# Unlike the other search classes in this notebook, TheFuzzSearch is built from
# the raw `target` values only — no model or embedding database is passed.
search_fuzz = similarity_search.TheFuzzSearch(df_video_games.target.values)
# Same demo query as in the other search cells.
search_fuzz.search('mario 9', 5)

### FaissSearch

Для FastText:

In [None]:
%%time
search_faiss_fasttext = similarity_search.FaissSearch(model_fasttext_wrapper, embedding_database_fasttext, df_video_games.target.values)
search_faiss_fasttext.search('mario 9', 5)

Для TF-IDF:

In [None]:
%%time
search_faiss_tfidf = similarity_search.FaissSearch(model_tfidf_wrapper, embedding_database_tfidf, df_video_games.target.values)
search_faiss_tfidf.search('mario 9', 5)

## Модуль `augmentation`

In [None]:
import augmentation

Создание ошибок в словах

In [None]:
%%time
misspelling_augmentation = augmentation.MisspellingAugmentation(
    add_syms={'p': 0.05},
    change_syms={'p': 0.05},
    delete_syms={'p': 0.05},
    multiply_syms={'p': 0.05},
    swap_syms={'p': 0.05},
    seed=SEED,
)
misspelling_augmentation.transform(df_video_games.target.values)

In [None]:
%%time
# Structure-level augmentation, first configuration: word deletion with
# {'p': 0.05} and word shuffling switched on, abbreviation switched off.
# NOTE(review): the following cell rebinds `structure_augmentation` with a
# different configuration.
structure_augmentation = augmentation.StructureAugmentation(
    delete_words={'p': 0.05},
    get_abbreviation=False,
    shuffle_words=True,
    seed=SEED,
)
# Apply to the raw `target` values and display the result.
structure_augmentation.transform(df_video_games.target.values)

In [None]:
%%time
# Structure-level augmentation, second configuration: only abbreviation is
# switched on; word deletion and shuffling are off.
# NOTE(review): this overwrites the `structure_augmentation` object created in
# the preceding cell.
structure_augmentation = augmentation.StructureAugmentation(
    delete_words=False,
    get_abbreviation=True,
    shuffle_words=False,
    seed=SEED,
)
# Apply to the raw `target` values and display the result.
structure_augmentation.transform(df_video_games.target.values)

Датасет с аугментацией

In [None]:
# Dataset built with a list of augmentations as the second argument; here it
# contains only the misspelling augmentation configured earlier.
dataset_video_games = dataset.SentenceTransformerDataset(
    'load_video_games',
    [misspelling_augmentation],
)

# Summary followed by exactly one empty line.
print(dataset_video_games, end='\n\n')

# Preview the items at indices 0..4.
for i in range(5):
    print(dataset_video_games[i])

## Реализация `Validate` моделей

In [None]:
%%time
validate = models.Validate(
    search_cycle_tfidf,
    [
        augmentation.MisspellingAugmentation(
            add_syms={'p': 0.04},
            change_syms={'p': 0.04},
            delete_syms={'p': 0.04},
            multiply_syms={'p': 0.04},
            swap_syms={'p': 0.04},
            seed=SEED,
        )
    ]
)

TheFuzz

<img src='attachment:af73c173-1c5f-49ef-8911-7e4fab1e97eb.png' width=600>

С Faiss FastText

<img src='attachment:18380537-57a3-4f5f-95e3-445cb3a0bceb.png' width=600>

С Faiss TF-IDF

<img src='attachment:1e309eb7-a9cb-4e38-b3a9-308224405dc5.png' width=600>

С TF-IDF

<img src='attachment:2c6f6607-8f5b-4b1d-927f-7bf8bfdc2373.png' width=600>

## `TEMP` code: