# Тестирование модулей

In [None]:
# Shell escape running a bare `cd`.
# NOTE(review): on Windows cmd this prints the current directory; in a POSIX
# shell it prints nothing and does not affect the kernel — confirm target OS.
!cd

In [None]:
%load_ext autoreload
%autoreload 2

import os

In [None]:
# When the current directory has no `notebooks` entry, step one level up and
# print the new working directory (presumably so that the notebook runs from
# the repository root — confirm against the project layout).
if 'notebooks' not in os.listdir():
    os.chdir('..')
    print(os.getcwd())

In [None]:
# Shared random seed: passed as `seed=SEED` to the FastText wrapper and to
# every augmentation transform created further down in this notebook.
SEED = 1

# Модуль `dataset`

In [None]:
from recs_searcher import dataset

Загрузка DataFrame

In [None]:
# Load the bundled video-games data; later cells read its `target` column
# and add the `clear` / `clear_norm` columns to it.
df_video_games = dataset.load_video_games()
# Bare expression as the last line so the notebook renders the result.
df_video_games

Загрузка кастомных датасетов

In [None]:
# Wrap the raw `target` strings in a StandartDataset, print the dataset
# object followed by a blank separator line, then show the first five items.
dataset_video_games = dataset.StandartDataset(df_video_games.target.values)
print(dataset_video_games, end='\n\n')

for i in range(5):
    print(dataset_video_games[i])

In [None]:
# Same smoke test for SentenceTransformerDataset: build it from the raw
# `target` strings, print the dataset object plus a blank separator line,
# then show the first five items.
# NOTE: this rebinds `dataset_video_games` from the previous cell.
dataset_video_games = dataset.SentenceTransformerDataset(df_video_games.target.values)
print(dataset_video_games, end='\n\n')

for i in range(5):
    print(dataset_video_games[i])

# Модуль `preprocessing`

In [None]:
from recs_searcher import preprocessing

In [None]:
# Cleaner + normalizer pair collected into one list, configured the same way
# as the individual cleaner / English normalizer instances created below.
# NOTE(review): `preprocessing_list` is not referenced by any later cell in
# the visible part of this notebook — confirm whether it is still needed.
preprocessing_list = [
    preprocessing.BaseCleaner(remove_number=False),
    preprocessing.BaseNormalizer(
        'english',
        remove_stopwords=True,
        number_extract=True,
        lemmatize=True,
    ),
]

#### Очистка текста

In [None]:
%%time
# Run BaseCleaner over the raw `target` strings and store the result in a new
# `clear` column; that column feeds the normalizer and the TF-IDF / FastText
# models in later cells.
cleaner_base = preprocessing.BaseCleaner(remove_number=False)

df_video_games['clear'] = cleaner_base.transform(df_video_games.target.values)
# Last bare expression: render the updated frame.
df_video_games

#### Нормализация текста

In [None]:
# %%time
# normilizer = preprocessing.BaseNormalizer(
#     'english',
#     remove_stopwords=True,
#     number_extract=True,
#     lemmatize=True,
# )

# df_video_games['norm'] = normilizer.transform(df_video_games.target.values)
# df_video_games

#### Чистка + нормализация

In [None]:
%%time
# English normalizer applied on top of the already cleaned text (`clear`),
# i.e. cleaning followed by normalization; the result goes to `clear_norm`.
# NOTE(review): `clear_norm` is not consumed by any later visible cell — the
# models below are fitted on `clear`.
normilizer = preprocessing.BaseNormalizer(
    'english',
    remove_stopwords=True,
    number_extract=True,
    lemmatize=True,
)

df_video_games['clear_norm'] = normilizer.transform(df_video_games.clear.values)
# Last bare expression: render the updated frame.
df_video_games

#### Кастомные тесты нормализации с числами

In [None]:
%%time
# Ad-hoc check of the Russian normalizer with number_extract=True.
# NOTE: rebinds `normilizer` from the previous cell.
normilizer = preprocessing.BaseNormalizer(
    'russian',
    remove_stopwords=True,
    number_extract=True,
    lemmatize=True,
)

# Sample sentence whose numbers are written out as words (a year and a
# temperature) so the effect of number extraction is visible in the output.
rus_text = ['В две тысячи семьдесят седьмом году была очень хорошая погода. Примерно + двадцать пять градусов было днём!']
# Last bare expression: render the transformed text.
normilizer.transform(rus_text)

In [None]:
%%time
# Ad-hoc check of the English normalizer with number_extract=True.
# NOTE: rebinds `normilizer` from the previous cell.
normilizer = preprocessing.BaseNormalizer(
    'english',
    remove_stopwords=True,
    number_extract=True,
    lemmatize=True,
)

# Sample sentence whose numbers are written out as words (a year and a
# temperature) so the effect of number extraction is visible in the output.
eng_text = ['In the year two thousand seventy seven, the weather was very good. It was about + twenty five degrees in the afternoon!']
# Last bare expression: render the transformed text.
normilizer.transform(eng_text)

# Модуль `models`

In [None]:
from recs_searcher import models

#### TFIDF:

In [None]:
# TF-IDF wrapper created with its default constructor arguments; it is fitted
# in the next cell and later handed to the search classes.
model_tfidf_wrapper = models.TfidfWrapperModel()

In [None]:
%%time
# Fit the TF-IDF wrapper on the cleaned text and keep the resulting
# embeddings; they serve as the search database in the similarity cells.
embedding_database_tfidf = model_tfidf_wrapper.fit_transform(df_video_games.clear.values)

print(embedding_database_tfidf.shape)
# Last bare expression: render the embeddings object.
embedding_database_tfidf

In [None]:
# model_tfidf_wrapper.save('testing')

In [None]:
# model_tfidf_wrapper.load('testing')

#### FastText

In [None]:
# FastText wrapper configuration, seeded with the notebook-wide SEED.
# NOTE(review): the keyword names (min_count, vector_size, window, sg, hs,
# epochs, min_n) look like gensim FastText options and are presumably
# forwarded by the wrapper — confirm against FastTextWrapperModel.
model_fasttext_wrapper = models.FastTextWrapperModel(
    min_count=1,
    vector_size=200,
    window=2,
    sg=1,
    hs=1,
    epochs=70,
    min_n=0,
    seed=SEED,
)

In [None]:
%%time
# Fit the FastText wrapper on the cleaned text and keep the resulting
# embeddings; they serve as the search database in the similarity cells.
embedding_database_fasttext = model_fasttext_wrapper.fit_transform(df_video_games.clear.values)

print(embedding_database_fasttext.shape)
# Last bare expression: render the embeddings object.
embedding_database_fasttext

#### SentenceTransformers

In [None]:
# model_transformer_wrapper = models.SentenceTransformerWrapperModel()

In [None]:
# model_transformer_wrapper.fit(df_video_games.clear.values)

In [None]:
# embedding_database_transformer = model_transformer_wrapper.transform(df_video_games.clear.values)

# print(embedding_database_transformer.shape)
# embedding_database_transformer

# Модуль `similarity_search`

In [None]:
from recs_searcher import similarity_search

Для SentenceTransformer:

In [None]:
# %%time
# search_cycle_transformer = similarity_search.ForCycleSearch(model_transformer_wrapper, embedding_database_transformer, df_video_games.target.values)
# search_cycle_transformer.search('mario 9', 5)

### TheFuzzSearch

In [None]:
%%time
# TheFuzzSearch is built from the raw `target` strings only — no model and no
# embedding database are passed, unlike the other searchers below.
search_fuzz = similarity_search.TheFuzzSearch(df_video_games.target.values)
# Query used by every searcher in this notebook; the second argument is
# presumably the number of results to return — confirm against `search`.
search_fuzz.search('mario 9', 5)

### NearestNeighbors

Для FastText

In [None]:
%%time
# Nearest-neighbours searcher over the FastText embeddings. Arguments are the
# fitted model, its embedding database and the original strings.
# `search_knn_fasttext` is reused by the Validate cell at the end.
search_knn_fasttext = similarity_search.NearestNeighborsSearch(
    model_fasttext_wrapper,
    embedding_database_fasttext,
    df_video_games.target.values,

    # algorithm='auto',
    # metric='minkowski',
)
# Second argument is presumably the number of results — confirm.
search_knn_fasttext.search('mario 9', 5)

Для TF-IDF

In [None]:
%%time
# Nearest-neighbours searcher over the TF-IDF embeddings. Arguments are the
# fitted model, its embedding database and the original strings.
search_knn_tfidf = similarity_search.NearestNeighborsSearch(
    model_tfidf_wrapper,
    embedding_database_tfidf,
    df_video_games.target.values,

    # algorithm='auto',
    # metric='cosine',
)
# Second argument is presumably the number of results — confirm.
search_knn_tfidf.search('mario 9', 5)

### FaissSearch

Для FastText

In [None]:
%%time
# FaissSearch over the FastText embeddings: fitted model, embedding database
# and the original strings, in that order.
search_faiss_fasttext = similarity_search.FaissSearch(model_fasttext_wrapper, embedding_database_fasttext, df_video_games.target.values)
# Second argument is presumably the number of results — confirm.
search_faiss_fasttext.search('mario 9', 5)

Для TF-IDF

In [None]:
%%time
# FaissSearch over the TF-IDF embeddings: fitted model, embedding database
# and the original strings, in that order.
search_faiss_tfidf = similarity_search.FaissSearch(model_tfidf_wrapper, embedding_database_tfidf, df_video_games.target.values)
# Second argument is presumably the number of results — confirm.
search_faiss_tfidf.search('mario 9', 5)

### ChromaDB

Для FastText:

In [None]:
%%time
# Build the ChromaDB searcher over the FastText embeddings. Construction is
# timed in its own cell, separately from the query in the next cell.
search_chromadb_fasttext = similarity_search.ChromaDBSearch(model_fasttext_wrapper, embedding_database_fasttext, df_video_games.target.values)

In [None]:
%%time
# Timed query against the ChromaDB searcher built in the previous cell; the
# second argument is presumably the number of results — confirm.
search_chromadb_fasttext.search('mario 9', 5)

Для TF-IDF

In [None]:
%%time
# Build the ChromaDB searcher over the TF-IDF embeddings. Construction is
# timed in its own cell, separately from the query in the next cell.
search_chromadb_tfidf = similarity_search.ChromaDBSearch(model_tfidf_wrapper, embedding_database_tfidf, df_video_games.target.values)

In [None]:
%%time
# Timed query against the ChromaDB searcher built in the previous cell; the
# second argument is presumably the number of results — confirm.
search_chromadb_tfidf.search('mario 9', 5)

# Модуль `augmentation`

In [None]:
from recs_searcher import augmentation

Создание ошибок в словах

In [None]:
%%time
# Character-level augmentation: each enabled operation (add / change /
# delete / multiply / swap symbols) gets its own settings dict with `p`.
# NOTE(review): `p` is presumably a per-symbol probability — confirm.
# `misspelling_augmentation` is reused by the augmented-dataset cell below.
misspelling_augmentation = augmentation.MisspellingAugmentation(
    add_syms={'p': 0.01, 'language': 'english'},
    change_syms={'p': 0.01, 'language': 'english'},
    delete_syms={'p': 0.01},
    multiply_syms={'p': 0.01},
    swap_syms={'p': 0.01},
    seed=SEED,
)
# Last bare expression: render the augmented strings.
misspelling_augmentation.transform(df_video_games.target.values)

In [None]:
%%time
# Word-level augmentation, variant 1: word deletion (settings dict with `p`)
# and word shuffling enabled, abbreviation disabled.
structure_augmentation = augmentation.StructureAugmentation(
    delete_words={'p': 0.1},
    get_abbreviation=False,
    shuffle_words=True,
    seed=SEED,
)
# Last bare expression: render the augmented strings.
structure_augmentation.transform(df_video_games.target.values)

In [None]:
%%time
# Word-level augmentation, variant 2: only abbreviation enabled; deletion and
# shuffling are switched off. Rebinds `structure_augmentation`.
structure_augmentation = augmentation.StructureAugmentation(
    delete_words=False,
    get_abbreviation=True,
    shuffle_words=False,
    seed=SEED,
)
# Last bare expression: render the augmented strings.
structure_augmentation.transform(df_video_games.target.values)

Датасет с аугментацией

In [None]:
# SentenceTransformerDataset combined with the misspelling augmentation:
# print the dataset object, a blank separator line, then the first five items.
# NOTE(review): the first argument here is the string 'load_video_games',
# whereas the earlier dataset cells pass `df_video_games.target.values` —
# confirm that the constructor accepts a dataset name.
dataset_video_games = dataset.SentenceTransformerDataset('load_video_games', [misspelling_augmentation])
print(dataset_video_games, end='\n\n')

for i in range(5):
    print(dataset_video_games[i])

## Реализация `Validate` моделей

In [None]:
%%time
# Validate the FastText nearest-neighbours searcher using a misspelling
# augmentation configured exactly like `misspelling_augmentation` above.
# NOTE(review): `accuracy_top` presumably lists the top-k cut-offs at which
# accuracy is reported — confirm against models.Validate.
# NOTE(review): `metrics` is assigned but never displayed in the visible part
# of this notebook.
metrics = models.Validate(
    searcher=search_knn_fasttext,
    augmentation_transforms=[
        augmentation.MisspellingAugmentation(
            add_syms={'p': 0.01, 'language': 'english'},
            change_syms={'p': 0.01, 'language': 'english'},
            delete_syms={'p': 0.01},
            multiply_syms={'p': 0.01},
            swap_syms={'p': 0.01},
            seed=SEED,
        ),
    ],
    accuracy_top=[1, 5, 10],
)