# Пример использования API библиотеки `recs_searcher`

In [1]:
%load_ext autoreload
%autoreload 2

import os

In [2]:
# Notebook-wide random seed; passed to the FastText model and to the validation augmentations.
SEED = 1

## Импорт модулей

In [3]:
from recs_searcher import (
    dataset,
    preprocessing,
    models,
    similarity_search,
    augmentation,
)

#### Настройки для pipeline:

In [4]:
# Load the two example datasets exposed by the `dataset` module.
# Their `target` column is what the pipelines in this notebook are built from.
dataset_phones = dataset.load_mobile_phones()
dataset_games = dataset.load_video_games()

# spaCy model name shared by the stop-word and lemmatisation steps.
SPACY_MODEL_NAME = 'en_core_web_md'

# Rule-based text cleanup (presumably applied in list order by the pipeline).
# NOTE(review): RemovePunct runs before RemoveHTML / RemoveURL; if those steps rely on
# characters such as '<', '>' or '://' to detect markup, they may never match - confirm.
_text_cleanup_steps = [
    preprocessing.TextLower(),
    preprocessing.RemovePunct(),
    preprocessing.RemoveNumber(),
    preprocessing.RemoveWhitespace(),
    preprocessing.RemoveHTML(),
    preprocessing.RemoveURL(),
    preprocessing.RemoveEmoji(),
]

# spaCy-backed steps come after the rule-based cleanup.
_spacy_steps = [
    preprocessing.RemoveStopwordsSpacy(spacy_model_name=SPACY_MODEL_NAME),
    preprocessing.LemmatizeSpacy(spacy_model_name=SPACY_MODEL_NAME),
]

# Full preprocessing chain handed to every pipeline in this notebook.
preprocessing_list = [*_text_cleanup_steps, *_spacy_steps]

# FastText hyper-parameters collected in one mapping so they can be inspected
# or tweaked without touching the constructor call.
fasttext_params = {
    'min_count': 1,
    'vector_size': 20,
    'window': 2,
    'sg': 1,
    'hs': 1,
    'epochs': 70,
    'min_n': 0,
    'seed': SEED,  # reuse the notebook-wide seed
}

# Embedding model used by the FastText-based pipelines.
model_fasttext = models.FastTextWrapperModel(**fasttext_params)

# Short aliases for the searcher backends. These are the classes themselves (nothing is
# instantiated here); they are passed as `searcher=` when a pipeline is created.
# NOTE(review): `searcher_chroma` is not used by any pipeline in the cells visible here.
searcher_faiss = similarity_search.FaissSearch
searcher_chroma = similarity_search.ChromaDBSearch
searcher_knn = similarity_search.NearestNeighborsSearch
searcher_fuzz = similarity_search.TheFuzzSearch

#### Настройки для валидации:

In [5]:
# Language passed to the augmentations that take a `language` argument.
LANGUAGE = 'english'

# Single source of truth for the `p` argument of every validation augmentation
# (previously the literal 0.013 was repeated in each constructor call).
# NOTE(review): `p` is presumably a per-symbol probability - confirm against the library.
AUGMENTATION_P = 0.013

# Seeded augmentations used to distort the text during validation.
# ChangeSyms intentionally appears three times with identical arguments, exactly as
# in the original list. NOTE(review): confirm the repetition is deliberate.
validate_augmentation_transforms = [
    augmentation.ChangeSyms(p=AUGMENTATION_P, language=LANGUAGE, change_only_alpha=True, seed=SEED),
    augmentation.DeleteSyms(p=AUGMENTATION_P, delete_only_alpha=True, seed=SEED),
    augmentation.AddSyms(p=AUGMENTATION_P, language=LANGUAGE, seed=SEED),
    augmentation.MultiplySyms(p=AUGMENTATION_P, count_multiply=2, multiply_only_alpha=True, seed=SEED),
    augmentation.SwapSyms(p=AUGMENTATION_P, seed=SEED),
    augmentation.ChangeSyms(p=AUGMENTATION_P, language=LANGUAGE, change_only_alpha=True, seed=SEED),
    augmentation.ChangeSyms(p=AUGMENTATION_P, language=LANGUAGE, change_only_alpha=True, seed=SEED),
]

# Values of k for which top-k accuracy is reported by `validate`.
accuracy_top = [1, 5, 10]

## Модуль `api`

In [6]:
from recs_searcher import api

#### Faiss-fasttext

In [7]:
# Pipeline 1: FastText embeddings + Faiss searcher, built from the phone names.
# The constructor logs data preparation and model training (see the saved output);
# verbose=True presumably enables these messages.
pipeline1 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    model=model_fasttext,
    searcher=searcher_faiss,
    verbose=True,
)

Data preparation for training has begun...
The training of the model has begun...
Pipeline ready!


In [8]:
%%time
# Five best matches for the query 'apple'.
# NOTE(review): with the Faiss searcher the `similarity` column grows down the ranking
# (0.001139 -> 0.002705 in the saved output), so it looks like a distance - confirm.
pipeline1.search('apple', 5)

CPU times: total: 15.6 ms
Wall time: 18 ms


Unnamed: 0,name,similarity
0,Apple iPhone 13,0.001139
1,Apple iPhone 12,0.001139
2,Apple iPhone 11,0.001139
3,Apple iPhone 6,0.001139
4,Apple iPhone 5s,0.002705


In [9]:
# Validate pipeline 1 with the seeded augmentations: the call prints top-k accuracy for
# each k in `accuracy_top` and returns the same numbers as a {k: accuracy} dict.
# NOTE(review): presumably the augmentations distort the dataset entries and the originals
# are then searched for - confirm against the library.
pipeline1.validate(validate_augmentation_transforms, accuracy_top)

100%|██████████████████████████████████████████████████████████| 223/223 [00:03<00:00, 61.74it/s]

Top 1Acc = 0.3991031390134529
Top 5Acc = 0.6995515695067265
Top 10Acc = 0.7937219730941704





{1: 0.3991031390134529, 5: 0.6995515695067265, 10: 0.7937219730941704}

In [10]:
# Optional: persist the fitted pipeline to disk (left disabled in this example).
# pipeline1.save(path_folder_save='pipelines', filename='tmp')

In [11]:
# Optional: restore a previously saved pipeline (left disabled in this example).
# NOTE(review): the disabled save call writes into the 'pipelines' folder with filename 'tmp',
# while this call loads from 'tmp' - confirm the expected path before enabling.
# pipeline1 = api.load_pipeline(path_to_filename='tmp')
# pipeline1

#### KNN-fasttext

In [12]:
# Pipeline 2: same data, preprocessing and FastText model as pipeline 1, but searched
# with the nearest-neighbours backend.
# NOTE(review): `model_fasttext` is the very same object that was given to `pipeline1`;
# if the constructor trains it in place, pipeline1's model is retrained too - confirm.
pipeline2 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    model=model_fasttext,
    searcher=searcher_knn,
    verbose=True,

    # Extra keyword; presumably forwarded to the searcher - confirm.
    algorithm='brute',
)

Data preparation for training has begun...
The training of the model has begun...
Pipeline ready!


In [13]:
# Same query against the nearest-neighbours searcher. Here `similarity` decreases down
# the ranking (0.999431 -> 0.998648 in the saved output).
pipeline2.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone 13,0.999431
1,Apple iPhone 6,0.999431
2,Apple iPhone 12,0.999431
3,Apple iPhone 11,0.999431
4,Apple iPhone 5s,0.998648


In [14]:
# Validate pipeline 2 with the same augmentations and k values, for comparison with pipeline 1.
pipeline2.validate(validate_augmentation_transforms, accuracy_top)

100%|██████████████████████████████████████████████████████████| 223/223 [00:03<00:00, 56.68it/s]

Top 1Acc = 0.4304932735426009
Top 5Acc = 0.6860986547085202
Top 10Acc = 0.7937219730941704





{1: 0.4304932735426009, 5: 0.6860986547085202, 10: 0.7937219730941704}

#### TheFuzzSearch

In [15]:
# Pipeline 3: TheFuzz searcher with no `model` argument. Only data preparation is logged
# for this pipeline (there is no model-training message in the saved output).
pipeline3 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    searcher=searcher_fuzz,
    verbose=True,
)

Data preparation for training has begun...
Pipeline ready!


In [16]:
# Same query against the TheFuzz searcher; the saved output reports an integer
# score (90) in the `similarity` column for every returned row.
pipeline3.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone 13 Pro Max,90
1,Apple iPhone 13 Pro,90
2,Apple iPhone 13,90
3,Apple iPhone 13 mini,90
4,Apple iPhone 12 Pro Max,90


In [17]:
# Validate pipeline 3 with the same augmentations and k values as the other pipelines.
pipeline3.validate(validate_augmentation_transforms, accuracy_top)

100%|██████████████████████████████████████████████████████████| 223/223 [00:03<00:00, 57.03it/s]

Top 1Acc = 0.4798206278026906
Top 5Acc = 0.7937219730941704
Top 10Acc = 0.8789237668161435





{1: 0.4798206278026906, 5: 0.7937219730941704, 10: 0.8789237668161435}

Дообучим уже созданный пайплайн на новом наборе данных (названия видеоигр), чтобы поиск работал и по ним:

In [18]:
# Peek at the first three rows of the games dataset that will be added to pipeline 3.
dataset_games.head(3)

Unnamed: 0,target
0,Wii Sports
1,Super Mario Bros.
2,Mario Kart Wii


In [19]:
# Extend pipeline 3 with the game titles and rebind the name to the result.
# NOTE(review): the cell reads and overwrites `pipeline3`, so re-running it may add the
# games a second time - confirm whether `fine_tuning` is idempotent.
pipeline3 = pipeline3.fine_tuning(dataset_games.target.values)

Data preparation for training has begun...
Pipeline ready!


In [20]:
# The extended pipeline now returns game titles for a game-related query.
pipeline3.search('mario', 5)

Unnamed: 0,name,similarity
0,Super Mario Bros.,90
1,Mario Kart Wii,90
2,New Super Mario Bros.,90
3,New Super Mario Bros. Wii,90
4,Mario Kart DS,90


#### Faiss-SentenceTransformer

In [21]:
def _make_change_syms():
    """Build one unseeded ChangeSyms transform; every call returns a fresh instance."""
    return augmentation.ChangeSyms(
        p=0.013,
        language=LANGUAGE,
        change_only_alpha=True,
        seed=None,
    )


# Augmentations for model training: the same sequence of transforms as
# `validate_augmentation_transforms`, but every transform is created with seed=None.
augmentation_transforms_seed_none = [
    _make_change_syms(),
    augmentation.DeleteSyms(p=0.013, delete_only_alpha=True, seed=None),
    augmentation.AddSyms(p=0.013, language=LANGUAGE, seed=None),
    augmentation.MultiplySyms(p=0.013, count_multiply=2, multiply_only_alpha=True, seed=None),
    augmentation.SwapSyms(p=0.013, seed=None),
    _make_change_syms(),
    _make_change_syms(),
]

# Sentence-transformer model configured with the unseeded augmentations.
# NOTE(review): lr=2e-2 is unusually high for transformer fine-tuning (the
# sentence-transformers default is 2e-5) - confirm this is intended.
# NOTE(review): no seed is passed here and the augmentations use seed=None, so the
# numbers below may differ between runs - confirm.
model_sentence_transformer = models.SentenceTransformerWrapperModel(
    augmentation_transform=augmentation_transforms_seed_none,
    batch_size=32,
    epochs=3,
    optimizer_params={'lr': 2e-2},
)

In [22]:
# Pipeline 4: sentence-transformer embeddings + Faiss searcher, built from the phone names.
# Training progress (3 epochs) is shown in the saved output.
pipeline4 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    model=model_sentence_transformer,
    searcher=searcher_faiss,
    verbose=True,
)

Data preparation for training has begun...
The training of the model has begun...


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

Pipeline ready!


In [23]:
# Same query against the sentence-transformer pipeline.
# NOTE(review): as with pipeline 1 (Faiss), the `similarity` values grow down the ranking
# (10.015034 -> 12.554626 in the saved output), so they look like distances - confirm.
pipeline4.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone 13,10.015034
1,Apple iPhone 12,10.015034
2,Apple iPhone 11,10.015034
3,Apple iPhone 6,10.015034
4,Apple iPhone SE (2020),12.554626


In [24]:
# Validate pipeline 4 with the same seeded augmentations and k values as the other pipelines.
pipeline4.validate(validate_augmentation_transforms, accuracy_top)

100%|██████████████████████████████████████████████████████████| 223/223 [00:13<00:00, 16.56it/s]

Top 1Acc = 0.47085201793721976
Top 5Acc = 0.8340807174887892
Top 10Acc = 0.905829596412556





{1: 0.47085201793721976, 5: 0.8340807174887892, 10: 0.905829596412556}