# Пример использования API

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and local edits to the package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os

In [2]:
# Seed passed to the FastText model and to the validation misspelling
# augmentation further down, so those steps use a fixed seed between runs.
SEED = 1

## Импорт модулей

In [3]:
from recs_searcher import (
    dataset,
    preprocessing,
    models,
    similarity_search,
    augmentation,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Максим\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Настройки для pipeline:

In [4]:
# Demo corpora shipped with recs_searcher. In the cells visible here only the
# phones and games sets are used afterwards; the third one is loaded but not
# referenced again.
dataset_games = dataset.load_video_games()
dataset_phones = dataset.load_mobile_phones()
dataset_exoplanes = dataset.load_exoplanes()

# Text preprocessing steps: the cleaner is listed first, the normalizer second.
base_cleaner = preprocessing.BaseCleaner(remove_number=False)
base_normalizer = preprocessing.BaseNormalizer(
    'english', remove_stopwords=True, number_extract=True, lemmatize=True,
)
preprocessing_list = [base_cleaner, base_normalizer]

# FastText hyperparameters kept as plain data so they can be tweaked in one place.
# NOTE(review): the names look like gensim FastText arguments and are presumably
# forwarded by the wrapper unchanged - confirm against FastTextWrapperModel.
fasttext_params = {
    'min_count': 1,
    'vector_size': 200,
    'window': 2,
    'sg': 1,
    'hs': 1,
    'epochs': 70,
    'min_n': 0,
    'seed': SEED,
}
model_fasttext = models.FastTextWrapperModel(**fasttext_params)

# Search backends are bound here without being called; each Pipeline below
# receives one of them through its `searcher=` argument.
searcher_faiss = similarity_search.FaissSearch
searcher_knn = similarity_search.NearestNeighborsSearch
searcher_fuzz = similarity_search.TheFuzzSearch

#### Настройки для валидации:

In [5]:
# Settings for the synthetic-typo augmentation used during validation: every
# operation gets the same probability, and the seed is fixed via SEED.
validation_typo_params = {
    'add_syms': {'p': 0.01, 'language': 'english'},
    'change_syms': {'p': 0.01, 'language': 'english'},
    'delete_syms': {'p': 0.01},
    'multiply_syms': {'p': 0.01},
    'swap_syms': {'p': 0.01},
    'seed': SEED,
}
validate_augmentation_transforms = [
    augmentation.MisspellingAugmentation(**validation_typo_params),
]

# Values of k passed to validate(); the recorded output reports one
# "Top k Acc" figure for each of them.
accuracy_top = [1, 5, 10]

## Модуль `api`

In [6]:
from recs_searcher import api

#### Faiss-fasttext

In [7]:
# Pipeline 1: FastText model combined with the Faiss searcher, built from the
# `target` values of the phones dataset.
pipeline1_settings = {
    'dataset': dataset_phones.target.values,
    'preprocessing': preprocessing_list,
    'model': model_fasttext,
    'searcher': searcher_faiss,
    'verbose': True,  # the recorded run printed progress messages while building
}
pipeline1 = api.Pipeline(**pipeline1_settings)

Data preparation for training has begun...
The training of the model has begun...
Pipeline ready!


In [8]:
%%time
# Query the index for 'apple'; the recorded output has five result rows, and
# %%time reports the CPU and wall time of the cell.
# NOTE(review): in the recorded output the `similarity` column grows down the
# ranking, so it looks like a distance (smaller = closer) - confirm.
pipeline1.search('apple', 5)

CPU times: total: 31.2 ms
Wall time: 21 ms


Unnamed: 0,name,similarity
0,Apple iPhone 6,0.009397
1,Apple iPhone 13,0.009823
2,Apple iPhone 5s,0.010217
3,Apple iPhone 12,0.010554
4,Apple iPhone 13 mini,0.013489


In [9]:
# Validate the pipeline with the misspelling transforms and the top-k list
# defined above. Per the recorded output, this prints one "Top k Acc" line per k
# and returns a dict mapping k to that accuracy.
# NOTE(review): presumably queries are augmented copies of the indexed texts and
# a hit means the original is within the top k - confirm in the library.
pipeline1.validate(validate_augmentation_transforms, accuracy_top)

100%|██████████████████████████████████████████████████████████████████████| 223/223 [00:01<00:00, 118.57it/s]

Top 1Acc = 0.8071748878923767
Top 5Acc = 0.9551569506726457
Top 10Acc = 0.9820627802690582





{1: 0.8071748878923767, 5: 0.9551569506726457, 10: 0.9820627802690582}

In [11]:
# Persist the fitted pipeline; the recorded output shows save() returning the
# Pipeline object itself.
# NOTE(review): save() is called with two positional arguments here but with a
# single one ('pipeline4') at the end of the notebook, and the reload example
# passes 'tmp' alone - confirm the expected folder/filename arguments.
pipeline1.save('pipelines', 'tmp')

<recs_searcher.api.api.Pipeline at 0x1d779364e10>

In [None]:
# Optional example: reload a previously saved pipeline instead of rebuilding it.
# Left commented out, so nothing in this cell runs.
# NOTE(review): the save call above was given ('pipelines', 'tmp'); confirm that
# 'tmp' alone is the path load_pipeline expects.
# pipeline1 = api.load_pipeline('tmp')
# pipeline1

#### KNN-fasttext

In [None]:
# Pipeline 2: same FastText model as before, paired with the nearest-neighbours
# searcher instead of Faiss.
pipeline2_settings = {
    'dataset': dataset_phones.target.values,
    'preprocessing': preprocessing_list,
    'model': model_fasttext,
    'searcher': searcher_knn,
    'verbose': True,
    # NOTE(review): extra keyword, presumably forwarded to the searcher - confirm.
    'algorithm': 'brute',
}
pipeline2 = api.Pipeline(**pipeline2_settings)

In [None]:
# Same 'apple' query as for pipeline1, now answered by the KNN-based pipeline.
pipeline2.search('apple', 5)

In [None]:
# Validate pipeline2 with the same transforms and top-k list used for pipeline1,
# so the accuracy figures are comparable.
pipeline2.validate(validate_augmentation_transforms, accuracy_top)

#### TheFuzzSearch

In [None]:
# Pipeline 3: fuzzy string matching via TheFuzzSearch. No `model` argument is
# supplied for this pipeline, only the preprocessing steps and the searcher.
pipeline3_settings = {
    'dataset': dataset_phones.target.values,
    'preprocessing': preprocessing_list,
    'searcher': searcher_fuzz,
    'verbose': True,
}
pipeline3 = api.Pipeline(**pipeline3_settings)

In [None]:
# Same 'apple' query, answered by the fuzzy-matching pipeline.
pipeline3.search('apple', 5)

In [None]:
# Validate pipeline3 with the same transforms and top-k list as the other pipelines.
pipeline3.validate(validate_augmentation_transforms, accuracy_top)

Добавим данные к существующему поисковику:

In [None]:
# Peek at the first three entries of the games dataset before adding it to the searcher.
dataset_games.head(3)

In [None]:
# Pass the games `target` values to fine_tuning() and rebind pipeline3 to
# whatever it returns.
# NOTE(review): because the same name is reused, re-running this cell calls
# fine_tuning() on the already-extended pipeline; if it appends data, the games
# may be added twice - confirm, or bind the result to a new name.
pipeline3 = pipeline3.fine_tuning(dataset_games.target.values)

In [None]:
# Query with a game-related term to check the search after fine_tuning() was
# called with the games data.
pipeline3.search('mario', 5)

#### SentenceTransformer

In [None]:
# Same typo probabilities as the validation augmentation, but with seed=None,
# i.e. without the fixed SEED.
unseeded_typo_params = {
    'add_syms': {'p': 0.01, 'language': 'english'},
    'change_syms': {'p': 0.01, 'language': 'english'},
    'delete_syms': {'p': 0.01},
    'multiply_syms': {'p': 0.01},
    'swap_syms': {'p': 0.01},
    'seed': None,
}
augmentation_transforms_seed_none = [
    augmentation.MisspellingAugmentation(**unseeded_typo_params),
]

# Settings for the SentenceTransformer wrapper, which receives the unseeded
# augmentation list defined above.
# NOTE(review): lr=2e-2 looks large for fine-tuning a transformer encoder -
# confirm it is intended.
sentence_transformer_params = {
    'augmentation_transform': augmentation_transforms_seed_none,
    'batch_size': 32,
    'epochs': 5,
    'optimizer_params': {'lr': 2e-2},
}
model_sentence_transformer = models.SentenceTransformerWrapperModel(
    **sentence_transformer_params
)

In [None]:
# Pipeline 4: SentenceTransformer model combined with the Faiss searcher, built
# from the `target` values of the phones dataset.
pipeline4_settings = {
    'dataset': dataset_phones.target.values,
    'preprocessing': preprocessing_list,
    'model': model_sentence_transformer,
    'searcher': searcher_faiss,
    'verbose': True,
}
pipeline4 = api.Pipeline(**pipeline4_settings)

In [None]:
# Same 'apple' query, answered by the SentenceTransformer + Faiss pipeline.
pipeline4.search('apple', 5)

In [None]:
# Validate pipeline4 with the same seeded transforms and top-k list as the other pipelines.
pipeline4.validate(validate_augmentation_transforms, accuracy_top)

In [None]:
# Persist the SentenceTransformer-based pipeline.
# NOTE(review): only one positional argument is given here, whereas the earlier
# pipeline1.save call passes two ('pipelines', 'tmp') - confirm which argument
# 'pipeline4' binds to and where the file ends up.
pipeline4.save('pipeline4')