# Тестирование API

In [1]:
%load_ext autoreload
%autoreload 2

import os

In [2]:
# Make sure the notebook runs from the project root, i.e. a directory named
# `recs` that itself sits inside a directory named `recs`. If we are anywhere
# else, step up one level and print the resulting working directory.
#
# os.path is used instead of splitting the path on '\\': the manual split only
# works with Windows separators and raises IndexError on POSIX systems, where
# the split yields a single element.
cwd = os.getcwd()
in_project_root = (
    os.path.basename(cwd) == 'recs'
    and os.path.basename(os.path.dirname(cwd)) == 'recs'
)
if not in_project_root:
    os.chdir('..')
    print(os.getcwd())

F:\study\recs\recs


In [3]:
# Random seed shared by the FastText model and the validation augmentation
# configured further down in this notebook.
SEED = 1

## Импорт модулей

In [4]:
import dataset
import preprocessing
import models
import similarity_search
import augmentation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Максим\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Настройки для pipeline:

In [5]:
# Load the two name collections used in this notebook.
dataset_games = dataset.load_video_games()
dataset_phones = dataset.load_mobile_phones()

# Preprocessing steps handed to every pipeline: a cleaner followed by an
# English normalizer.
text_cleaner = preprocessing.BaseCleaner(remove_number=False)
text_normalizer = preprocessing.BaseNormalizer(
    'english',
    remove_stopwords=True,
    number_extract=True,
    lemmatize=True,
)
preprocessing_list = [text_cleaner, text_normalizer]

# Hyper-parameters of the FastText wrapper, kept in one place so they are easy
# to tweak; the model is seeded with the notebook-wide SEED.
fasttext_params = dict(
    min_count=1,
    vector_size=200,
    window=2,
    sg=1,
    hs=1,
    epochs=70,
    min_n=0,
    seed=SEED,
)
model_fasttext = models.FastTextWrapperModel(**fasttext_params)

# Search backends are referenced without being called; each pipeline receives
# the object itself through its `searcher` argument.
searcher_faiss = similarity_search.FaissSearch
searcher_knn = similarity_search.NearestNeighborsSearch
searcher_fuzz = similarity_search.TheFuzzSearch

#### Настройки для валидации:

In [6]:
# Top-k cut-offs for which `Pipeline.validate` reports accuracy.
accuracy_top = [1, 5, 10]

# Misspelling augmentation passed to `Pipeline.validate`; every operation is
# configured with the same `p` value and the notebook-wide SEED.
validation_typo_p = 0.01
validation_misspelling_params = dict(
    add_syms={'p': validation_typo_p, 'language': 'english'},
    change_syms={'p': validation_typo_p, 'language': 'english'},
    delete_syms={'p': validation_typo_p},
    multiply_syms={'p': validation_typo_p},
    swap_syms={'p': validation_typo_p},
    seed=SEED,
)
validate_augmentation_transforms = [
    augmentation.MisspellingAugmentation(**validation_misspelling_params),
]

## Модуль `api`

In [7]:
import api

#### Faiss-fasttext

In [8]:
# Pipeline over the phone names: shared preprocessing list, the FastText model
# and the Faiss searcher. verbose=True is presumably what prints the stage
# messages seen in the cell output.
pipeline1 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    model=model_fasttext,
    searcher=searcher_faiss,
    verbose=True,
)

Data preparation for training has begun...
The training of the model has begun...
Pipeline ready!


In [9]:
# Five best matches for the query 'apple'.
# NOTE(review): in the output the `similarity` values grow as the rank gets
# worse, so for this searcher the column looks like a distance — confirm.
pipeline1.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone 6,0.009397
1,Apple iPhone 13,0.009823
2,Apple iPhone 5s,0.010217
3,Apple iPhone 12,0.010554
4,Apple iPhone 13 mini,0.013489


In [10]:
# Report top-1/5/10 accuracy using the misspelling augmentation; the call also
# returns the scores as a {k: accuracy} dict (see output).
pipeline1.validate(validate_augmentation_transforms, accuracy_top)

100%|██████████████████████████████████████████████████████████████████████| 223/223 [00:01<00:00, 116.50it/s]

Top 1Acc = 0.8340807174887892
Top 5Acc = 0.9596412556053812
Top 10Acc = 0.9730941704035875





{1: 0.8340807174887892, 5: 0.9596412556053812, 10: 0.9730941704035875}

In [11]:
# pipeline1.save('tmp')

In [12]:
# pipeline1 = api.load_pipeline('tmp')
# pipeline1

#### KNN-fasttext

In [13]:
# Same data, preprocessing and FastText model as pipeline1, but searched with
# NearestNeighborsSearch.
# NOTE(review): `model_fasttext` is the very object already given to pipeline1
# and the log shows training starting again — confirm that this does not
# change the model pipeline1 relies on.
pipeline2 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    model=model_fasttext,
    searcher=searcher_knn,
    verbose=True,

    # Extra keyword not used for pipeline1; presumably forwarded to the
    # searcher — confirm against api.Pipeline.
    algorithm='brute',
)

Data preparation for training has begun...
The training of the model has begun...
Pipeline ready!


In [14]:
# Same query as for pipeline1; here the `similarity` values in the output
# decrease with rank (higher means closer).
pipeline2.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone 6,0.995302
1,Apple iPhone 13,0.995079
2,Apple iPhone 5s,0.994937
3,Apple iPhone 12,0.994712
4,Apple iPhone 13 mini,0.993242


In [15]:
# Same validation protocol as for pipeline1 (identical augmentation and top-k
# list), so the reported accuracies are directly comparable.
pipeline2.validate(validate_augmentation_transforms, accuracy_top)

100%|███████████████████████████████████████████████████████████████████████| 223/223 [00:02<00:00, 90.55it/s]

Top 1Acc = 0.874439461883408
Top 5Acc = 0.9730941704035875
Top 10Acc = 0.9820627802690582





{1: 0.874439461883408, 5: 0.9730941704035875, 10: 0.9820627802690582}

#### TheFuzzSearch

In [16]:
# No `model` argument is passed here: this pipeline combines only the
# preprocessing list with the TheFuzzSearch searcher.
pipeline3 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    searcher=searcher_fuzz,
    verbose=True,
)

Data preparation for training has begun...
Pipeline ready!


In [17]:
# Five best matches for 'apple' with the fuzzy searcher.
# NOTE(review): all five hits carry the same score (90) in the output, so the
# displayed order is not determined by the score alone.
pipeline3.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone 13 Pro Max,90
1,Apple iPhone 13 Pro,90
2,Apple iPhone 13,90
3,Apple iPhone 13 mini,90
4,Apple iPhone 12 Pro Max,90


In [18]:
# Same validation protocol as for the FastText-based pipelines.
pipeline3.validate(validate_augmentation_transforms, accuracy_top)

100%|███████████████████████████████████████████████████████████████████████| 223/223 [00:02<00:00, 98.31it/s]

Top 1Acc = 0.9506726457399103
Top 5Acc = 0.9955156950672646
Top 10Acc = 0.9955156950672646





{1: 0.9506726457399103, 5: 0.9955156950672646, 10: 0.9955156950672646}

Добавим данные к существующему поисковику:

In [19]:
# Preview the first three rows of the video-games dataset (single `target` column).
dataset_games.head(3)

Unnamed: 0,target
0,Wii Sports
1,Super Mario Bros.
2,Mario Kart Wii


In [20]:
# Extend the existing fuzzy-search pipeline with the game titles.
# NOTE(review): the result is bound back to `pipeline3`, so re-running this
# cell would feed the games in a second time — confirm whether fine_tuning
# tolerates duplicates, or bind the result to a new name.
pipeline3 = pipeline3.fine_tuning(dataset_games.target.values)

Data preparation for training has begun...
Pipeline ready!


In [21]:
# Check that the newly added game titles are searchable.
pipeline3.search('mario', 5)

Unnamed: 0,name,similarity
0,Super Mario Bros.,90
1,Mario Kart Wii,90
2,New Super Mario Bros.,90
3,New Super Mario Bros. Wii,90
4,Mario Kart DS,90


#### SentenceTransformer

In [24]:
# Misspelling augmentation used while training the sentence-transformer model.
# The operation settings match the validation augmentation, but seed=None is
# passed instead of SEED — presumably so each epoch sees different typos (the
# training log shows varying corruptions).
training_typo_p = 0.01
training_misspelling_params = dict(
    add_syms={'p': training_typo_p, 'language': 'english'},
    change_syms={'p': training_typo_p, 'language': 'english'},
    delete_syms={'p': training_typo_p},
    multiply_syms={'p': training_typo_p},
    swap_syms={'p': training_typo_p},
    seed=None,
)
augmentation_transforms_seed_none = [
    augmentation.MisspellingAugmentation(**training_misspelling_params),
]

# NOTE(review): lr=2e-2 is far above the 2e-5 default that sentence-transformers
# uses for `fit` — confirm the value is intentional.
sentence_transformer_params = dict(
    augmentation_transform=augmentation_transforms_seed_none,
    batch_size=32,
    epochs=5,
    optimizer_params={'lr': 2e-2},
)
model_sentence_transformer = models.SentenceTransformerWrapperModel(
    **sentence_transformer_params
)

In [25]:
# Phone-name pipeline that pairs the sentence-transformer model with the Faiss
# searcher; data and preprocessing are the same as for the other pipelines.
pipeline4 = api.Pipeline(
    dataset=dataset_phones.target.values,
    preprocessing=preprocessing_list,
    model=model_sentence_transformer,
    searcher=searcher_faiss,
    verbose=True,
)

Data preparation for training has begun...
The training of the model has begun...


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

<InputExample> label: 0, texts: xiaomi mi 11 ultra; xiaomi mi 11 ultra
<InputExample> label: 0, texts: huawei mate 40 pro; huawei mate 40 pro
<InputExample> label: 0, texts: huawei p50 pro; huawei p50 pro


Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

<InputExample> label: 0, texts: xiaomi mi 11 ultra; iaomi mii 11 ultar
<InputExample> label: 0, texts: huawei mate 40 pro; huawei mate 40 pro
<InputExample> label: 0, texts: huawei p50 pro; huawei p50 prIi


Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

<InputExample> label: 0, texts: huawei p50 pro; huawei p50 pr
<InputExample> label: 0, texts: xiaomi mi 11 ultra; xiaomi miK 11 ultra
<InputExample> label: 0, texts: huawei mate 40 pro; hiuawei mate 40 pro


Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

<InputExample> label: 0, texts: xiaomi mi 11 ultra; xiaomi mi 11 ultra
<InputExample> label: 0, texts: huawei p50 pro; huaei p50 po
<InputExample> label: 0, texts: huawei mate 40 pro; huawei mte 40 pro


Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

<InputExample> label: 0, texts: xiaomi mi 11 ultra; xiaomi mi 11 uKltra
<InputExample> label: 0, texts: huawei mate 40 pro; huawei Pate 40 pro
<InputExample> label: 0, texts: huawei p50 pro; huawei p50 prHo
Pipeline ready!


In [26]:
# Five best matches for 'apple' with the sentence-transformer embeddings.
# NOTE(review): as with pipeline1, the `similarity` values increase down the
# list, so they look like distances — confirm.
pipeline4.search('apple', 5)

Unnamed: 0,name,similarity
0,Apple iPhone X,17.619295
1,Apple iPhone XR,24.337732
2,Apple iPhone 11,29.169186
3,Intex Aqua Selfie,29.768295
4,Apple iPhone XS Max,30.071764


In [27]:
# Same validation protocol as for the other pipelines (seeded augmentation).
pipeline4.validate(validate_augmentation_transforms, accuracy_top)

100%|███████████████████████████████████████████████████████████████████████| 223/223 [00:10<00:00, 21.82it/s]

Top 1Acc = 0.9417040358744395
Top 5Acc = 0.9955156950672646
Top 10Acc = 1.0





{1: 0.9417040358744395, 5: 0.9955156950672646, 10: 1.0}

In [28]:
# Save the pipeline under the name 'pipeline4'. The call returns the Pipeline
# object, which is why its repr is shown as the cell output.
pipeline4.save('pipeline4')

<api.api.Pipeline at 0x206acbde4d0>