# Мэтчинг товаров для маркетплейса

## Открытие файлов с данными и их изучение

Маркетплейс поставил перед нами задачу мэтчинга: необходимо разработать алгоритм, который для каждого товара из датасета `validation.csv` предложит 5 наиболее похожих товаров из `base.csv`. Оценивать качество алгоритма нужно оценивать по метрике accuracy@5 по ответам из датасета `validation_answer.csv`.  
  
Нам доступно 4 датасета с данными:
- `base.csv` - анонимизированный набор товаров. Каждый товар представлен как уникальный id и вектор признаков размерностью 72
- `train.csv` - датасет с данными для обучения модели. У каждого товара есть уникальный id, вектор признаков размерностью 72 и id товара из датасета `base.csv`, который максимально похож на него по мнению экспертов
- `validation.csv` - датасет с набором товаров, для которых надо найти наиболее близкие товары из `base.csv`. У каждого товара есть уникальный id и набор признаков размерностью 72
- `validation_answer.csv` - правильные ответы к предыдущему файлу

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit, cross_val_score
!pip install faiss-gpu
import faiss
import requests
from zipfile import ZipFile
import urllib
import json
from urllib.parse import urlencode
import time
from google.colab import files
!pip  install catboost
from catboost import CatBoostClassifier, Pool
import os



In [2]:
base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
public_key = 'https://disk.yandex.ru/d/BBEphK0EHSJ5Jw'

# Формируем ссылку с яндекс диском для загрузки
final_url = base_url + urlencode(dict(public_key=public_key))
response = requests.get(final_url)
download_url = response.json()['href']

# Загружаем файл и сохраняем его
download_response = requests.get(download_url)
with open('data.zip', 'wb') as f:
    f.write(download_response.content)

# Сохраняем zip file, в котором хранятся данные
zip_file = ZipFile('data.zip')

In [3]:
train = pd.read_csv(zip_file.open('train.csv'), index_col=0) # Обучающий датасет
base = pd.read_csv(zip_file.open('base.csv'), index_col=0) # Набор товаров
validation = pd.read_csv(zip_file.open('validation.csv'), index_col=0) # Товары, для которых ищем наиболее похожие
answers = pd.read_csv(zip_file.open('validation_answer.csv'), index_col=0) # Ответы к предыдущему сету товаров

In [4]:
# Удалим ненужные файлы для очистки памяти
del zip_file, download_response

delete_filepath = '/content/data.zip'

open(delete_filepath, 'w').close()
os.remove(delete_filepath)

In [5]:
def data_overview(data, corr=False):

    # Настроим вывод датасетов, чтобы показывались все столбцы
    pd.set_option('display.max_columns', None)

    # Вывод первых строчек датасета для лучшего понимания структуры
    display(data.head())

    # Выведем размер датасетае
    print(f'Размер датасета: {data.shape}')

    # Проверим датасет на наличие пропусков
    nulls = pd.DataFrame(data.isna().sum(), columns=['NA_count'])
    if sum(nulls['NA_count'] == 0):
        print('В датасете нет пропусков')
    else:
        display(nulls.query('NA_count != 0'))

    # Выведем информацию о распределении признаков
    display(data.describe())

    # Посмотрим на корреляцию между признаками
    if corr:
        display(data.corr())

In [6]:
# Обучающий датасет 'train'
data_overview(train)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,Target
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
0-query,-53.882748,17.971436,-42.117104,-183.93668,187.51749,-87.14493,-347.360606,38.307602,109.08556,30.413513,-88.08269,-52.69554,-27.692442,4.872923,198.348,-30.075249,-3.803569,-135.81061,-161.84137,-78.51218,-94.42894,898.436927,-70.14052,78.42036,108.032776,813.770071,-74.79088,12.610422,-183.82184,149.71584,-27.042316,-133.21217,106.420746,-303.939233,48.73079,58.185707,24.25095,-151.2241,-104.282265,-34.49281,-12.587054,2.622891,-120.96992,149.99164,-31.94847,82.31443,-115.83047,-243.30939,6.622036,-132.19766,68.71256,-38.806824,62.937435,-132.65445,89.189026,87.03978,-121.27988,-6.986934,-206.51382,29.485587,-77.02959,-132.38617,-105.42782,70.10736,-155.80257,-101.965943,65.90379,34.4575,62.642094,134.7636,-415.750254,-25.958572,675816-base
1-query,-87.77637,6.806268,-32.054546,-177.26039,120.80333,-83.81059,-94.572749,-78.43309,124.9159,140.33107,-177.6058,-84.995514,42.81081,-57.256332,96.792534,-19.261467,0.739535,50.619213,-155.26703,-78.65943,-92.76149,353.157741,-34.744545,82.48711,-28.450592,813.770071,-137.52963,26.595627,-136.78345,153.35791,48.810093,-115.92215,87.46422,-222.286354,25.12415,91.88714,-30.63687,-136.59314,-140.50012,-43.449757,-7.226884,8.265747,-117.91547,149.1509,-18.751057,95.315384,-60.093273,-83.82058,37.449867,-23.298859,74.06108,-7.139753,75.8624,-112.04511,82.85773,54.067215,-134.00539,-26.142574,-214.63211,-457.848461,21.459618,-137.41136,-40.812233,4.669178,-151.69771,-1.638704,68.170876,25.096191,89.974976,130.58963,-1035.092211,-51.276833,366656-base
2-query,-49.979565,3.841486,-116.11859,-180.40198,190.12843,-50.83762,26.943937,-30.447489,125.771164,211.60782,-86.34656,-35.666546,16.395317,-80.80285,137.90865,-23.53276,-47.256584,-16.650242,-194.50568,-78.372925,-69.32448,1507.231274,-52.50097,-34.165775,52.958652,813.770071,-18.021725,20.951107,-50.32178,158.76062,0.178065,-183.06967,99.05357,-1018.469545,-51.80112,97.76677,-10.86585,-144.42316,-133.81949,-78.9023,-17.200352,4.467452,-63.970737,154.63953,-30.211614,48.5274,-122.40664,-112.71362,53.461838,-31.11726,107.84151,16.482935,77.93448,-95.61873,91.460075,63.11951,-126.93925,8.066627,-195.67767,-163.12,-72.83,-139.22307,-52.031662,78.039764,-169.1462,82.144186,66.00822,18.400496,212.40973,121.93147,-1074.464888,-22.547178,1447819-base
3-query,-47.810562,9.086598,-115.401695,-121.01136,94.65284,-109.25541,-775.150134,79.18652,124.0031,242.65065,-146.51707,-159.46985,-13.844755,-6.113928,118.939255,-44.585907,9.559358,14.435648,-156.90683,-78.78932,-78.73709,1507.231274,19.957405,34.83429,-8.820732,813.770071,-125.6068,17.584084,-58.452904,141.2818,-54.95931,-136.98854,63.880493,-1018.469545,89.22893,65.91996,-24.078644,-152.3341,-91.19938,-28.22539,-4.767386,0.158236,-129.12866,122.95837,-30.800995,123.6234,-37.540867,-72.1398,71.24099,-168.11559,118.23645,-18.065195,37.25572,-137.69104,87.50077,62.43729,-131.26064,35.69266,-86.03883,-379.33909,-153.46577,-131.19829,-61.567047,44.515266,-145.41675,93.990981,64.13135,106.06192,83.17876,118.277725,-1074.464888,-19.902788,1472602-base
4-query,-79.632126,14.442886,-58.903397,-147.05254,57.127068,-16.239529,-321.317964,45.984676,125.941284,103.39267,-107.15302,-8.800034,-50.9778,29.457338,143.38931,5.614824,-45.27476,9.643625,-77.55463,-79.06661,-77.92646,1507.231274,16.6124,116.28429,33.754898,813.770071,-105.765335,6.523008,-19.812988,157.69392,-20.604088,-146.59128,78.84957,-780.449185,87.56077,73.03666,16.89103,-144.6579,-116.12215,-19.353254,-7.709266,-5.394988,-140.25212,193.18497,-53.147078,79.869446,-151.13135,-45.05616,79.796234,46.763016,47.68181,-24.104229,75.14259,-207.34506,93.436935,51.505203,-135.47598,99.80366,-49.158073,-203.212852,-127.74786,-103.3417,-68.7706,45.02891,-196.09207,-117.626337,66.92622,42.45617,77.621765,92.47993,-1074.464888,-21.149351,717819-base


Размер датасета: (100000, 73)
В датасете нет пропусков


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,-85.328679,7.664345,-43.667046,-146.11863,111.770592,-73.178792,-440.615118,21.231588,123.067891,126.120201,-141.168584,-69.228444,-42.559541,6.506719,149.077582,-23.942253,-2.219093,-12.964439,-138.71461,-79.159795,-97.474574,1297.269578,-3.600685,55.908014,23.470181,742.788817,-64.970553,16.368072,-86.20117,152.098083,-13.659323,-144.506022,89.747894,-587.551768,36.46821,73.109595,-13.866967,-146.487267,-118.336458,-41.131969,-9.261504,4.139978,-104.928806,122.240784,-40.320211,59.051981,-84.054387,-140.787854,46.137851,-34.136875,29.462927,-22.343451,67.581086,-120.052756,88.193676,69.538752,-132.336967,14.452681,-120.301843,-315.282239,-66.37866,-130.672183,-81.145553,36.898233,-152.223082,14.001695,67.79956,23.029277,73.412076,115.189717,-709.761548,-48.505704
std,25.803845,4.955651,39.111064,20.434841,47.700958,28.718853,279.01755,65.985214,6.492081,64.912752,41.607785,51.258647,40.819544,37.5938,34.167843,13.024184,20.978943,55.21092,48.737245,0.77556,28.382144,375.123794,50.06368,35.617513,40.802653,191.052074,63.16065,4.828575,49.511818,10.392531,29.191689,35.816357,28.312124,273.294442,39.756311,20.00817,25.824911,10.383975,13.578397,19.868303,8.295381,5.922103,56.519443,48.109909,14.739497,31.088173,66.055063,59.851232,46.328286,59.325454,60.934591,54.991787,12.988132,55.920785,4.824051,12.339684,7.952113,49.234745,55.730177,210.657513,64.913422,9.408099,30.675871,25.357686,41.020285,99.079597,1.838012,55.470761,62.203132,21.582238,405.961084,41.215124
min,-186.28027,-11.560507,-224.89606,-223.30722,-93.27202,-184.96245,-791.469482,-278.58282,94.83115,-142.15695,-330.5706,-352.4052,-214.74246,-145.86028,8.81839,-82.645004,-91.65702,-247.58928,-326.64874,-82.637184,-228.80826,136.873137,-211.9718,-87.32378,-122.584274,-61.803358,-338.5625,-6.629074,-285.6687,111.38725,-130.44492,-290.124,-25.55072,-1044.135662,-119.90381,-11.926712,-124.56038,-193.91815,-166.07846,-123.06242,-43.063824,-18.980648,-324.94357,-67.540375,-63.102401,-64.78969,-327.32272,-365.06088,-129.87918,-280.37183,-250.33757,-236.14847,18.47704,-354.9157,66.068665,18.104557,-161.18932,-183.84988,-330.63818,-681.029305,-358.46045,-169.5664,-210.05931,-60.779335,-317.83167,-157.590189,60.66889,-203.74638,-181.97382,22.598862,-1297.871984,-209.93576
25%,-103.309147,4.313784,-69.494487,-160.02852,79.474322,-92.83408,-738.536868,-22.131764,118.674642,82.545028,-168.59902,-103.048815,-69.801705,-18.815189,126.369124,-32.379752,-16.455603,-49.612492,-171.661355,-79.656816,-115.407545,1248.126198,-37.047511,31.436791,-5.051067,813.770071,-106.860569,13.187895,-119.61359,145.143008,-32.812503,-168.535163,70.432237,-803.390708,9.505674,59.978086,-30.926924,-153.33921,-127.603408,-54.589425,-14.77218,0.086251,-144.015452,89.138641,-53.806738,37.832466,-129.672705,-181.176132,14.323097,-73.41532,-11.033693,-59.606014,58.553989,-157.929585,84.912363,61.211434,-137.752178,-19.537687,-158.719347,-497.152639,-110.07139,-137.069838,-101.50483,19.774804,-178.92947,-71.745018,66.560655,-14.144571,31.685548,100.589324,-1074.464888,-76.478591
50%,-85.273695,7.652854,-42.830246,-146.067445,112.2601,-73.36442,-511.837758,22.278989,123.08754,125.96334,-140.46144,-68.753325,-41.919796,6.739488,149.705675,-23.707227,-2.339008,-12.189709,-139.00915,-79.13746,-96.862695,1507.231274,-3.60893,55.273705,23.306557,813.770071,-65.226217,16.473505,-86.514155,151.992225,-13.218622,-144.5845,89.966625,-579.590387,36.398791,73.121085,-13.399444,-146.415015,-118.3592,-41.346603,-9.213719,4.213268,-106.112495,122.344315,-39.954565,59.343556,-85.365553,-140.503565,45.499802,-31.942193,30.274884,-23.291442,67.26128,-120.103632,88.187425,69.65509,-132.25644,13.610872,-120.229032,-315.177219,-65.254957,-130.803775,-80.85964,36.711365,-151.576985,14.072663,67.813585,23.441363,72.880192,115.236635,-808.801696,-48.700929
75%,-67.33081,10.981191,-17.71369,-132.277745,143.76083,-53.699612,-201.116578,65.899595,127.541802,169.701005,-113.136535,-34.668559,-14.631057,32.074534,172.627318,-15.438074,11.702535,24.464662,-106.746862,-78.63889,-78.583615,1507.231274,29.473402,79.55754,51.787124,813.770071,-22.893323,19.662112,-53.064181,158.893878,5.975015,-120.4393,109.219697,-349.54503,63.628358,86.277285,3.638963,-139.454015,-109.068438,-27.693264,-3.671712,8.247889,-66.750507,154.74447,-26.757267,80.604097,-39.212476,-100.564151,77.57892,6.814449,70.794575,14.649595,76.49666,-81.45645,91.45409,77.97801,-126.864683,48.039566,-81.68471,-132.410422,-21.824901,-124.460043,-60.492451,53.91952,-124.436974,100.011024,69.05875,59.941065,114.469224,129.791085,-357.087323,-20.487869
max,14.585236,28.917845,128.10846,-60.751625,301.3636,51.84971,109.625188,288.17184,149.3562,392.82715,37.017212,152.5,116.2101,171.12697,279.15256,35.552612,94.32458,215.48805,79.24516,-75.83564,26.48407,1557.383334,193.88431,207.02106,199.46176,845.708887,192.10797,35.11241,101.514206,195.69856,102.20405,10.103592,198.14978,-137.646757,206.76524,156.6482,87.089355,-101.97762,-61.24847,34.684986,24.319061,25.914757,116.18791,335.55945,-16.052626,175.80774,200.44989,110.09,232.16551,191.03741,257.1524,212.1686,121.79599,111.49347,107.38754,116.727646,-99.66117,193.6935,107.60625,48.723814,189.1755,-92.03704,47.73345,145.66713,11.907364,185.095137,74.779884,266.49332,319.86752,201.76126,98.768233,126.19179


In [7]:
# Набор товаров 'base'
data_overview(base)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
0-base,-115.08389,11.152912,-64.42676,-118.88089,216.48244,-104.69806,-469.070588,44.348083,120.915344,181.4497,-124.06151,-32.458237,-57.42056,36.207405,182.71677,-28.071688,-5.209374,-1.732182,-91.09186,-79.46667,-103.93909,1507.231274,-32.591667,51.41436,-51.90536,813.770071,-41.556538,8.419807,-129.97664,148.76503,-10.069234,-143.00504,79.848694,-537.183707,2.77318,111.51868,4.257666,-153.34058,-123.62608,-39.707664,-12.768708,-4.170106,-152.96713,75.91152,-57.470088,17.266476,-98.871155,-138.06754,91.00038,-170.702,-50.02039,59.794296,78.10638,-89.375725,89.4121,79.81218,-129.51361,-44.659496,-80.44221,5.149714,-64.51895,-138.77763,-42.808693,38.800827,-151.76218,-74.38909,63.66634,-4.703861,92.93361,115.26919,-112.75664,-60.830353
1-base,-34.562202,13.332763,-69.78761,-166.53348,57.680607,-86.09837,-85.076666,-35.637436,119.718636,195.23419,-141.30435,-126.7689,-68.85353,8.314717,143.08174,8.778257,15.157185,48.24636,-244.95486,-80.27942,-81.78804,429.880035,-5.93769,70.142654,66.57684,813.770071,3.214371,17.164303,-39.418385,148.15588,-7.57959,-174.36995,101.22463,-303.939233,16.91463,53.394676,33.451824,-148.55263,-110.34017,-33.277206,-8.688703,13.650302,-91.228745,38.445015,-31.94847,62.48482,-166.47336,-189.01042,-56.731556,-30.027319,59.17106,-56.71154,63.542606,-121.31917,97.426056,74.83284,-138.77705,58.887608,-71.59534,-251.952358,11.392853,-148.57166,-117.767525,41.1,-157.8294,-94.446806,68.20211,24.346846,179.93793,116.834,-84.888941,-59.52461
2-base,-54.233746,6.379371,-29.210136,-133.41383,150.89583,-99.435326,52.554795,62.381706,128.95145,164.38147,-140.82245,-22.486748,-51.66699,2.521726,140.58545,-22.03923,-7.128634,-5.184787,-111.71212,-79.02927,-82.452576,1507.231274,-0.421011,84.10801,63.34451,813.770071,-105.21105,18.8447,-94.789474,151.38771,-21.48344,-144.84537,48.502934,-780.449185,64.76731,79.781555,-13.390142,-150.06166,-118.75855,-40.896286,-15.22262,-1.860338,-93.820656,98.74907,-46.745782,65.927475,-10.286392,-125.32137,21.048609,-22.637775,68.915985,-54.90226,66.33547,-142.92792,83.03098,69.97963,-123.73389,65.32088,-114.03718,-529.396956,-101.581375,-134.65822,-76.3978,46.011803,-207.14442,127.32557,65.56618,66.32568,81.07349,116.594154,-1074.464888,-32.527206
3-base,-87.52013,4.037884,-87.80303,-185.06763,76.36954,-58.985165,-383.182845,-33.611237,122.03191,136.23358,-108.47044,-104.53778,-82.38849,-79.98674,113.59631,-38.99588,-4.78736,8.327808,-124.85576,-79.10902,-107.943275,1479.17053,-125.271614,24.656485,-16.921055,813.770071,-182.68027,14.715704,-62.9327,151.20198,10.613454,-134.93434,58.77773,-529.295053,43.665924,92.806305,-38.936657,-145.0453,-124.29577,-36.87673,-17.74287,10.536242,-124.58415,121.91415,-17.321358,105.21724,-94.37965,-63.76927,29.217487,-26.423973,5.869829,64.06155,63.51328,-112.2581,80.92659,72.6361,-137.31432,7.912551,-62.18892,-540.321044,-89.588715,-149.32669,-70.64794,-6.358921,-147.20105,-37.69275,66.20289,-20.56691,137.20694,117.4741,-1074.464888,-72.91549
4-base,-72.74385,6.522049,43.671265,-140.60803,5.820023,-112.07408,-397.711282,45.1825,122.16718,112.119064,-110.34245,-38.98206,-50.228867,-26.69092,135.2729,-7.510103,8.924209,111.57695,-130.83319,-79.31585,-57.843952,1262.919738,19.22591,59.811813,76.15967,312.698903,-76.726974,21.299923,-24.069107,167.38553,-44.05947,-148.13185,98.04261,-657.524388,96.36317,71.06152,-36.563557,-135.37244,-123.70898,-44.739437,-10.85522,8.3548,-168.32886,215.92807,-32.975979,66.86488,30.83863,-211.54037,57.695343,-67.82263,-29.389133,-34.12374,54.71055,-26.032013,90.67391,58.086998,-134.78206,77.62039,-176.61588,-529.762412,-100.56552,-125.31733,-57.199104,56.642403,-159.35184,85.944724,66.76632,-2.505783,65.315285,135.05159,-1074.464888,0.319401


Размер датасета: (2918139, 72)
В датасете нет пропусков


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71
count,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0,2918139.0
mean,-86.22947,8.080077,-44.5808,-146.635,111.3166,-71.99138,-392.2239,20.35283,123.6842,124.4581,-143.1396,-69.88662,-41.26493,7.58778,148.9852,-23.16701,-0.9366916,-19.31312,-137.2712,-79.18932,-98.25383,1257.528,-6.824936,56.67382,26.69563,727.1026,-66.2534,16.31537,-86.37482,151.7346,-11.60764,-144.2328,87.21997,-554.1608,36.23933,75.19918,-16.4867,-146.6421,-118.5122,-41.87896,-9.482269,4.050654,-112.744,120.8653,-39.80939,58.93878,-84.88885,-138.4695,47.92759,-32.38229,26.58408,-21.71709,66.99684,-117.8976,88.02818,69.20454,-132.9056,13.22645,-119.5996,-316.0935,-64.94332,-130.6677,-79.02286,33.29735,-154.7962,14.15132,67.79167,23.5449,74.9593,115.5667,-799.339,-47.79125
std,24.89132,4.953387,38.63166,19.8448,46.34809,28.18607,271.655,64.21638,6.356109,64.43058,41.68737,51.21991,38.96358,36.59725,33.63559,13.07563,20.83201,52.9721,46.62566,0.7574403,28.15861,399.9612,51.97251,34.36061,38.94157,205.5724,61.53334,4.656628,48.51313,10.09,28.95059,34.75739,27.85556,260.6486,39.89506,21.44713,25.85462,9.953913,13.14411,19.19689,8.19072,5.83575,56.9405,46.71995,14.74482,32.1103,63.19335,57.73822,45.20157,58.28693,58.95291,51.73898,13.07173,54.70584,4.774309,11.94907,7.815245,51.07988,55.24317,210.6644,62.48236,9.569063,30.45642,28.88603,41.22929,98.95115,1.823356,55.34224,61.345,21.17518,385.4131,41.74802
min,-199.4687,-13.91461,-240.0734,-232.6671,-105.583,-211.0086,-791.4699,-301.8597,93.15305,-173.8719,-343.8436,-368.4526,-226.0906,-175.6239,-15.26698,-85.91711,-96.36816,-272.5656,-351.7478,-83.15235,-243.7279,136.8182,-238.2874,-88.18329,-143.1833,-61.98046,-358.5515,-8.364693,-314.0082,107.4433,-156.7305,-298.6792,-34.65488,-1045.312,-137.8509,-21.51936,-137.9866,-195.4172,-176.911,-132.7073,-47.57769,-21.58085,-338.1192,-87.70834,-63.1031,-77.24959,-362.9306,-418.6953,-145.3876,-322.8121,-280.2731,-264.6334,6.196133,-381.2643,65.58115,14.08166,-167.0627,-196.1836,-359.7054,-681.0424,-400.7911,-171.3491,-220.5662,-88.50774,-353.9028,-157.5944,59.50944,-233.1382,-203.6016,15.72448,-1297.931,-226.7801
25%,-103.0654,4.708491,-69.55949,-159.9051,80.50795,-91.37994,-629.3318,-22.22147,119.484,81.76751,-170.3175,-104.0333,-66.62606,-16.94393,125.9664,-31.76832,-15.1545,-55.46463,-167.5851,-79.67751,-116.8426,1088.597,-42.24413,33.29542,0.556467,813.7701,-107.8521,13.26106,-118.6601,144.8869,-30.75366,-167.0386,68.29836,-771.2967,9.347785,60.90705,-34.14425,-153.2671,-127.4493,-54.79372,-14.98294,0.03813005,-151.5911,88.20286,-52.87369,36.99374,-128.1978,-176.5964,17.90398,-69.94666,-12.33175,-56.2822,58.46433,-154.4882,84.71401,61.30621,-138.104,-22.38689,-157.4316,-498.4835,-106.5525,-137.3891,-98.7639,16.98862,-180.7799,-71.30038,66.58096,-12.51624,33.77574,101.6867,-1074.465,-75.66641
50%,-86.2315,8.03895,-43.81661,-146.7768,111.873,-71.9223,-422.2016,20.80477,123.8923,123.4977,-141.7156,-70.22748,-40.39738,8.128445,149.3146,-22.9103,-1.032792,-19.8047,-136.8043,-79.17803,-97.52848,1507.231,-6.418125,56.99871,27.13143,813.7701,-66.19686,16.395,-85.53529,151.6445,-11.18774,-144.121,86.86744,-529.2951,35.32246,74.60604,-16.31658,-146.7152,-118.5961,-42.02502,-9.468345,4.050146,-113.0552,119.903,-40.03325,58.96474,-85.11492,-137.3809,46.96564,-29.17426,27.50635,-22.21488,67.22437,-117.6273,88.01158,69.35902,-132.8263,13.11331,-120.459,-315.9894,-63.68938,-130.8757,-78.48812,34.71502,-153.9773,13.82693,67.81458,23.41649,74.92997,116.0244,-1074.465,-48.59196
75%,-69.25658,11.47007,-19.62527,-133.3277,142.3743,-52.44111,-156.6686,63.91821,127.9705,167.2206,-114.9862,-35.6953,-14.8356,32.67095,172.1625,-14.34341,13.37403,16.58952,-107.5373,-78.67896,-79.24342,1507.231,28.23144,79.66341,53.08961,813.7701,-25.57611,19.43675,-53.87801,158.276,8.294287,-121.4661,106.0111,-341.7818,62.86465,88.86667,1.219331,-139.9523,-109.6737,-28.9217,-4.01325,8.07398,-73.8073,152.2359,-25.97449,81.88782,-43.00754,-100.1017,78.53315,7.648376,66.00199,12.81711,75.89855,-80.63755,91.31103,77.24293,-127.6164,48.15557,-81.7063,-133.6101,-22.53313,-124.2669,-58.53355,52.16429,-127.3405,99.66753,69.02666,59.75511,115.876,129.5524,-505.7445,-19.71424
max,21.51555,29.93721,160.9372,-51.37478,319.6645,58.80624,109.6325,341.2282,152.2612,427.5421,41.40846,182.4389,120.7674,177.527,291.748,45.07333,105.391,220.7994,97.68367,-75.79646,32.12392,1557.433,228.8031,214.3112,213.5212,845.8495,227.8015,38.26621,154.8162,200.8107,124.0331,11.4109,213.9868,-137.6468,227.8165,170.7491,99.34335,-99.77887,-60.83402,40.97299,26.92393,31.28809,133.5638,348.025,-16.05263,186.2096,222.294,152.5796,244.9093,210.0491,319.0618,252.8907,121.7197,144.9036,111.6596,119.4334,-98.52727,232.7078,130.5232,48.73138,217.2591,-89.23699,60.17411,154.1678,24.36099,185.0981,75.71203,314.8988,339.5738,214.7063,98.77081,126.9732


In [8]:
# Товары, для которых ищем похожие 'validation'
data_overview(validation)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
100000-query,-57.372734,3.597752,-13.213642,-125.92679,110.74594,-81.279594,-461.003172,139.81572,112.88098,75.21575,-131.8928,-140.96857,-57.987164,-22.868887,150.89552,7.965574,17.622066,-34.868217,-216.13855,-80.90873,-52.57952,263.363136,56.266876,66.92471,21.609911,813.770071,-32.78294,20.794031,-79.779076,156.30708,-42.83133,-71.723335,83.28366,-304.174382,1.609402,55.834587,-29.474255,-139.16277,-126.03835,-62.64383,-5.012346,11.98492,-43.084946,190.124,-24.996636,76.1539,-245.26157,-143.65648,-4.259628,-46.664196,-27.085403,-34.346962,75.530106,-47.171707,92.69732,60.47563,-127.48687,-39.484753,-124.384575,-307.94976,45.506813,-144.19095,-75.51302,52.830902,-143.43945,59.051935,69.28224,61.927513,111.59253,115.140656,-1099.130485,-117.07936
100001-query,-53.758705,12.7903,-43.268543,-134.41762,114.44991,-90.52013,-759.626065,63.995087,127.117905,53.128998,-153.71725,-63.95133,-52.369495,-33.390945,148.6195,-22.48383,15.164185,-56.202,-153.61438,-79.831825,-101.05548,1203.537156,81.59713,101.018654,56.783424,92.209628,-126.86034,10.382887,-38.52336,165.38391,-77.840485,-169.53868,103.48324,-915.735701,16.109938,14.669937,-38.707085,-149.53838,-138.79292,-36.076176,-2.781422,2.283144,-142.47789,189.95395,-18.40823,90.51705,-95.531,-259.63605,52.437836,-30.004599,14.50206,-1.071201,66.84267,-161.27989,94.794174,50.419983,-125.07526,-25.169033,-176.17688,-655.836897,-99.23837,-141.53522,-79.44183,29.185436,-168.6059,-82.872443,70.7656,-65.97595,97.07716,123.39164,-744.442332,-25.00932
100002-query,-64.175095,-3.980927,-7.679249,-170.16093,96.44616,-62.37774,-759.626065,87.477554,131.27011,168.92032,-220.30954,-31.378445,-8.788761,2.285323,133.26611,-41.30908,14.305538,-18.231812,-205.5337,-78.16031,-96.60767,1507.231274,-5.9642,34.937443,-56.086887,813.770071,-13.200474,18.966661,-35.11019,151.3685,-17.490252,-145.8843,15.533379,-655.395514,39.412827,62.554955,9.924992,-143.93462,-123.107796,-37.032475,-13.501337,12.913328,-116.03802,176.27615,-45.909942,103.49136,-90.65699,-162.6157,117.128235,13.079479,69.82689,-6.874451,63.707214,-123.85107,91.61082,59.760067,-129.56618,-12.822194,-154.19765,-407.199067,5.522629,-126.81297,-134.79541,37.36873,-159.66231,-119.232725,67.71044,86.00206,137.63641,141.08163,-294.052271,-70.969604
100003-query,-99.28686,16.123936,9.837166,-148.06044,83.69708,-133.72972,58.576403,-19.04666,115.042404,75.20673,-114.27196,-71.406456,-65.34932,24.37707,50.4673,-14.721335,15.069309,-46.682995,-176.60437,-78.6907,-139.22745,325.547112,3.632292,74.929504,-4.802103,813.770071,-52.982597,15.644381,-54.087467,151.30914,21.08857,-134.50789,65.11896,-529.295053,131.56552,67.6427,-22.884491,-145.90652,-86.91733,-11.863579,-22.188885,0.46372,-212.53375,170.52258,-48.092532,99.712555,-194.69241,-141.52318,60.21705,73.38638,118.567856,58.90081,55.56903,-181.09166,83.340485,66.08324,-114.04887,-57.15687,-56.335075,-318.680065,-15.984783,-128.10133,-77.23611,44.100494,-132.53012,-106.318982,70.88396,23.577892,133.18396,143.25294,-799.363667,-89.39267
100004-query,-79.53292,-0.364173,-16.027431,-170.88495,165.45392,-28.291668,33.931936,34.411217,128.90398,102.086914,-76.21417,-26.39386,34.42364,50.93889,157.68318,-23.786497,-33.175415,-0.592607,-193.31854,-79.65103,-91.889786,1358.481072,44.027733,121.52721,46.183,433.623103,-82.2332,21.068508,-32.940117,149.26895,0.404718,-97.67453,81.71999,-825.644804,9.397169,49.35934,17.725466,-160.16815,-129.36795,-55.532898,-2.597821,-0.226103,-41.36914,92.090195,-58.626857,73.65544,-10.25737,-175.65678,25.395056,47.874825,51.464676,140.95168,58.751133,-215.48764,91.25537,44.16503,-135.29533,-19.50816,-106.674866,-127.978884,-11.433113,-135.57036,-123.77025,45.635944,-134.25893,13.735359,70.61763,15.332115,154.56812,101.70064,-1171.892332,-125.30789


Размер датасета: (100000, 72)
В датасете нет пропусков


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,-85.302233,7.669724,-43.842474,-146.119797,111.635071,-73.273042,-441.396759,21.594242,123.077725,126.001062,-141.215384,-69.340237,-42.626768,6.272022,149.006081,-23.961728,-2.009594,-13.151269,-138.5406,-79.16268,-97.356361,1301.395163,-3.439855,56.097825,23.357262,741.156981,-64.986388,16.367853,-86.139332,152.078754,-13.476064,-144.568331,89.709031,-588.597042,36.554961,73.0854,-13.889099,-146.4615,-118.391474,-40.992214,-9.193917,4.157828,-104.775965,122.095524,-40.311826,59.123753,-83.620997,-141.003063,46.18776,-34.154328,29.40487,-22.542637,67.486142,-120.307717,88.174847,69.49554,-132.377437,14.411927,-120.381837,-316.401557,-66.614887,-130.653256,-81.129311,36.778882,-152.341249,14.397762,67.788892,23.250779,73.114446,115.196935,-709.457021,-48.416276
std,25.777321,4.95699,39.138775,20.495541,47.751576,28.51574,279.242585,66.478048,6.504931,64.981697,41.568432,51.483403,40.858017,37.845939,34.031665,13.01467,20.993654,55.340411,48.790143,0.777431,28.503558,371.543911,50.29141,35.599512,40.849607,193.199671,63.274623,4.845841,49.51197,10.406558,29.275776,35.911251,28.430405,273.788805,39.610961,20.044353,25.802824,10.389778,13.602543,19.845968,8.364438,5.910225,56.986969,48.130993,14.753263,31.12699,65.925312,59.595488,45.996789,59.61185,60.943297,55.140398,13.036243,56.107653,4.809309,12.324416,7.894675,49.387024,55.81822,211.131373,65.195278,9.426921,30.69159,25.427103,41.246347,98.695231,1.844522,55.403862,62.056224,21.493081,405.665764,41.292843
min,-190.35333,-11.109877,-217.53842,-220.05089,-81.19899,-176.70503,-791.460243,-265.60843,96.21835,-135.6673,-321.54425,-363.49332,-204.29913,-144.10074,-14.816055,-78.59745,-86.84328,-272.5656,-329.86017,-82.6653,-231.37665,136.846377,-208.90085,-84.22453,-138.217,-61.879841,-341.96222,-6.662937,-282.24933,110.357254,-150.15472,-281.12354,-24.076744,-1044.135662,-117.473145,-17.641396,-124.311615,-194.13293,-166.12445,-126.336075,-43.32117,-19.319794,-326.3013,-79.30472,-63.102401,-76.79204,-365.28253,-375.4522,-121.267914,-277.37592,-279.88638,-234.5215,12.404751,-381.60938,65.98603,17.301888,-162.87299,-156.58159,-329.7008,-681.038139,-339.96634,-168.73557,-215.15228,-66.26597,-337.63287,-157.593541,60.377728,-210.6728,-175.92178,25.271042,-1297.923999,-209.93576
25%,-103.23179,4.292425,-69.680916,-160.210287,79.10192,-92.789803,-740.620025,-21.80473,118.657689,82.311205,-168.562772,-103.338022,-70.147153,-19.089412,126.442184,-32.356516,-16.392179,-49.942476,-171.695623,-79.665475,-115.390365,1248.126198,-36.891637,31.741733,-5.265194,813.770071,-107.339855,13.182383,-119.527813,145.12613,-32.815898,-168.729353,70.166879,-803.390708,9.700724,59.971492,-30.78621,-153.332742,-127.661534,-54.462001,-14.75752,0.111931,-143.897497,89.14584,-53.806738,37.646882,-129.168005,-180.8764,14.915657,-73.757892,-10.882261,-59.953465,58.41217,-158.217357,84.88728,61.296285,-137.75133,-20.101644,-158.84789,-499.828267,-110.385372,-137.093315,-101.690472,19.564135,-179.115082,-70.420034,66.54613,-13.863008,31.49484,100.538862,-1074.464888,-76.35919
50%,-85.296745,7.657888,-43.230835,-146.080365,111.95933,-73.579283,-513.922978,22.138012,123.062568,126.083595,-140.410225,-68.618967,-42.192298,6.420291,149.639465,-23.758115,-2.075724,-12.401822,-139.00072,-79.139723,-96.693777,1507.231274,-3.386905,55.46128,23.119656,813.770071,-65.251958,16.497591,-86.373472,151.95553,-13.156792,-144.62845,89.94199,-579.590387,36.466842,72.989995,-13.341261,-146.36191,-118.378315,-41.189537,-9.162772,4.215425,-105.75932,121.80571,-39.871485,59.460886,-84.957413,-140.933335,45.577067,-32.077689,30.384748,-23.197395,67.172812,-120.326053,88.14989,69.5855,-132.32154,13.510024,-120.456705,-317.215462,-65.75555,-130.75305,-81.02725,36.41415,-151.426035,14.768218,67.800445,23.508739,72.152398,115.28099,-807.029697,-48.670001
75%,-67.422104,10.9939,-17.765821,-132.119278,143.903735,-53.861139,-202.57244,66.097697,127.538967,169.6805,-113.316906,-34.844935,-14.724666,31.930171,172.417195,-15.463551,11.971967,24.22903,-106.607981,-78.636325,-78.378098,1507.231274,29.878309,79.815228,51.817363,813.770071,-22.846247,19.688615,-52.950719,158.917625,6.254709,-120.418845,109.514725,-350.783326,63.613413,86.360483,3.8241,-139.428275,-109.20411,-27.543321,-3.593927,8.254853,-66.323114,154.687072,-26.63196,81.013712,-39.192944,-101.451846,77.262764,7.036819,70.646433,14.738863,76.4029,-81.607422,91.400723,77.932812,-126.955706,47.903254,-81.473065,-132.568961,-21.913417,-124.386617,-60.274745,53.88842,-124.400438,99.803923,69.040832,60.154775,114.317053,129.770215,-358.400478,-20.283335
max,14.427986,27.409784,134.8598,-57.38189,302.06537,50.255325,109.632035,279.4689,151.82256,392.82715,34.50528,170.34402,120.53197,177.76248,277.41785,36.47161,92.66164,197.92683,82.68381,-76.17781,9.297035,1557.293478,194.32181,203.8921,198.85036,845.461904,202.0539,35.58746,114.95685,195.22458,99.95479,1.120392,199.34808,-137.646757,223.98203,150.63416,80.063774,-100.16241,-66.97267,38.425518,23.95834,27.23896,117.0307,324.19107,-16.052626,178.59961,168.7309,114.07901,228.37883,178.69681,259.9794,186.12505,123.38977,115.4648,108.94717,116.727646,-99.84031,206.81743,102.78088,48.721738,194.32446,-93.94264,41.39048,141.02527,14.602035,185.094333,75.400475,251.28859,305.93753,201.59998,98.737079,111.831955


Можно заметить, что данные разделены на обучающую и тестовую выборки по порядку, ориентируясь на индексы

In [9]:
# Ответы к предыдущему набору товаров из 'validation' - 'answers'
data_overview(answers)

Unnamed: 0_level_0,Expected
Id,Unnamed: 1_level_1
100000-query,2676668-base
100001-query,91606-base
100002-query,472256-base
100003-query,3168654-base
100004-query,75484-base


Размер датасета: (100000, 1)
В датасете нет пропусков


Unnamed: 0,Expected
count,100000
unique,91502
top,210304-base
freq,7


В данном разделе проекта мы:
- загрузили данные и вывели информацию о них
- определили, что в данных отсутствуют пропуски
- определили обучающие и валидационные выборки (набор признаков и таргет)  


## Подготовка данных к построению моделей

Обучающий датасет содержит набор признаков и таргет, которые нужно разделить

In [10]:
target = train['Target']
features = train.drop('Target', axis=1)

Переименуем признаки и таргет из валидационного сета

In [11]:
features_test = validation
target_test = answers['Expected']

Разделим обучающий датасет на тренировочную и валидационную выборки

In [12]:
# Сократим обучающую выборку до 50000 товаров
features_train, features_extra, target_train, target_extra = (
    train_test_split(features, target, test_size=0.9, random_state=12345)
)

Так как разброс в значениях данных большой, проведем масштабирование признаков

In [13]:
# Проведем масштабирование признаков

numeric = features_train.columns.to_list() # Запишем все количественные признаки в список numeric
# numeric

features_train_scaled = features_train.copy()
features_test_scaled = features_test.copy()
base_scaled = base.copy()

# Используем метод StandardScaler
# Он приводит все значения к стандартному нормальному распределению
scaler = StandardScaler()
scaler.fit(features_train_scaled[numeric])
features_train_scaled[numeric] = scaler.transform(features_train_scaled[numeric])
features_test_scaled[numeric] = scaler.transform(features_test_scaled[numeric])
base_scaled[numeric] = scaler.transform(base_scaled[numeric])

#features_train_scaled.head()
#features_valid_scaled.head()
#features_test_scaled.head()

In [16]:
# Удалим ненужные переменные
del (
    answers, base, base_url, delete_filepath, download_url, f,
    features, target, features_train, features_test, final_url,
    numeric, public_key, response, scaler, train, validation
)

В данном разделе мы:
- сохранили обучающую и тестовую выборки, отделив признаки от таргета
- провели масштабирование признаков  
  
Данные готовы к построению моделей

## Построение моделей

### FAISS

Построим модель FAISS с ускоренным поиском с помощью inverted lists. Посчитаем предсказания в два этапа.  
  
**Первый этап:**
- Сначала сгруппируем набор векторов из датасета `base` по частям с помощью k-means
- Для каждой части найдем центроид
- Далее будем искать для каждого вектора из `features_valid_scaled` наиболее близкий центроид
- В части с наиболее близким к вектору центроидом будем искать наиболее близкий вектор  
  
*Результатом первого этапа будет модель Faiss для предсказания 30 наиболее близких векторов к каждому из обучающей выборки*
  
**Второй этап:**
- Объединим полученыые предсказания и изначальные вектора, так у нас получится датасет с 72 * 2 = 144 признаками
- Добавим в признаки расстояние от каждого вектора из обучающей выборки до ближайших
- Посчитаем новый таргет, где 1 - предсказание модели соответствует рекомендации эксперта, 0 - нет
- Построим CatBoost, предсказывающий вероятность выбора конкретного товара в качестве наиболее похожего

#### Первый этап (FAISS)

Создадим индекс

In [17]:
n_cells = round(base_scaled.shape[0] ** 0.5) # количество кластеров
print(f'Разделим датасет на {n_cells} кластеров при создании индекса')

Разделим датасет на 1708 кластеров при создании индекса


Обучим индекс и добавим в него вектора из `base_scaled` для получения предсказаний

In [18]:
dim = features_train_scaled.shape[1] # размерность вектора признака
quantiser = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFFlat(quantiser, dim, n_cells) # создание индекса

In [19]:
index.train(np.ascontiguousarray(features_train_scaled.values).astype('float32')) # обучим индекс на трейне
print(index.is_trained) # проверим обучение

index.add(np.ascontiguousarray(base_scaled.values).astype('float32')) # добавим вектора в индекс

True


In [20]:
# Пронумеруем id товаров, запишем в словарь
base_index = {n: v for n, v in enumerate(base_scaled.index.to_list())}
# base_index

Посчитаем предсказания на обучающей выборке и оценим качество модели с гиперпараметром nprobe = 300. Так как нам нужно будет использовать код повторно для оценки качества модели на тестовой выборке, напишем функцию для вычисления предсказаний

In [21]:
'''
Функция принимает на вход заскейленные признаки и таргет
Функция возвращает просчитанные индексы ближайших векторов и расстояния до них, а также метрику accuracy@30 и время подсчета
'''

def faiss_target_formation(features_faiss, target_faiss):

    start = time.time()
    index.nprobe = 300 # обозначим количество кластеров для поиска соседей
    vecs_new, idx_new = index.search(np.ascontiguousarray(features_faiss.values).astype('float32'), 30) # найдем вектора на трейне
    end = time.time()

    acc = 0
    for t, el in zip(target_faiss.values.tolist(), idx_new.tolist()): # посчитаем accuracy@30 для модели с nprobe=n
        acc += int(t in [base_index[r] for r in el])

    acc_faiss = 100 * acc / len(idx_new)

    faiss_time = end - start

    return vecs_new, idx_new, faiss_time, acc_faiss

In [None]:
'''
vecs, idx, faiss_time, acc_faiss = faiss_target_formation(features_train_scaled, target_train)
# print(acc_faiss, faiss_time)
'''

In [22]:
acc_faiss = 74.18
faiss_time = 507.5104320049286

print(f'Точность алгоритма faiss на nprobe=100 и 1708 кластерах: {acc_faiss}\n'
    f'Время подбора ближайших векторов для трейна модели faiss: {faiss_time}')

Точность алгоритма faiss на nprobe=100 и 1708 кластерах: 74.18
Время подбора ближайших векторов для трейна модели faiss: 507.5104320049286


In [None]:
'''
# Скачаем на диск полученные индексы для train
with open('/content/drive/My Drive/idx.txt', 'w') as f:
    np.savetxt(f, idx)

# Скачаем на диск расстояния до ближайших векторов для каждого товара для train
with open('/content/drive/My Drive/vecs.txt', 'w') as f:
    np.savetxt(f, vecs)
'''

In [23]:
with open('/content/drive/My Drive/idx.txt', 'r') as f:
    idx = np.loadtxt(f)

with open('/content/drive/My Drive/vecs.txt', 'r') as f:
    vecs = np.loadtxt(f)

Метрика accuracy@n получилась 74.18 при nprobe=100. Попробуем улучшить результат на втором шаге, используя CatBoost

#### Второй этап (CatBoost)

Напишем функцию для формирования признаков и таргета для CatBoost

In [24]:
'''
Функция принимает на вход признаки и таргет faiss
Функция возвращает склеенные признаки старых векторов и ближайших к ним, таргет для катбуста (1-вектор правильно побобран моделью, 0-неправильно)
'''

def catboost_features_formation(features_faiss, target_faiss, idx_faiss, vecs_faiss):

    rows = np.repeat(features_faiss.values, 30, axis=0).shape[0]
    cols = np.repeat(features_faiss.values, 30, axis=0).shape[1]

    added_features = np.zeros(shape=(rows, cols)) # Создадим датасет размерностью 100,000 * 50 на 72 для новых признаков
    idx_reshaped = idx.reshape(rows) # запишем индекс в более удобной форме
    vecs_faiss = vecs_faiss.reshape(-1, 1)

    for row in range(rows):
         added_features[row] = base_scaled.loc[base_index[idx_reshaped[row]]]

    features_catboost = np.concatenate(
        (np.repeat(features_faiss.values, 30, axis=0), added_features),
            axis=1
    )

    features_catboost = np.concatenate(
        (features_catboost, vecs_faiss),
            axis=1
    )

    target_catboost = []
    for t, el in zip(target_faiss.values.tolist(), idx_faiss.tolist()):
        for r in el:
             if base_index[r] == t:
                 target_catboost.append(1)
             else:
                 target_catboost.append(0)

    target_catboost = np.array(target_catboost)

    # target_catboost.sum() == acc

    return features_catboost, target_catboost

In [25]:
features_catboost, target_catboost = catboost_features_formation(features_train_scaled, target_train, idx, vecs)
# 100 * target_catboost.sum() / len(idx) == acc_faiss

Подберем гиперпараметры для CatBoost (второго этапа метчинга)

In [None]:
'''
start = time.time()
grid_space_catboost = {
    'iterations': [550],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': np.arange(5, 8)
}

model_catboost = CatBoostClassifier(random_state=12345, verbose=False,
                                    auto_class_weights='Balanced',
                                    eval_metric='Accuracy')

grid_search_result = model_catboost.grid_search(grid_space_catboost, X=features_catboost,
                                                y=target_catboost, cv=3, verbose=False)
end = time.time()
'''


bestTest = 0.8489783634
bestIteration = 549


bestTest = 0.9057586594
bestIteration = 521


bestTest = 0.9055535656
bestIteration = 290


bestTest = 0.8580301076
bestIteration = 548


bestTest = 0.8998896501
bestIteration = 444


bestTest = 0.9028305796
bestIteration = 175


bestTest = 0.8685708152
bestIteration = 548


bestTest = 0.8984212011
bestIteration = 251


bestTest = 0.8994442512
bestIteration = 138

Training on fold [0/3]

bestTest = 0.9008142136
bestIteration = 329

Training on fold [1/3]

bestTest = 0.910775199
bestIteration = 528

Training on fold [2/3]

bestTest = 0.9005224652
bestIteration = 364



In [26]:
# best_params = grid_search_result['params']
best_params = {'depth': 5, 'iterations': 550, 'learning_rate': 0.1}
print(f'Лучшие гиперпараметры модели CatBoost: {best_params}')

# catboost_time = end - start
catboost_time = 1168.5561311244965
print(f'Впемя обучения модели CatBoost: {catboost_time}')

Лучшие гиперпараметры модели CatBoost: {'depth': 5, 'iterations': 550, 'learning_rate': 0.1}
Впемя обучения модели CatBoost: 1168.5561311244965


In [27]:
model_catboost = CatBoostClassifier(random_state=12345, verbose=False,
                                    depth=5, iterations=550,
                                    learning_rate=0.1, auto_class_weights='Balanced',
                                    eval_metric='Accuracy')

model_catboost.fit(features_catboost, target_catboost)

predictions_train = model_catboost.predict_proba(features_catboost)[:, 1]

Основываясь на accuracy score на кросс-валидации, качество модели улучшилось после добавления CatBoost. Посчитаем accuracy@5 на обучающей выборке, чтобы в этом убедиться. Напишем функцию для ее подсчета

In [28]:
def accuracy_5(predictions, target):

    # Выведем индексы номеров, на которых стоят 5 наиболее вероятных товаров для каждого из обучающей выорки
    predictions = pd.DataFrame(predictions.reshape(-1, 30))
    listed_idx = []
    for row in range(predictions.shape[0]):
        listed_idx.append(predictions.loc[row, :].sort_values(ascending=False)[:5].index.to_list())


    # Поставим 1 на место пяти наиболее вероятных товаров, предсказанных катбустом
    predictions = np.zeros(shape=predictions.shape)
    for row in range(predictions.shape[0]):
        for elem in listed_idx[row]:
            predictions[row][elem] = 1

    # Посчитаем accuracy@5
    target = target.reshape(-1,)
    predictions = predictions.reshape(-1,)
    acc = len([i for i, j in zip(target, predictions) if i == j and i == 1]) / target.reshape(-1, 30).shape[0]

    return acc

In [29]:
accuracy_5(predictions_train, target_catboost) * 100

73.65

Accuracy@5 после катбуста немного упала. Нужно посчитать метрику на тестовой выборке, чтобы убедиться, что второй этап с градиентным бустингом нужен

In [30]:
del idx, vecs, features_catboost, target_catboost

## Проверка модели на тестовой выборке

In [31]:
# Сократим тестовую выборку до 50000 товаров
features_test, features_extra_test, target_test, target_test = (
    train_test_split(features_test_scaled, target_test, test_size=0.9, random_state=12345)
)

In [46]:
'''
# Посчитаем ближайшие векторы и расстояния для сокращенной тестовой выборки
vecs_test, idx_test, faiss_time, acc_faiss = faiss_target_formation(features_test, target_test)
'''

In [26]:
'''
# Скачаем на диск полученные индексы для train
with open('/content/drive/My Drive/vecs_test.txt', 'w') as f:
    np.savetxt(f, vecs_test)

# Скачаем на диск расстояния до ближайших векторов для каждого товара для train
with open('/content/drive/My Drive/idx_test.txt', 'w') as f:
    np.savetxt(f, idx_test)
'''

In [32]:
with open('/content/drive/My Drive/idx_test.txt', 'r') as f:
    idx = np.loadtxt(f)

with open('/content/drive/My Drive/vecs_test.txt', 'r') as f:
    vecs = np.loadtxt(f)

In [35]:
acc = 0
for t, el in zip(target_test.values.tolist(), idx.tolist()): # посчитаем accuracy@30 для модели с nprobe=n
    acc += int(t in [base_index[r] for r in el])

acc_faiss = 100 * acc / len(idx)
acc_faiss

0.02

Метрика на faiss получилась очень низкая

In [36]:
# Обозначим признаки и таргет для катбуста, найдем предсказания из катбуста
features_catboost_test, target_catboost_test = catboost_features_formation(features_test, target_test, idx, vecs)
predictions_test = model_catboost.predict_proba(features_catboost_test)[:, 1]

In [37]:
accuracy_5(predictions_test, target_catboost_test)

0.0