In [1]:
import json
from functools import partial
from typing import List, Set
import os
import pickle


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split

In [2]:
# from distutils.dir_util import copy_tree
# copy_tree("/kaggle/input/hackathon-files-for-participants-ozon", "/kaggle/working/hackathon_files_for_participants_ozon")

### Load data

In [3]:
dataset = pd.read_parquet("hackathon_files_for_participants_ozon/train_pairs.parquet")
etl = pd.read_parquet("hackathon_files_for_participants_ozon/train_data.parquet")

In [4]:
dataset.head(2)

Unnamed: 0,target,variantid1,variantid2
0,0.0,51197862,51198054
1,1.0,53062686,536165289


In [5]:
dataset["target"].value_counts()

0.0    171527
1.0    135013
Name: target, dtype: int64

In [6]:
etl.head(3)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{""Номинальный ток, А"":[""10""],""Цвет товара"":[""о..."
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...","{""Конструктивные особенности"":[""Магнитная конс..."
2,56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Оптичес...",,"[[0.66954195, 1.0643557, 0.78324044, -0.338267...","[[-0.90570974, 1.0296293, 1.0769907, 0.27746, ...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719...","{""Тип аксессуара"":[""Набор микропрепаратов""],""Б..."


In [7]:
etl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457063 entries, 0 to 457062
Data columns (total 8 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          457063 non-null  int64 
 1   name                               457063 non-null  object
 2   categories                         457063 non-null  object
 3   color_parsed                       378652 non-null  object
 4   pic_embeddings_resnet_v1           303467 non-null  object
 5   main_pic_embeddings_resnet_v1      457063 non-null  object
 6   name_bert_64                       457063 non-null  object
 7   characteristic_attributes_mapping  457036 non-null  object
dtypes: int64(1), object(7)
memory usage: 27.9+ MB


In [8]:
etl["categories"].apply(lambda x: json.loads(x)["1"]).unique()

array(['EPG'], dtype=object)

In [21]:
etl["categories"].apply(lambda x: json.loads(x)["2"]).value_counts()

Электроника               456843
Хобби и творчество            52
Строительство и ремонт        52
Автотовары                    30
Детские товары                26
Канцелярские товары           18
Дом и сад                     18
Спорт и отдых                 12
Бытовая техника                5
Бытовая химия                  3
Галантерея и украшения         2
Товары для взрослых            1
Товары для животных            1
Name: categories, dtype: int64

In [51]:
not_electronics = etl[etl["categories"].apply(lambda x: json.loads(x)["2"] != "Электроника")]
not_electronics.head(10)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
89,102327756,"Зарядное устройство PATRIOT BCI-10A, 650303410","{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Принадле...",,"[[0.4822681, -0.4201555, -0.6026368, 0.5398657...","[[0.23726666, -1.0075958, -0.29583487, 0.57976...","[-0.52825516, 0.5628571, 0.6057918, -0.6228487...","{""Бренд"":[""PATRIOT""],""Страна-изготовитель"":[""Р..."
850,746943315,FM-трансмиттер Baseus T typed S-13 Wireless MP...,"{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автозвук...",[черный],"[[-0.16614586, -0.22597459, -0.3307521, 1.3827...","[[0.8512496, -0.10998319, 0.5428988, 1.2968379...","[-0.5384122, 0.61599576, 0.37613955, -0.521040...","{""Цвет товара"":[""черный""],""Бренд"":[""Baseus""],""..."
3114,637359170,Набор Levenhuk LabZZ MTВ3: микроскоп телескоп ...,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","[синий, оранжевый, космос]","[[0.10534885, -0.8327992, 1.022485, -0.4388804...","[[0.4010295, -0.16012707, -0.9049238, -0.00406...","[-0.4958119, 0.77407074, 0.5936636, -0.4191966...","{""Страна-изготовитель"":[""Китай""],""Название цве..."
4718,762293965,"Держатель Deppa (55202) Mage Safe Sky, магнитный","{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автоаксе...",[черный],,"[[0.0639138, -0.2893233, 0.850634, 0.32430652,...","[-0.3478513, 0.5365606, 0.33568802, -0.5065253...","{""Встроенная беспроводная зарядка"":[""Да""],""Аль..."
5255,446248414,Зарядное устройство Ставр ЗУ-20/2 4,"{""1"": ""EPG"", ""2"": ""Строительство и ремонт"", ""3...",[черный],"[[0.2098836, 0.7996861, 0.26932824, 0.23396447...","[[0.4668785, 0.061542198, 0.10214853, 0.330275...","[-0.6000086, 0.27446356, 0.7610713, -0.6496327...","{""Емкость, А•ч"":[""2.4""],""Напряжение аккумулято..."
10993,365718408,"Микроскоп Bresser Junior 40x-640x, синий","{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...",[синий],"[[-0.0012986977, -0.31094167, 1.0477643, -0.54...","[[0.43239692, -0.15049851, 0.6963732, -0.54433...","[-0.6130601, 0.65550953, 0.6525186, -0.4665081...","{""Тип"":[""Микроскоп детский""],""Минимальный возр..."
11031,401953385,Источник питания Pulsar APS 2 (79162),"{""1"": ""EPG"", ""2"": ""Спорт и отдых"", ""3"": ""Охота...",,"[[-0.33269733, -0.30426037, 0.29399553, -0.383...","[[-0.3466038, -0.40759972, 0.7068129, -0.82976...","[-0.48942944, 0.5668152, 0.43684304, -0.568600...","{""Тип"":[""Прибор ночного видения""],""Артикул"":[""..."
13092,681957196,Телескоп Bresser Junior Space Explorer 45/600 ...,"{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...",[синий],"[[-0.1387153, -0.1634013, 0.20317742, 0.453225...","[[0.07863441, -0.3417797, 0.29189655, 0.582014...","[-0.6052431, 0.56460565, 0.61351126, -0.672608...","{""Страна-изготовитель"":[""Германия""],""Цвет това..."
18100,496152442,"Микроскоп Levenhuk Rainbow 2L PLUS, Лайм","{""1"": ""EPG"", ""2"": ""Хобби и творчество"", ""3"": ""...","[желтый, лайм]","[[-0.1573754, 0.021033436, 0.63059866, 0.09407...","[[0.07091465, 0.067262806, 0.22924602, -0.2052...","[-0.47605976, 0.46014962, 0.65139973, -0.58924...","{""Бренд"":[""levenhuk""],""Тип"":[""Микроскоп детски..."
18202,563351951,1 шт. в комплекте! Внешний жесткий диск WD My ...,"{""1"": ""EPG"", ""2"": ""Канцелярские товары"", ""3"": ...",[черный],"[[0.26708925, -0.23145437, 0.33789435, -0.3167...","[[0.2518009, -0.48613265, 0.16509332, -0.46910...","[-0.50672656, 0.4457588, 0.6555905, -0.5336636...","{""Артикул"":[""513600""],""Бренд"":[""Нет бренда""],""..."


In [52]:
len(not_electronics)

220

In [58]:
percent_of_not_electronics = len(not_electronics)/len(etl)*100
print(f"percent_of_not_electronics = {percent_of_not_electronics:.3}%")

percent_of_not_electronics = 0.0481%


In [60]:
print(f"not_electrnics_indexes = {list(not_electronics.index)}")

not_electrnics_indexes = [89, 850, 3114, 4718, 5255, 10993, 11031, 13092, 18100, 18202, 23604, 24217, 37593, 38796, 43071, 46632, 49894, 50091, 50189, 51046, 55861, 55991, 56358, 57102, 58016, 59086, 60757, 66654, 68285, 68431, 68730, 69935, 70345, 71989, 75271, 77615, 77671, 78243, 78311, 78327, 79405, 79854, 80256, 82655, 88641, 92148, 93921, 96825, 97154, 102546, 103567, 105292, 105622, 105642, 106472, 110229, 114866, 119727, 120120, 121024, 123645, 126901, 130532, 132943, 137208, 137284, 141513, 141559, 143486, 146893, 147314, 148362, 148617, 149472, 153102, 153183, 153844, 153949, 154992, 155186, 157277, 159159, 160038, 161041, 161098, 167526, 168302, 170881, 173120, 180481, 182257, 182687, 187134, 187428, 190706, 197374, 198395, 198812, 200505, 200536, 201878, 204703, 204833, 207369, 207735, 211661, 212257, 216355, 219453, 220860, 221384, 222821, 224778, 225251, 232688, 234266, 235833, 238699, 251218, 251405, 253392, 260770, 261956, 262242, 265507, 265671, 266830, 266934, 267517,

In [42]:
etl[etl["categories"].apply(lambda x: json.loads(x)["2"] == "Автотовары")].head(6)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
89,102327756,"Зарядное устройство PATRIOT BCI-10A, 650303410","{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Принадле...",,"[[0.4822681, -0.4201555, -0.6026368, 0.5398657...","[[0.23726666, -1.0075958, -0.29583487, 0.57976...","[-0.52825516, 0.5628571, 0.6057918, -0.6228487...","{""Бренд"":[""PATRIOT""],""Страна-изготовитель"":[""Р..."
850,746943315,FM-трансмиттер Baseus T typed S-13 Wireless MP...,"{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автозвук...",[черный],"[[-0.16614586, -0.22597459, -0.3307521, 1.3827...","[[0.8512496, -0.10998319, 0.5428988, 1.2968379...","[-0.5384122, 0.61599576, 0.37613955, -0.521040...","{""Цвет товара"":[""черный""],""Бренд"":[""Baseus""],""..."
4718,762293965,"Держатель Deppa (55202) Mage Safe Sky, магнитный","{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автоаксе...",[черный],,"[[0.0639138, -0.2893233, 0.850634, 0.32430652,...","[-0.3478513, 0.5365606, 0.33568802, -0.5065253...","{""Встроенная беспроводная зарядка"":[""Да""],""Аль..."
59086,665004762,Aeab002_аккумулятор Внешний Универсальный (Pow...,"{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автоаксе...",,,"[[0.00144439, -0.11962033, 0.45995337, -0.6314...","[-0.7042323, 0.5002088, 0.5848529, -0.48069268...","{""Бренд"":[""Нет бренда""],""Тип"":[""Болты для номе..."
60757,750068051,Сабвуфер в автомобиль (машину) DL Audio Grypho...,"{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автозвук...",,"[[0.447394, 0.4943823, -0.056699038, 0.4870089...","[[-0.3063197, 0.09298806, -0.6186492, -0.11659...","[-0.36322117, 0.6047769, 0.7920109, -0.5815751...","{""Типоразмер динамиков"":[""38 см (15 дюйм.)""],""..."
75271,645172860,Моторное масло Airline,"{""1"": ""EPG"", ""2"": ""Автотовары"", ""3"": ""Автомоби...",,,"[[0.5100802, -0.20191506, 0.2410908, -0.570464...","[-0.4633843, 0.26803902, 0.6954631, -0.6617441...","{""Тип"":[""Моторное масло""],""Бренд"":[""Airline""],..."


In [10]:
# решил для каждого типа фичей посмотреть на несколько примеров
from collections import defaultdict 

d = defaultdict(list)

example_ids = [0, 100, 1000, 10000]

for i in example_ids:
    for key in etl:
        d[key].append(etl[key][i])

In [11]:
d["categories"][0]

'{"1": "EPG", "2": "Электроника", "3": "Сетевые фильтры, разветвители и удлинители", "4": "Сетевой фильтр, удлинитель, разветвитель"}'

In [12]:
# Это я так на всякий случай убедился, что у каждого товара ровно один эмбеддинг main_pic_embeddings_resnet_v1
len(etl[etl["main_pic_embeddings_resnet_v1"].apply(lambda x: x.shape[0]) != 1])

0

In [13]:
# В тренировочном датасете у товара от 0 до 15 картинок не считая главную
amounts_of_pictures = etl["pic_embeddings_resnet_v1"].apply(lambda x: x.shape[0] if x is not None else 0).unique()
amounts_of_pictures

array([ 0,  5,  7,  2,  4,  3,  1,  6,  8, 10,  9, 12, 14, 13, 11, 15],
      dtype=int64)

In [14]:
# проверил, что в тестовых данных нет товаров из тренировочной выборки
etl_test = pd.read_parquet("hackathon_files_for_participants_ozon/test_data.parquet")
etl_test.join(etl, on = "variantid", how = "inner", lsuffix='_left', rsuffix='_right')

Unnamed: 0,variantid,variantid_left,name_left,categories_left,color_parsed_left,pic_embeddings_resnet_v1_left,main_pic_embeddings_resnet_v1_left,name_bert_64_left,characteristic_attributes_mapping_left,variantid_right,name_right,categories_right,color_parsed_right,pic_embeddings_resnet_v1_right,main_pic_embeddings_resnet_v1_right,name_bert_64_right,characteristic_attributes_mapping_right


In [15]:
test_dataset = pd.read_parquet("hackathon_files_for_participants_ozon/test_pairs_wo_target.parquet")
test_dataset.head(2)

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919


Get raw data for each variantid.

In [16]:
features = (
    dataset
    .merge(
        etl
        .add_suffix('1'),
        on="variantid1"
    )
    .merge(
        etl
        .add_suffix('2'),
        on="variantid2"
    )
)

In [17]:
features.head(2)

Unnamed: 0,target,variantid1,variantid2,name1,categories1,color_parsed1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name2,categories2,color_parsed2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2
0,0.0,51197862,51198054,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...",Удлинитель TDM Electric Люкс УЛ05В 1.5 м (SQ13...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.42941108, -0.5129398, -0.4753536, -0.0677...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","{""Электробезопасность"":[""Заземление""],""Длина к..."
1,0.0,51197862,51199884,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...",Удлинитель TDM Electric Люкс УЛ05В 3 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],,"[[-0.43180764, -0.49580905, -0.5062628, -0.130...","[-0.5425725, 0.6415736, 0.51481575, -0.5687392...","{""Макс. нагрузка, Вт"":[""3500""],""Стандарт защит..."


In [18]:
# Товары категорий, отличных от эектроники действительно используются
print(features["categories1"].apply(lambda x: json.loads(x)["2"]).unique(),
    features["categories2"].apply(lambda x: json.loads(x)["2"]).unique())

['Электроника' 'Строительство и ремонт' 'Детские товары'] ['Электроника' 'Хобби и творчество' 'Детские товары' 'Дом и сад'
 'Спорт и отдых' 'Автотовары' 'Строительство и ремонт'
 'Канцелярские товары' 'Товары для животных' 'Бытовая химия'
 'Бытовая техника' 'Товары для взрослых' 'Галантерея и украшения']
