# Разработка модели ранжирования

Цель нашей работы — разработать релевантную модель, способную предоставлять пять похожих товаров из базы данных в ответ на запрос по одному товару.

В нашем распоряжении:

base.csv - анонимизированный набор товаров.

Каждый товар представлен как уникальный id (0-base, 1-base, 2-base) и вектор признаков размерностью 72.

train.csv - обучающий датасет. 

Каждая строчка - один товар, для которого известен уникальный id (0-query, 1-query, …), вектор признаков и id товара из base.csv, который максимально похож на него (по мнению экспертов).

validation.csv - датасет с товарами (уникальный id и вектор признаков), для которых надо найти наиболее близкие товары из base.csv.


validation_answer.csv - правильные ответы к предыдущему файлу.

## Выгрузка данных и знакомство с данными

Отразим используемые модули в рамках одной ячейки для удобства.

In [1]:
import faiss
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the item base, the validation queries and their expected answers.
# In every file the first CSV column becomes the row index.
# NOTE(review): base.csv is read from a machine-specific local path, so this
# cell only runs on the author's computer as written.
csv_options = {'index_col': 0}
base = pd.read_csv('C://Users//roma_//OneDrive//Рабочий стол//DATA//Проект Мастерская 2//base.csv', **csv_options)
validation = pd.read_csv('https://raw.githubusercontent.com/romakulikov/workshop2/main/validation.csv', **csv_options)
validation_answer = pd.read_csv('https://raw.githubusercontent.com/romakulikov/workshop2/main/validation_answer.csv', **csv_options)

In [3]:
# Load the training queries; the first CSV column becomes the row index.
train_url = 'https://raw.githubusercontent.com/romakulikov/workshop2/main/train.csv'
train = pd.read_csv(train_url, index_col=0)


In [4]:
# Preview the last rows of the item base.
base.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4744755-base,-125.0863,4.735448,38.284935,-125.27671,209.78836,-47.87001,-628.127378,-71.07734,125.534355,29.34185,...,-70.364395,55.42826,-191.53702,122.978279,65.51526,108.963715,-52.79538,117.25451,-615.998268,-77.768555
4744759-base,-108.59839,5.454173,-22.191975,-158.71666,113.60611,-88.84343,-117.234538,137.31769,129.05669,164.75424,...,-70.36019,18.439003,-90.65145,-28.586794,68.691666,1.636499,-24.388298,93.53054,-1074.464888,-90.8004
4744762-base,-97.02673,-2.143885,-71.27409,-144.68066,149.46481,-21.572212,-759.626065,-108.151955,124.810135,118.7607,...,-55.961933,62.92347,-169.11832,108.452806,68.59297,43.205433,69.02983,129.53665,-1074.464888,4.7183
4744763-base,-86.83843,8.911945,-34.837738,-106.46268,42.236572,-96.19423,-532.907338,9.845185,126.03756,59.2231,...,-86.86702,-6.959065,-174.07591,-151.007973,68.622246,0.707927,94.451866,124.843,-1074.464888,-55.789314
4744766-base,-101.96527,12.8595,-51.393135,-168.21826,101.06511,-71.9146,34.734278,-5.394417,135.52376,125.36194,...,-67.40468,72.963425,-147.8587,-5.081259,69.68872,-20.186157,84.47911,92.833786,-1073.82617,14.774761


In [5]:
# Preview the first rows of the training queries (features plus "Target").
train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,Target
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-query,-53.882748,17.971436,-42.117104,-183.93668,187.51749,-87.14493,-347.360606,38.307602,109.08556,30.413513,...,70.10736,-155.80257,-101.965943,65.90379,34.4575,62.642094,134.7636,-415.750254,-25.958572,675816-base
1-query,-87.77637,6.806268,-32.054546,-177.26039,120.80333,-83.81059,-94.572749,-78.43309,124.9159,140.33107,...,4.669178,-151.69771,-1.638704,68.170876,25.096191,89.974976,130.58963,-1035.092211,-51.276833,366656-base
2-query,-49.979565,3.841486,-116.11859,-180.40198,190.12843,-50.83762,26.943937,-30.447489,125.771164,211.60782,...,78.039764,-169.1462,82.144186,66.00822,18.400496,212.40973,121.93147,-1074.464888,-22.547178,1447819-base
3-query,-47.810562,9.086598,-115.401695,-121.01136,94.65284,-109.25541,-775.150134,79.18652,124.0031,242.65065,...,44.515266,-145.41675,93.990981,64.13135,106.06192,83.17876,118.277725,-1074.464888,-19.902788,1472602-base
4-query,-79.632126,14.442886,-58.903397,-147.05254,57.127068,-16.239529,-321.317964,45.984676,125.941284,103.39267,...,45.02891,-196.09207,-117.626337,66.92622,42.45617,77.621765,92.47993,-1074.464888,-21.149351,717819-base


In [6]:
# (rows, columns) of the training table; "Target" is still included here.
train.shape

(100000, 73)

In [7]:
# Detach the expert-assigned base id from the training table; `train` is
# modified in place and keeps only the feature columns afterwards.
target_train = train.pop("Target")

In [8]:
# Feature column names, reused below to scale train, validation and base.
col = list(train.columns)

In [9]:
# Preview the first rows of the validation queries.
validation.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000-query,-57.372734,3.597752,-13.213642,-125.92679,110.74594,-81.279594,-461.003172,139.81572,112.88098,75.21575,...,-75.51302,52.830902,-143.43945,59.051935,69.28224,61.927513,111.59253,115.140656,-1099.130485,-117.07936
100001-query,-53.758705,12.7903,-43.268543,-134.41762,114.44991,-90.52013,-759.626065,63.995087,127.117905,53.128998,...,-79.44183,29.185436,-168.6059,-82.872443,70.7656,-65.97595,97.07716,123.39164,-744.442332,-25.00932
100002-query,-64.175095,-3.980927,-7.679249,-170.16093,96.44616,-62.37774,-759.626065,87.477554,131.27011,168.92032,...,-134.79541,37.36873,-159.66231,-119.232725,67.71044,86.00206,137.63641,141.08163,-294.052271,-70.969604
100003-query,-99.28686,16.123936,9.837166,-148.06044,83.69708,-133.72972,58.576403,-19.04666,115.042404,75.20673,...,-77.23611,44.100494,-132.53012,-106.318982,70.88396,23.577892,133.18396,143.25294,-799.363667,-89.39267
100004-query,-79.53292,-0.364173,-16.027431,-170.88495,165.45392,-28.291668,33.931936,34.411217,128.90398,102.086914,...,-123.77025,45.635944,-134.25893,13.735359,70.61763,15.332115,154.56812,101.70064,-1171.892332,-125.30789


In [10]:
# (rows, columns) of the validation table.
validation.shape

(100000, 72)

In [11]:
# Preview the expected base ids for the validation queries.
validation_answer.head()

Unnamed: 0_level_0,Expected
Id,Unnamed: 1_level_1
100000-query,2676668-base
100001-query,91606-base
100002-query,472256-base
100003-query,3168654-base
100004-query,75484-base


In [12]:
# Standardise the features: the scaler is fitted on the training queries
# only and then applied, in place, to train, validation and base alike.
scaler = StandardScaler().fit(train)

for frame in (train, validation, base):
    frame[col] = scaler.transform(frame[col])

In [13]:
# Vector dimensionality (number of base columns) and the number of
# candidates requested from the index for every query.
_, dim = base.shape
topn = 15

In [14]:
# Build the FAISS index described by the factory string below.
index_spec = "IVF1000,Flat"
index = faiss.index_factory(dim, index_spec)

In [15]:
# Convert the base to a contiguous float32 matrix once, train the index on
# its first 100000 rows and then add every base vector to it.
base_vectors = np.ascontiguousarray(base.values).astype('float32')

index.train(base_vectors[:100000, :])
index.add(base_vectors)

# Drop the temporary copy so it does not stay in memory.
del base_vectors

In [16]:
# Map the positional row number of a base vector to its string id, so that
# the positions returned by the index can be translated back to ids.
base_index = dict(enumerate(base.index.to_list()))

In [17]:
# Retrieve the `topn` nearest base vectors for every training query:
# `dist_faiss` holds the distances, `idx` the positional base row numbers.
train_vectors = np.ascontiguousarray(train.values).astype('float32')
dist_faiss, idx = index.search(train_vectors, topn)

In [18]:
# Share (in percent) of training queries whose expert-chosen base id is
# among the `topn` candidates returned by the index.
acc = sum(
    int(target in [base_index[r] for r in el])
    for target, el in zip(target_train.values.tolist(), idx.tolist())
)

print(100 * acc / len(idx))

53.529


In [19]:
# NOTE(review): this name is undefined, so the cell raises NameError.
# Presumably a deliberate "stop here" marker for "Run All" — confirm, and
# remove it before running the cells below in sequence.
efwegewgerh

NameError: name 'efwegewgerh' is not defined

In [21]:
# Size factor for the ranking training set: the cells below use the first
# X * topn queries, each contributing topn candidate rows.
X = 4000 

In [None]:
# Number of training queries used to build the ranking data set.
X*topn

In [22]:
# Flatten the FAISS distances of the first X * topn queries into one list,
# query by query (topn consecutive values per query).
# `sum(list_of_lists, [])` copies the growing result on every step and is
# quadratic in the number of rows; flattening the array gives the same
# values in linear time.
dist = dist_faiss[:X * topn].ravel().tolist()

In [23]:
# Binary label for every (query, candidate) pair: 1 when the candidate is
# the expert-chosen base item for that query, 0 otherwise.
target_ml = []
for true_id, candidate_rows in zip(target_train.values.tolist()[:X*topn], idx.tolist()[:X*topn]):
    target_ml.extend(int(base_index[r] == true_id) for r in candidate_rows)

In [24]:
# скалярное произведение векторов
scalar_vs = [np.dot(base.loc[base_index[r]].values, t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]
# длина исходного ветктора 
length_vector1 = [np.linalg.norm(t) for t in train.values.tolist()[:X*topn] for _ in range(topn)]
# длина найденного вектора
length_vector2 = [np.linalg.norm(base.loc[base_index[r]].values) for i in idx.tolist()[:X*topn] for r in i]
# cos угла между векторами
cos_angle = [np.dot(base.loc[base_index[r]].values, t) /
                         (np.linalg.norm(base.loc[base_index[r]].values) * np.linalg.norm(t)) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]
# угол между векторами
angle_vs = [np.arccos(np.dot(base.loc[base_index[r]].values, t) /
                         (np.linalg.norm(base.loc[base_index[r]].values) * np.linalg.norm(t))) 
            for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

KeyboardInterrupt: 

In [None]:
# угол между векторами в градусах
angle_degrees = [np.degrees(np.arccos(np.dot(base.loc[base_index[r]].values, t) 
                                      / (np.linalg.norm(base.loc[base_index[r]].values) 
                                         * np.linalg.norm(t)))) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:2]) for r in i]
# косинусное расстояние
cos_distance = [1 - np.dot(base.loc[base_index[r]].values, t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

# евклидово растояние
euclidean_distance = [np.linalg.norm(base.loc[base_index[r]].values - t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

# сумма сумм элементов
dif_sum = [np.sum(base.loc[base_index[r]].values) - np.sum(t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

In [None]:
# разность сумм элементов
sum_sums = [np.sum(base.loc[base_index[r]].values) + np.sum(t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

# минимальное значение суммы элементов
min_sum_el = [np.min(base.loc[base_index[r]].values + t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

# максимальное значение суммы элементов
max_sum_el = [np.max(base.loc[base_index[r]].values + t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

# среднее значение сумм элементов 
mean_value_el = [np.mean(base.loc[base_index[r]].values) + np.mean(t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i] 

# стандартное отклонение 
std_value = [np.std(base.loc[base_index[r]].values) + np.std(t) for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:2]) for r in i]

# корреляция
correlation = [np.corrcoef(base.loc[base_index[r]].values, t)[0, 1] for t, i in zip(train.values.tolist()[:X*topn], idx.tolist()[:X*topn]) for r in i]

In [None]:
# Assemble the ranking training table: one row per (query, candidate) pair,
# the binary label first and the engineered features after it.
pair_columns = {
    'target_ml': target_ml,
    'dist': dist,
    'scalar_vs': scalar_vs,
    'length_vector1': length_vector1,
    'length_vector2': length_vector2,
    'cos_angle': cos_angle,
    'angle_vs': angle_vs,
    'angle_degrees': angle_degrees,
    'cos_distance': cos_distance,
    'euclidean_distance': euclidean_distance,
    'dif_sum': dif_sum,
    'sum_sums': sum_sums,
    'min_sum_el': min_sum_el,
    'max_sum_el': max_sum_el,
    'mean_value_el': mean_value_el,
    'std_value': std_value,
    'correlation': correlation,
}
train_ml = pd.DataFrame(pair_columns)

In [None]:
# Display the assembled pairwise training table.
train_ml

In [None]:
# Column dtypes, non-null counts and memory usage of the pairwise table.
train_ml.info()

In [None]:
# NOTE(review): this name is undefined, so the cell raises NameError.
# Presumably a deliberate "stop here" marker for "Run All" — confirm, and
# remove it before running the cells below in sequence.
r3t3232 

In [None]:
# Separate the engineered features from the binary label column.
feature_train = train_ml.drop(columns='target_ml')
target_ml = train_ml['target_ml']

In [None]:
# Display the feature table that is fed to the ranking model.
feature_train

In [None]:
# Names of the feature columns to be scaled inside the pipeline.
num_columns = list(feature_train.columns)

In [None]:
# Preprocessing step: standardise the listed feature columns and pass any
# other column through unchanged.
scaling_step = (StandardScaler(), num_columns)
column_transformer = make_column_transformer(scaling_step, remainder='passthrough')

In [None]:
# Ranking model: a binary classifier scoring each (query, candidate) pair.
#
# Fixes relative to the original:
#   * a CatBoostRegressor was configured with a classification loss spelled
#     'logloss'; CatBoost loss names are case-sensitive ('Logloss') and a
#     regressor offers no `predict_proba`, which `accuracy5` relies on;
#   * `make_pipeline` auto-names its steps, so the `clf__*` keys of the
#     parameter grid did not address any step. The steps are named
#     explicitly so that `clf` refers to the model.
model = CatBoostClassifier(loss_function='Logloss', random_seed=42)
pipeline = Pipeline([('prep', column_transformer), ('clf', model)])

In [None]:
# Hyper-parameter grid for the `clf` pipeline step.
iterations = range(35, 40)
depth = range(6, 10)
# Explicit values instead of np.arange(0.5, 0.8, 0.1): with a float step
# arange may emit an extra, slightly-off endpoint (~0.8).
learning_rate = [0.5, 0.6, 0.7]


# The original estimator call placed a positional argument after a keyword
# argument (a SyntaxError) and used a regressor with a classification loss;
# a binary Logloss classifier needs no class-count argument.
params = [{
    'clf': [CatBoostClassifier(loss_function='Logloss', random_seed=42)],
    'clf__iterations': iterations,
    'clf__depth': depth,
    'clf__learning_rate': learning_rate
    }]


In [None]:
def accuracy5(feature, target, model=None, group_size=None, k=5):
    """Return the percentage of queries with a hit among the top-k candidates.

    Rows of ``feature`` are consumed in consecutive groups of ``group_size``
    (one group per query). Inside each group the rows are ranked by the
    predicted probability of the positive class, and the query counts as a
    hit when at least one of its ``k`` best rows has a non-zero ``target``.

    The original version never advanced its ``start``/``stop`` window (so it
    re-scored the first group on every iteration) and called the pandas-only
    ``sort_values`` on a NumPy array.

    Args:
        feature: Feature rows passed to ``model.predict_proba``.
        target: Binary labels aligned with the rows of ``feature``.
        model: Object with ``predict_proba``; defaults to the notebook-level
            ``pipeline``.
        group_size: Candidates per query; defaults to the notebook-level
            ``topn``.
        k: Number of best-scored candidates inspected per query.

    Returns:
        Hit rate in percent as a float; ``0.0`` when there is no complete
        group. Trailing rows that do not fill a whole group are ignored.

    Raises:
        ValueError: If ``group_size`` is not positive.
    """
    if model is None:
        model = pipeline
    if group_size is None:
        group_size = topn
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")

    # Column 1 holds the probability of the positive class.
    scores = np.asarray(model.predict_proba(feature))[:, 1]
    labels = np.asarray(target)

    n_groups = len(scores) // group_size
    if n_groups == 0:
        return 0.0

    used = n_groups * group_size
    scores = scores[:used].reshape(n_groups, group_size)
    labels = labels[:used].reshape(n_groups, group_size)

    # Positions of the k highest scores in every group (stable for ties).
    best = np.argsort(-scores, axis=1, kind='stable')[:, :k]
    hits = np.take_along_axis(labels, best, axis=1).sum(axis=1) > 0

    return float(100 * hits.mean())