In [1]:
import numpy as np
import pandas as pd
import scipy
import tqdm

import catboost as cat
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack


## Загрузим данные

In [2]:
%%time
# Load the three input tables (edge list, target ids, vertex attributes)
# from CSV files in the current working directory.
edges, ids, vertices = map(
    pd.read_csv,
    ('./edges.csv', './ids.csv', './vertices.csv'),
)

CPU times: user 3.24 s, sys: 994 ms, total: 4.23 s
Wall time: 11.6 s


In [3]:
# Cast the OKVED code to text so that it can be split on "." afterwards.
# NOTE(review): if read_csv parsed this column as a number, a code written as
# "41.20" in the file would already have become 41.2 before this cast —
# confirm against the raw CSV whether trailing zeros matter.
vertices['main_okved'] = vertices['main_okved'].astype(str)

In [4]:
# Split the textual OKVED code on "." and keep its first two components as
# separate categorical features.
okved_parts = vertices["main_okved"].str.split(".")
vertices["first_okved"] = okved_parts.str[0]
# `.str[1]` yields NaN instead of raising IndexError when a code has no "."
# (e.g. "nan" from a missing value, or a bare class such as "46"); an empty
# string keeps the column uniformly textual for the one-hot encoder.
vertices["second_okved"] = okved_parts.str[1].fillna("")

In [6]:
# Number of rows (vertices) in the table.
len(vertices)

1534749

In [5]:
# Display the vertex table, now including the derived first_okved / second_okved columns.
vertices

Unnamed: 0,id,main_okved,region_code,company_type,first_okved,second_okved
0,1,46.75,77,Limited,46,75
1,2,41.2,78,Limited,41,2
2,3,25.11,50,Limited,25,11
3,4,45.31,89,Limited,45,31
4,5,56.1,50,Limited,56,1
...,...,...,...,...,...,...
1534744,1534745,63.99,77,Individual,63,99
1534745,1534746,47.19,66,Individual,47,19
1534746,1534747,41.2,77,Individual,41,2
1534747,1534748,74.2,33,Individual,74,2


In [16]:
# Build a sparse feature matrix for all vertices: the remaining raw columns
# on the left, the one-hot encoded categorical columns on the right.
df = vertices[['region_code', 'first_okved', 'second_okved', 'company_type']]
enc = OneHotEncoder(handle_unknown='ignore')
df_oh_encoded = enc.fit_transform(df)

# Keep whatever is neither one-hot encoded nor the raw OKVED string.
# NOTE(review): judging by the displayed frame this leaves only `Unnamed: 0`
# and `id`, i.e. row identifiers — confirm they are meant to be features.
df = vertices.drop(['region_code', 'first_okved', 'second_okved', 'company_type', 'main_okved'], axis=1)
vertices_sparse = scipy.sparse.csr_matrix(df.values)
# Fix: `hsparse` is not defined anywhere (NameError); the function imported
# from scipy.sparse for column-wise stacking is `hstack`.
X = hstack([vertices_sparse, df_oh_encoded])

In [6]:
# Seed NumPy's global random number generator.
# NOTE(review): the CatBoostClassifier created later is not given a seed of
# its own — confirm whether this call is expected to cover it.
np.random.seed(7777)

## Обучим модель

In [6]:
# Empty frame that collects the predicted edges; it is the frame later
# written to submission.csv.
result = pd.DataFrame(columns=['id_1', 'id_2'])

In [8]:
# For every vertex listed in `ids`, fit a CatBoost classifier over all
# vertices (label 1 = already connected to that vertex, 0 = everything else)
# and keep the 1000 not-yet-connected vertices with the highest predicted
# probability as the predicted edges.
FEATURE_COLUMNS = ['region_code', 'first_okved', 'second_okved', 'company_type']
TOP_K = 1000  # number of predicted edges kept per vertex

# Loop-invariant work, hoisted out of the loop: the id-indexed vertex table
# and the encoder categories depend only on `vertices`, never on the vertex
# being processed. Missing values are filled with 0 exactly as inside the loop.
vertices_by_id = vertices.set_index('id')
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(vertices_by_id[FEATURE_COLUMNS].fillna(0))

# One small frame per vertex, concatenated once after the loop instead of
# re-copying an ever-growing `result` on every iteration.
predicted_edges = []

for i in tqdm.tqdm(ids.id):
    # Build the training set over all vertices. The vertex `i` may appear
    # either in id_1 or in id_2, so collect the opposite endpoint of every
    # edge touching it under the common column name `id_1`.
    neighbours_via_id_1 = edges.loc[edges['id_1'] == i, ['id_2']].rename(columns={'id_2': 'id_1'})
    neighbours_via_id_2 = edges.loc[edges['id_2'] == i, ['id_1']]
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    known = pd.concat([neighbours_via_id_1, neighbours_via_id_2], ignore_index=True)
    known['target'] = 1

    # Left join keeps every vertex; those without a known edge to `i` get
    # NaN for `target`, which fillna turns into 0.
    df = vertices_by_id.join(known.set_index('id_1')['target']).fillna(0)

    # One-hot encode the categorical features with the pre-fitted encoder.
    X = enc.transform(df[FEATURE_COLUMNS])
    y = df['target']

    model = CatBoostClassifier(iterations=100, verbose=False)
    model.fit(X, y)

    # Probability of the positive class ("has an edge with i") for every vertex.
    preds = model.predict_proba(X)[:, 1]

    df['preds'] = preds
    df['id_2'] = i

    # Keep the TOP_K most probable edges, excluding the ones already known.
    res = (
        df[df['target'] != 1]
        .sort_values(by='preds', ascending=False)
        .iloc[:TOP_K]
        .reset_index()[['id', 'id_2']]
    )
    res.columns = ['id_1', 'id_2']

    predicted_edges.append(res)

# Rebuild `result` from scratch so that re-running this cell does not append
# the same predictions twice; fall back to an empty frame when `ids` is empty
# (pd.concat refuses an empty list).
if predicted_edges:
    result = pd.concat(predicted_edges, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2'])

100%|██████████| 100/100 [1:06:26<00:00, 39.87s/it]


## Результат готов к отправке

In [9]:
# Write the predicted edges to disk without the positional index column.
result.to_csv('submission.csv', index=False)

In [10]:
# Display the vertex table once more after the run.
vertices

Unnamed: 0,id,main_okved,region_code,company_type,first_okved,second_okved
0,1,46.75,77,Limited,46,75
1,2,41.2,78,Limited,41,2
2,3,25.11,50,Limited,25,11
3,4,45.31,89,Limited,45,31
4,5,56.1,50,Limited,56,1
...,...,...,...,...,...,...
1534744,1534745,63.99,77,Individual,63,99
1534745,1534746,47.19,66,Individual,47,19
1534746,1534747,41.2,77,Individual,41,2
1534747,1534748,74.2,33,Individual,74,2
