# 8200 public score aka best result

In [0]:
from IPython.display import clear_output
from google.colab import drive
import os
import numpy as np
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/kaggle/finnet/last_try')
!pip install catboost
clear_output()
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostClassifier
import networkx as nx
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [0]:
# Seed NumPy's global random generator; `s` is also read later as the
# CatBoost `random_seed`.
s = 42
np.random.seed(s)

## Загрузим данные

In [4]:
%%time
edges = pd.read_csv('../data/edges.csv')
ids = pd.read_csv('../data/ids.csv')
vertices = pd.read_csv('../data/vertices.csv')

CPU times: user 2.29 s, sys: 250 ms, total: 2.54 s
Wall time: 6.9 s


In [0]:
# Replace old region codes with their new equivalents.
old_new_regions = {
    81: 59,
    85: 38,
    99: 77,
}
vertices['region_code'] = vertices['region_code'].replace(old_new_regions)

# Encode categorical variables.
vertices['main_okved'] = vertices['main_okved'].astype(str)
vertices['region_code'] = vertices['region_code'].astype('uint8')

# Split the OKVED code on "." once and take the first two components.
# The vectorised .str accessor replaces the element-wise lambda that was being
# pushed through Series.agg; a code without a "." now yields an empty string
# for the second component instead of raising IndexError. The fallback is a
# string so the column stays usable as a categorical feature.
okved_parts = vertices['main_okved'].str.split('.')
vertices['pre_main_okved'] = okved_parts.str[0]
vertices['post_main_okved'] = okved_parts.str.get(1).fillna('')

# Average value of a single transaction on each edge.
edges['av_transaction'] = edges['value'] / edges['n_transactions']

In [0]:
# Mirror every edge (swap the two endpoint columns) and stack the mirrored
# copy under the original so each edge is present in both directions.
# As in the positional slice below, the first two columns of `edges` are taken
# to be the endpoint ids.
swapped_ids = {'id_1': 'id_2', 'id_2': 'id_1'}
inv_edges = edges.rename(columns=swapped_ids)[['id_1', 'id_2', *edges.columns[2:]]]
bi_edges = pd.concat([edges, inv_edges])

In [0]:
# The cache lives in the working directory. One relative path is used for the
# existence check, the write and the read, so a cache produced by one run is
# actually found by the next (it used to be written to the filesystem root).
emb_cache_path = 'embeddings256.pkl'

if not os.path.exists(emb_cache_path):
    model = KeyedVectors.load_word2vec_format("n2v256.bin")
    n_vertices = len(vertices)

    # Column 0 holds the vertex id, the remaining columns hold the embedding.
    # Rows are filled in a NumPy buffer and wrapped in a DataFrame once, which
    # avoids millions of per-cell DataFrame writes.
    # NOTE(review): assumes vertex ids are exactly 1..len(vertices), matching
    # the previously hard-coded upper bound - confirm against vertices['id'].
    emb_matrix = np.zeros((n_vertices, model.vector_size + 1))
    emb_matrix[:, 0] = np.arange(1, n_vertices + 1)

    for i in tqdm_notebook(range(1, n_vertices + 1)):
        key = str(i)
        # Vertices missing from the embedding model keep an all-zero vector.
        if key in model:
            emb_matrix[i - 1, 1:] = model[key]

    embs = pd.DataFrame(
        emb_matrix,
        columns=['id'] + [f"e_{i}" for i in range(model.vector_size)],
    )
    embs.to_pickle(emb_cache_path)
else:
    # NOTE: pickle files can execute code on load; only read a cache this
    # notebook produced itself.
    embs = pd.read_pickle(emb_cache_path)

emb_features = list(embs.columns[embs.columns.str.startswith('e_')])

In [0]:
# Attach the embedding columns to every vertex row (left join on the vertex id);
# the join returns a new frame, so `vertices` itself is left unchanged.
ver_w_features = vertices.merge(embs, how='left', on='id')

In [0]:
# Keep only the edges whose far endpoint (id_2) is not one of the target ids.
points_to_target = bi_edges['id_2'].isin(ids['id'])
feature_edges = bi_edges.loc[~points_to_target]

# Per-vertex mean / max / count over the numeric edge attributes.
agg_funcs = ['mean', 'max', 'size']
edge_stats = (
    feature_edges
    .groupby('id_1')[['value', 'n_transactions', 'av_transaction']]
    .agg(agg_funcs)
)

# Flatten the (column, function) MultiIndex into "column_function" names.
edge_stats.columns = ['_'.join(pair) for pair in edge_stats.columns]

# The three *_size columns are identical counts; keep only n_transactions_size.
edge_stats = edge_stats.drop(columns=['value_size', 'av_transaction_size'])

agg_features_cols = list(edge_stats.columns)
agg_features = edge_stats.reset_index()

In [0]:
# Left-join the per-vertex edge aggregates onto the vertex table, then discard
# the join key that came from the aggregate side.
ver_w_features = (
    ver_w_features
    .merge(agg_features, how='left', left_on='id', right_on='id_1')
    .drop(columns='id_1')
)

In [11]:
# Drop the intermediate frames that are no longer referenced below, then ask
# the garbage collector to run.
del embs
del edges
del vertices
del inv_edges
gc.collect()

12

## Обучим модель

In [0]:
def get_booster_params(ver_id):
    """Return the keyword arguments used to construct a CatBoost classifier.

    `ver_id` is accepted but not consulted: every vertex gets the same
    settings. `random_seed` is taken from the notebook-level variable `s`
    at call time. A fresh dict is returned on every call.
    """
    return dict(
        loss_function='Logloss',
        border_count=32,
        iterations=500,
        verbose=250,
        task_type="GPU",
        random_seed=s,
    )

In [13]:
num_features = emb_features + agg_features_cols
cat_features = ['main_okved', 'region_code', 'company_type', 'pre_main_okved', 'post_main_okved']
features = num_features + cat_features

# Number of highest-scoring non-neighbour candidates kept per target vertex.
candidates_per_vertex = 2000

# Per-vertex candidate frames are collected in a list and concatenated once
# after the loop: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop copies it on every iteration.
result_parts = []

# One binary classifier is fitted per target vertex: "is this vertex a
# neighbour of vertex i?". It is then scored on the same rows it was fitted on,
# and the best-scoring rows that are NOT already neighbours are kept.
for i in tqdm_notebook(ids["id"]):
    # Existing neighbours of vertex i form the positive class. `.assign`
    # returns a new frame, so no value is written into a filtered slice.
    positives = bi_edges.loc[bi_edges['id_1'] == i, ['id_2']].assign(target=1)

    train = pd.merge(ver_w_features, positives, how='left', left_on='id', right_on='id_2')
    train = train.drop('id_2', axis=1).rename({'id': 'id_2'}, axis=1)
    # Vertices without an edge to i got NaN from the left join -> negatives.
    train["target"] = train["target"].fillna(0)

    X = train[features]
    y = train['target']

    model = CatBoostClassifier(**get_booster_params(i))
    print(f"Fitting vertice with id: {i}")
    model.fit(X, y, cat_features=cat_features)

    preds = model.predict_proba(X)[:, 1]

    train['id_1'] = i
    train['goodness'] = preds

    res = (
        train[train['target'] != 1]
        .sort_values(by='goodness', ascending=False)
        .iloc[:candidates_per_vertex]
    )
    res = res[['id_1', 'id_2', 'goodness']]
    result_parts.append(res)

    del res, preds, train, y

    # Plot the 30 most important features of this vertex's model.
    fea_imp = pd.DataFrame({'imp': model.feature_importances_, 'col': X.columns})
    del X, model
    fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
    plt.title('Importances')
    plt.ylabel('Features')
    plt.xlabel('Importance')
    plt.show()
    gc.collect()

# Assemble the final candidate table; fall back to an empty frame with the
# expected columns when there were no target ids to process.
if result_parts:
    result = pd.concat(result_parts, ignore_index=True, sort=False)
else:
    result = pd.DataFrame(columns=['id_1', 'id_2', 'goodness'])

Output hidden; open in https://colab.research.google.com to view.

## Результат готов к отправке

In [0]:
def drop_same_edges(res):
    """Round-trip an edge list through a networkx graph to remove repeated edges.

    `res` must have `id_1`, `id_2` and `goodness` columns. The graph is built
    with networkx's default graph type and converted straight back, with the
    `source` / `target` columns renamed to `id_1` / `id_2`.
    NOTE(review): presumably the default graph is undirected, so (a, b) and
    (b, a) collapse into one edge and the goodness that survives is whichever
    was inserted last - confirm against the networkx documentation.
    """
    graph = nx.from_pandas_edgelist(res, source='id_1', target='id_2', edge_attr='goodness')
    edge_list = nx.to_pandas_edgelist(graph)
    return edge_list.rename(columns={'source': 'id_1', 'target': 'id_2'})

def write_sub(res, fp='subs/submission.csv', top_n=100000):
    """Write the best-scoring edges to a submission CSV.

    Parameters
    ----------
    res : DataFrame with `id_1`, `id_2` and `goodness` columns.
    fp : destination path; missing parent directories are created.
    top_n : maximum number of rows written (highest `goodness` first).

    Only the `id_1` and `id_2` columns are written, without the index.
    """
    # Create the target directory up front so a fresh checkout without the
    # output folder does not fail at write time. A bare file name has an empty
    # dirname, which os.makedirs would reject, hence the guard.
    out_dir = os.path.dirname(fp)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ranked = res.sort_values(by='goodness', ascending=False)
    ranked[['id_1', 'id_2']].head(top_n).to_csv(fp, index=False)

In [0]:
# Deduplicate the candidate edges, then write the submission file.
fp = 'subs/boosting_exp500.csv'

unique_edges = drop_same_edges(result)
write_sub(unique_edges, fp)

In [16]:
# Sanity check: count how many submitted edges touch each target vertex, with
# either endpoint counted.
# Dedicated names are used on purpose. This cell used to rebind `s` (the random
# seed read by get_booster_params) as well as `bi_edges` / `inv_edges`, which
# broke any later re-run of the training cell.
submission = pd.read_csv(fp)
submission_inv = submission[['id_2', 'id_1'] + submission.columns[2:].to_list()]
submission_inv = submission_inv.rename({'id_2': 'id_1', 'id_1': 'id_2'}, axis=1)
submission_bi = pd.concat([submission, submission_inv])

touches_target = submission_bi['id_1'].isin(ids['id'])
print(submission_bi.loc[touches_target, 'id_1'].value_counts().to_string())

61537      2001
1301544    2001
361401     2001
1292407    2000
258315     2000
83777      2000
1142564    2000
1263393    2000
1209828    2000
206473     2000
1147550    2000
800360     2000
955991     2000
214507     2000
713665     2000
409925     2000
1319172    2000
300432     2000
524354     2000
83818      2000
341670     2000
1203323    1930
785334     1870
929264     1858
386565     1822
640944     1651
41216      1615
1523148    1611
936788     1592
1244877    1457
776150     1432
1408687    1427
1136487    1408
373737     1312
1342003    1254
1392226    1247
1154568    1165
1471693    1157
1378192    1128
596221     1124
457698     1116
127685     1107
1487070    1101
227665     1023
550468      976
1380777     967
1443535     954
1289618     942
1292275     886
983180      885
1052575     881
331678      801
52803       791
1189202     740
676805      736
319548      735
1025839     716
1356407     713
1160709     701
966969      669
1427428     668
1434836     641
55389   