# 8200 public score

In [0]:
from IPython.display import clear_output
from google.colab import drive
import os
import numpy as np
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/kaggle/finnet/last_try')
!pip install catboost
!pip install implicit
clear_output()
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostClassifier
import networkx as nx
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import seaborn as sns
from implicit.als import AlternatingLeastSquares
import gc

In [0]:
# fix seeds
s = 42
from numpy.random import seed
seed(s)

## Загрузим данные

In [4]:
%%time
edges = pd.read_csv('../data/edges.csv')
ids = pd.read_csv('../data/ids.csv')
vertices = pd.read_csv('../data/vertices.csv')

CPU times: user 2.32 s, sys: 273 ms, total: 2.6 s
Wall time: 2.78 s


In [0]:
# replace old region codes with new
old_new_regions = {
    81: 59,
    85: 38,
    99: 77,   
}
vertices['region_code'] = vertices['region_code'].replace(old_new_regions)

# encode categorical vars
vertices['main_okved'] = vertices['main_okved'].astype(str)
vertices['region_code'] = vertices['region_code'].astype('uint8')
vertices['pre_main_okved'] = vertices['main_okved'].agg([lambda x: x.split(".")[0]])
vertices['post_main_okved'] = vertices['main_okved'].agg([lambda x: x.split(".")[1]])
edges['av_transaction'] = edges['value'] / edges['n_transactions']

In [0]:
inv_edges = edges[['id_2', 'id_1'] + edges.columns[2:].to_list()]
inv_edges.rename({'id_2': 'id_1', 'id_1': 'id_2'}, axis=1, inplace=True)
bi_edges = pd.concat([edges, inv_edges])

In [0]:
if 'als_df.pkl' not in os.listdir():
    graph = nx.from_pandas_edgelist(edges.rename({'value': 'weight'}, axis=1), "id_1", "id_2", "weight")
    graph.add_nodes_from(vertices['id'])
    graph = graph.to_undirected()

    adj_matrix = nx.adjacency_matrix(graph, nodelist=vertices['id'].to_list())
    als = AlternatingLeastSquares(
        factors=128,
        use_gpu=True,
        calculate_training_loss=True,
        iterations=30
    )

    als.fit(adj_matrix)
    als_df = pd.DataFrame(np.concatenate((als.item_factors, als.user_factors), axis=1))
    als_df.columns = [f"als_{i}" for i in range(als_df.shape[1])]
    als_df['id'] = als_df.index + 1
    als_df.to_pickle('als_df.pkl')
else:
    als_df = pd.read_pickle('als_df.pkl')

als_features = list(als_df.columns[als_df.columns.str.startswith('als_')])

In [0]:
ver_w_features = vertices.copy()
ver_w_features = pd.merge(ver_w_features, als_df, how='left', on='id')

In [0]:
feature_edges = bi_edges[~bi_edges['id_2'].isin(ids['id'])]

agg_funcs = ['mean', 'max', 'size']
agg_features = feature_edges.groupby('id_1')[['value', 'n_transactions', 'av_transaction']].agg(agg_funcs)
agg_features.columns = ['_'.join(i) for i in agg_features.columns]
agg_features.drop(['value_size', 'av_transaction_size'], axis=1, inplace=True)
agg_features_cols = agg_features.columns.to_list()
agg_features = agg_features.reset_index()

In [0]:
ver_w_features = pd.merge(ver_w_features, agg_features, how='left', left_on='id', right_on='id_1').drop('id_1', axis=1)

In [11]:
del als_df, edges, vertices, inv_edges;
gc.collect()

12

## Обучим модель

In [0]:
def get_booster_params(ver_id):
    default_params = {
        "loss_function": 'Logloss',
        "border_count": 32,
        "iterations": 333,
        "verbose": 250,
        "task_type": "GPU",
        "random_seed": s,
    }        
    return default_params

In [13]:
num_features = als_features + agg_features_cols
cat_features = ['main_okved', 'region_code', 'company_type', 'pre_main_okved', 'post_main_okved']
features = num_features + cat_features

result = pd.DataFrame(columns=['id_1', 'id_2', 'goodness'])


for i in tqdm_notebook(ids["id"]):
    train = bi_edges[bi_edges['id_1'] == i]
    train['target'] = 1
    train = pd.merge(ver_w_features, train[['id_2', 'target']], how='left', left_on='id', right_on='id_2')
    train.drop('id_2', inplace=True, axis=1)
    train.rename({'id': 'id_2'}, axis=1, inplace=True)
    train["target"] = train["target"].fillna(0)
    
    X = train[features]
    y = train['target']
    
    model = CatBoostClassifier(**get_booster_params(i))
    print(f"Fitting vertice with id: {i}")
    model.fit(X, y, cat_features=cat_features)

    preds = model.predict_proba(X)[:,1]

    train['id_1'] = i
    train['goodness'] = preds
    
    res = train[train['target'] != 1].sort_values(by='goodness', ascending=False).iloc[:2000]
    res = res[['id_1', 'id_2', 'goodness']]
    result = result.append(res, ignore_index=True, sort=False)

    del res, preds, train, y
    fea_imp = pd.DataFrame({'imp': model.feature_importances_, 'col': X.columns})
    del X, model
    fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
    fea_imp.plot(kind='barh', x='col', y='imp', figsize=(10, 7), legend=None)
    plt.title('Importances')
    plt.ylabel('Features')
    plt.xlabel('Importance')
    plt.show();
    gc.collect()

Output hidden; open in https://colab.research.google.com to view.

## Результат готов к отправке

In [0]:
def drop_same_edges(res):
    res = nx.to_pandas_edgelist(nx.from_pandas_edgelist(res, 'id_1', 'id_2', 'goodness')).rename({'source':'id_1', 'target':'id_2'}, axis=1)
    return res 

def write_sub(res, fp='subs/submission.csv'):
    res.sort_values(by='goodness', ascending=False)[['id_1', 'id_2']].head(100000).to_csv(fp, index=False)

In [0]:
fp = 'subs/333iters_256emb_2000_als_bc32.csv'

write_sub(drop_same_edges(result), fp)

In [16]:
s = pd.read_csv(fp)
inv_edges = s[['id_2', 'id_1'] + s.columns[2:].to_list()]
inv_edges.rename({'id_2': 'id_1', 'id_1': 'id_2'}, axis=1, inplace=True)
bi_edges = pd.concat([s, inv_edges])
print(bi_edges[bi_edges['id_1'].isin(ids['id'])]['id_1'].value_counts().to_string())

524354     2001
361401     2000
61537      2000
1319172    2000
1154568    2000
1189202    2000
58408      2000
300432     2000
41216      2000
1301544    2000
640944     1970
1487070    1883
776150     1840
1209828    1805
1408687    1773
929264     1748
1292407    1720
83777      1665
1380777    1660
206473     1648
1434836    1578
83818      1484
341670     1479
1443535    1466
596221     1344
457698     1342
1063763    1329
373737     1313
1292275    1309
324065     1302
227665     1295
1392226    1269
966969     1263
194554     1258
722645     1256
983180     1243
1052575    1192
1136487    1182
598328     1153
47929      1134
1160709    1099
995150     1088
52803      1071
831789     1015
1227912    1009
676805     1001
409925      991
1244877     963
1025839     954
1378192     930
550468      912
846476      909
1203323     874
1471693     813
258675      799
1356407     790
1427428     785
55389       782
1500415     765
1297265     761
800360      738
1392199     695
1202376 