## Подгружаем данные и импорты)

In [None]:
# Download the input files from Google Drive into the working directory. Per the captured
# output, in order: graph.csv, submission.csv, test2_X.csv, train_X.csv, train_y.csv.
!gdown 1_RQxjVFlve12NMTSyvEWlL6m9eylHmxU
!gdown 12NVI9hbSnFjfiT27d-FkaeLmkz3WgvwI
!gdown 1jkpplWIKV6IS7AHBPEPcgmiQC8T6RuuJ
!gdown 1V9X-iDGABK0njxTm6nVJPDH-cquHqJ8s
!gdown 1fXx465_ICgmZ1-9_Sl45w7GHLHzndyRo

Downloading...
From: https://drive.google.com/uc?id=1_RQxjVFlve12NMTSyvEWlL6m9eylHmxU
To: /content/graph.csv
100% 5.75M/5.75M [00:00<00:00, 37.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=12NVI9hbSnFjfiT27d-FkaeLmkz3WgvwI
To: /content/submission.csv
100% 131k/131k [00:00<00:00, 85.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1jkpplWIKV6IS7AHBPEPcgmiQC8T6RuuJ
To: /content/test2_X.csv
100% 40.0M/40.0M [00:01<00:00, 30.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1V9X-iDGABK0njxTm6nVJPDH-cquHqJ8s
To: /content/train_X.csv
100% 87.8M/87.8M [00:00<00:00, 111MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fXx465_ICgmZ1-9_Sl45w7GHLHzndyRo
To: /content/train_y.csv
100% 514k/514k [00:00<00:00, 132MB/s]


In [None]:
%pip install pyvis torch_geometric >> None

In [None]:
# other
from itertools import combinations, groupby
from tqdm import tqdm

# for graphs
from torch_geometric.utils import from_networkx
from torch_geometric.data import Batch

import networkx as nx
import torch_geometric.transforms as T

# for data
import pandas as pd
import numpy as np

# for visualazing
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn
import sklearn


## Стандартная предобработка графа

In [None]:
# Directory that holds the downloaded CSV files.
data_path = '/content'

# Read the edge list and drop the unnamed first column (presumably a saved index — confirm).
graph_csv_path = f'{data_path}/graph.csv'
graph = pd.read_csv(graph_csv_path)
graph = graph.drop(columns=['Unnamed: 0'])

In [None]:
# Mirror every edge by swapping its two endpoint columns. Mirrored rows are tagged with
# base_graph=1, rows of the frame as loaded with base_graph=0.
graph_inversed = graph.assign(
    contractor_id1=graph['contractor_id2'],
    contractor_id2=graph['contractor_id1'],
    base_graph=1,
)
graph['base_graph'] = 0

In [None]:
# Stack the loaded edges on top of their mirrored copies (row-wise concatenation).
edge_frames = [graph, graph_inversed]
graph_joined = pd.concat(edge_frames, axis='index')

In [None]:
# Keep the first occurrence of every (contractor_id1, contractor_id2, Distance) triple.
# The loaded rows come first in graph_joined, so a mirrored row is removed whenever an
# identical loaded row exists.
graph_joined = graph_joined.drop_duplicates(subset=['contractor_id1', 'contractor_id2', 'Distance'])
# From here on `graph` holds only the surviving mirrored rows (base_graph == 1).
# NOTE(review): an edge stored in both directions in graph.csv loses both of its mirrored
# copies above and therefore disappears from `graph` — confirm this is intended.
graph = graph_joined[graph_joined['base_graph'] == 1].drop(columns='base_graph')

In [None]:
# generating features
# Switch for (re)building the per-contractor distance statistics file; disabled here.
gen_distance_features = False

if gen_distance_features:
  # mean / std / median of the edge distances, grouped by the first endpoint
  graph_distaces_features = (
      graph_joined
      .groupby(['contractor_id1'], as_index=False)['Distance']
      .agg(['mean', 'std', 'median'])
      .rename({'contractor_id1' : 'contractor_id'}, axis=1)
  )
  # keep the id column name, prefix every statistic column
  id_column, *stat_columns = graph_distaces_features.columns
  graph_distaces_features.columns = [id_column] + [f'contractor_distance_{col}' for col in stat_columns]
  graph_distaces_features.to_csv('contractor_distances_features.csv', index=False)

## Стандартная предобработка табличных данных

In [None]:
from datetime import datetime

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def preprocess_data(train, test, delete_corr_features=True, scale=True, init_features2drop=None):
    """Clean the train/test feature frames using statistics computed on train only.

    Steps, in order:
      1. optionally drop the hard-coded ``ts_features2drop`` columns;
      2. drop every column whose NaN count in train exceeds 3/5 of the train rows;
      3. fill the remaining NaN-containing columns with the train median;
      4. for every object-dtype column of train, parse the values as ISO dates, add
         ``<col>_day`` and ``<col>_month`` columns and replace the column by its POSIX timestamp;
      5. optionally drop ``init_features2drop``.

    The input frames are never modified; new frames are returned.

    Args:
        train: training DataFrame.
        test: test DataFrame; must contain every column that is dropped or transformed.
        delete_corr_features: when True, drop the ``ts_features2drop`` columns from both frames.
        scale: accepted for interface compatibility; not used by the body.
        init_features2drop: optional list of columns to drop from both frames at the end.

    Returns:
        ``(train, test, constant_features_names)`` where the last item is the hard-coded list
        defined below, returned as-is.
    """

    ts_features2drop = ['agg_cec_requests__g_contract__total_sum_accepted__all__sum__6W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__8W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__5W', 'agg_all_contracts__g_contract__rel_change_price_last_ds__isMain__last__ALL_TIME', 'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__last__ALL_TIME', 'agg_all_contracts__g_contract__bit_da_guid__isMain__count__ALL_TIME', 'agg_all_contracts__g_contract__abs_change_price_last_ds__isMain__mean__ALL_TIME', 'agg_cec_requests__g_contract__request_id__all__count__8W', 'agg_cec_requests__g_contract__request_id__all__count__4W', 'agg_cec_requests__g_contract__request_id__all__count__5W', 'agg_cec_requests__g_contract__request_id__all__count__7W', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__TaxesSum__last__ALL_TIME', 'agg_FinanceAndTaxesFTS__g_contractor__TaxPenaltiesSum__last__ALL_TIME', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__ALL_TIME', 'counteragent_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 'counteragent_sum_agg_cec_requests__g_contract__request_id__all__count__ALL_TIME', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__ALL_TIME', 'agg_materials__g_contract__order_id__countDistinct__ALL_TIME', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__12M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__5M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__6M', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME', 'counteragent_sum_agg_cec_requests__g_contract__total_sum_accepted__all__sum__ALL_TIME', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__7M', 'agg_ks2__g_contract__id__all__count__ALL_TIME', 'agg_cec_requests__g_contract__request_id__all__count__12W', 'agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 
'agg_cec_requests__g_contract__time_btw_requests__all__mean__8M', 'specialization_sum_agg_ks2__g_contract__total_sum__all__sum__ALL_TIME', 'counteragent_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME', 'specialization_sum_agg_payments__g_contract__sum__all__sum__ALL_TIME', 'agg_workers__g_contract__fact_workers__all__mean__4W', 'agg_workers__g_contract__fact_workers__all__mean__3W', 'agg_workers__g_contract__fact_workers__all__mean__26W', 'agg_workers__g_contract__fact_workers__all__mean__6W', 'agg_workers__g_contract__fact_workers__all__mean__12W', 'agg_workers__g_contract__fact_workers__all__mean__8W', 'agg_workers__g_contract__fact_workers__all__mean__5W', 'agg_Finance__g_contractor__Value__CostPrice_y__last__ALL_TIME', 'agg_Finance__g_contractor__Value__NetProfit_y__last__ALL_TIME', 'agg_spass_applications__g_contract__appl_count_week__mean__6W', 'agg_spass_applications__g_contract__appl_count_week__mean__26W', 'agg_spass_applications__g_contract__appl_count_week__max__ALL_TIME', 'agg_spass_applications__g_contract__appl_count_week__mean__12W', 'agg_spass_applications__g_contract__appl_count_week__mean__ALL_TIME', 'agg_payments__g_contract__sum__all__countDistinct__4W', 'agg_spass_applications__g_contract__appl_count_week__mean__4W', 'agg_payments__g_contract__sum__all__countDistinct__2W', 'agg_scontrol__g_contractor__close_delay__defect_type_app__mean__ALL_TIME', 'agg_ks2__g_contract__id__all__count__2W', 'agg_ks2__g_contract__id__all__count__1W', 'agg_cec_requests__g_contract__total_sum_accepted__all__sum__4W', 'agg_ks2__g_contract__id__all__count__4W', 'agg_FinanceAndTaxesFTS__g_contractor__Income__last__ALL_TIME', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__8W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__26W', 'agg_tender_proposal__g_contractor__id__ALL__countDistinct__52W', 'agg_payments__g_contract__sum__all__countDistinct__12W', 'agg_payments__g_contract__sum__all__sum__8W', 
'agg_materials__g_contract__order_id__countDistinct__12W', 'agg_materials__g_contract__material_id__countDistinct__ALL_TIME', 'agg_materials__g_contract__order_id__countDistinct__8W', 'agg_payments__g_contract__sum__all__countDistinct__ALL_TIME', 'agg_payments__g_contract__date__advance__min__ALL_TIME', 'agg_payments__g_contract__sum__all__sum__ALL_TIME', 'agg_ks2__g_contract__total_sum__all__sum__8W', 'agg_ks2__g_contract__id__all__count__12W', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__3M', 'agg_cec_requests__g_contract__time_btw_requests__all__mean__4M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_24M', 'agg_ArbitrationCases__g_contractor__PlaintiffSum__sum__12_48M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_36M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_48M', 'agg_ArbitrationCases__g_contractor__DefendantSum__sum__12_24M', 'agg_Finance__g_contractor__Value__CurrentAssets__last__ALL_TIME', 'agg_Finance__g_contractor__Value__Balance__last__ALL_TIME', 'agg_Finance__g_contractor__Value__Capital__last__ALL_TIME', 'agg_workers__g_contract__fact_workers__all__mean__1W', 'agg_sroomer__g_contractor__sroomer_id__count__12M', 'agg_sroomer__g_contractor__sroomer_id__count__6M', 'agg_sroomer__g_contractor__sroomer_id__count__ALL_TIME', 'agg_sroomer__g_contractor__sroomer_id__count__3M', 'agg_ks2__g_contract__total_sum__all__sum__12W', 'agg_cec_requests__g_contract__created_dt__accepted__min__ALL_TIME', 'agg_payments__g_contract__sum__all__countDistinct__1W']
    constant_features_names = ['agg_BoardOfDirectors__g_contractor__Name__count__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__IndexOfDueDiligence__mean__ALL_TIME', 'agg_spark_extended_report__g_contractor__CreditLimitSum__last__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__Overall__mean__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__PaymentIndex__mean__ALL_TIME', 'agg_spark_extended_report__g_contractor__CompanySizeRevenue__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgeeActiveCount__last__ALL_TIME', 'contract_date', 'contract_init_sum', 'agg_spark_extended_report__g_contractor__PledgerActiveCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgeeCeasedCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__PledgerCeasedCount__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedNetLiabilitiesSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedClaimsSum__last__ALL_TIME', 'agg_spark_extended_report__g_contractor__EstimatedLiabilitiesSum__last__ALL_TIME', 'agg_ConsolidatedIndicator__g_contractor__Index__FailureScore__mean__ALL_TIME']

    if delete_corr_features:
        train = train.drop(columns=ts_features2drop)
        test = test.drop(columns=ts_features2drop)

    # Drop the columns whose NaN count in train exceeds 60% (3/5) of the train rows.
    nan_counts = train.isna().sum()
    isna_stat = nan_counts[nan_counts > 0]
    max_allowed_nans = train.shape[0] * (3/5)
    features2drop = [col for col in isna_stat.index if isna_stat[col] > max_allowed_nans]
    # Non-inplace drops return new frames, so the caller's objects stay untouched even when
    # delete_corr_features is False; the column assignments below then act on these new frames.
    train = train.drop(columns=features2drop)
    test = test.drop(columns=features2drop)

    # The remaining NaN-containing columns are filled with the median fitted on train only.
    dropped = set(features2drop)
    for col in isna_stat.index:
        if col in dropped:
            continue
        imp = SimpleImputer(strategy="median").fit(train[col].values.reshape(-1, 1))
        train[col] = imp.transform(train[col].values.reshape(-1, 1))
        test[col] = imp.transform(test[col].values.reshape(-1, 1))

    # Every object-dtype column of train is treated as an ISO-formatted date string.
    dates_columns = train.select_dtypes('object').columns

    for date_col in dates_columns:
        for frame in (train, test):
            # Parse once per value; datetime.timestamp() reads naive values as local time.
            parsed = [datetime.fromisoformat(value) for value in frame[date_col]]
            frame[f"{date_col}_day"] = [moment.day for moment in parsed]
            frame[f"{date_col}_month"] = [moment.month for moment in parsed]
            frame[date_col] = [moment.timestamp() for moment in parsed]

    if init_features2drop is not None:
        train = train.drop(columns=init_features2drop)
        test = test.drop(columns=init_features2drop)

    return train, test, constant_features_names

In [None]:
root_data_path = "/content"

# Raw feature tables and the label table.
X_train = pd.read_csv(f"{root_data_path}/train_X.csv")
test = pd.read_csv(f"{root_data_path}/test2_X.csv")
y_train_all = pd.read_csv(f"{root_data_path}/train_y.csv")

# Attach the labels to the train rows by (contract_id, report_date); the left join keeps
# every feature row.
merge_keys = ["contract_id", "report_date"]
train = pd.merge(X_train, y_train_all, how="left", on=merge_keys)

# y_train_all is rebound from the label table to the label column aligned with `train`.
y_train_all = train["default6"]
# Snapshot of the merged frame taken before later cells change `train`.
train_orig = train.copy()

###  Добавляем сгенерированные графовые фичи (contractor_graph_features.csv - первая версия)

In [None]:
# adding contractor new graph features
# NOTE(review): contractor_graph_features.csv is read from the working directory, but no
# visible cell downloads or writes it — presumably produced elsewhere; confirm.
graph_features = pd.read_csv("contractor_graph_features.csv")

# Same left join for both frames: rows without graph features keep NaN in the new columns.
train, test = (
    frame.merge(graph_features, on="contractor_id", how="left")
    for frame in (train, test)
)

In [None]:
# Columns removed from both frames at the end of preprocessing.
for_drop = ["project_id", "building_id"]
# Not referenced by any other visible cell.
cat_features = ["specialization_id"]

# Copies are passed so that the frames bound to `train` / `test` before this call are never
# touched by preprocessing. delete_corr_features=False keeps the ts_features2drop columns;
# `scale` is accepted by preprocess_data but not used in its body.
train, test, constant_features_names = preprocess_data(
    train.copy(),
    test.copy(),
    delete_corr_features=False,
    scale=False,
    init_features2drop=for_drop,
)

## Тут представлен код для генерации фичей нод (контракторов) с помощью агрегаций, а также для формирования таргета

In [None]:
def gen_features_for_contractor(df, best_cols_path='/content/best_cols.pickle'):
  """Build one feature row per contractor and, for labelled data, a per-contractor target.

  Args:
    df: frame with at least ``contractor_id``, ``contract_id`` and every column named in the
      pickled column list; ``contract_current_sum`` is also required when ``default6`` is present.
    best_cols_path: path of a pickle file holding an iterable of column names to aggregate.
      Defaults to the location the notebook has always used.

  Returns:
    ``(contractor_features, targets)``:
      * ``contractor_features`` — DataFrame with ``contractor_id``, ``<col>_mean`` and
        ``<col>_std`` for every selected column, plus ``reports_count``, ``contracts_count``,
        ``contract_len_mean`` and ``contract_len_std``; NaN statistics are replaced by 0.
      * ``targets`` — dict ``contractor_id -> mean contract_current_sum`` when ``df`` has a
        ``default6`` column, otherwise ``None``.
  """
  import pickle

  # Load the list of columns to aggregate.
  # NOTE(security): pickle.load can execute code embedded in the file; only point
  # best_cols_path at a trusted file.
  with open(best_cols_path, 'rb') as f:
    best_cols = list(pickle.load(f))

  # Mean / std of every selected column per contractor; std is NaN for one-row groups -> 0.
  contractor_features_agg = (
      df.groupby('contractor_id')[best_cols]
      .agg(['mean', 'std'])
      .fillna(0)
  )
  # Flatten the (column, statistic) MultiIndex before turning the group key into a column,
  # so the result does not depend on how as_index=False interacts with list aggregations.
  contractor_features_agg.columns = [f'{col}_{stat}' for col, stat in contractor_features_agg.columns]
  contractor_features_agg = contractor_features_agg.reset_index()

  # Report / contract counts per contractor.
  contractor_features_pro_dict = dict(
    reports_count=('contract_id', 'count'),                               # rows per contractor
    contracts_count=('contract_id', lambda x: len(np.unique(x))),         # distinct contracts
    contract_len_mean=('contract_id', lambda x: x.value_counts().mean()), # rows per contract, mean
    contract_len_std=('contract_id', lambda x: x.value_counts().std()),   # rows per contract, std
  )

  contractor_features_pro = df.groupby('contractor_id', as_index=False).agg(
      **contractor_features_pro_dict
  ).fillna(0)

  contractor_features = contractor_features_agg.merge(
      contractor_features_pro,
      on='contractor_id',
      how='left'
  )

  targets = None

  # The presence of the label column marks the training frame. The target is the mean
  # current contract sum of the contractor, computed in a single groupby pass.
  if 'default6' in df.columns:
    targets = (
        df.groupby('contractor_id', sort=False)['contract_current_sum']
        .mean()
        .to_dict()
    )

  return contractor_features, targets

In [None]:
# Node features for the train contractors (targets exist only for train) and the test ones.
contractor_features_train, targets = gen_features_for_contractor(train)
contractor_features_test, _ = gen_features_for_contractor(test)

# One row per contractor: for an id present in both sets the train row comes first and wins.
contractor_features = pd.concat([contractor_features_train, contractor_features_test], axis=0)
contractor_features = contractor_features.drop_duplicates(subset=['contractor_id'])

# Keep only the contractors that occur as an endpoint of some graph edge.
graph_contract_ids = list(set(graph['contractor_id1'].unique()) | set(graph['contractor_id2'].unique()))
present_in_graph = contractor_features['contractor_id'].isin(graph_contract_ids)
contractor_features = contractor_features[present_in_graph]

# Attach the target by contractor id; contractors without a train target get NaN.
target_column = pd.Series(targets).rename('target')
contractor_features = contractor_features.merge(
    target_column,
    left_on='contractor_id',
    right_index=True,
    how='left',
)

In [None]:
# Build the train / validation masks so the training loop can pick the right nodes.
# Explicit submodule import: a bare `import sklearn` does not guarantee that
# sklearn.model_selection is loaded.
from sklearn.model_selection import train_test_split

# Fixed seed for the split. set_seed() is only called in a later cell, so without
# random_state the validation set (and every metric computed on it) changed on each run.
SPLIT_SEED = 69

# 1 for contractors that have a target (i.e. were seen in train), 0 otherwise.
contractor_features['target_mask'] = contractor_features['target'].notna().astype(int)
contractor_features_c_ids = contractor_features.loc[
    contractor_features['target_mask'] != 0, 'contractor_id'
]
train_cids, val_cids = train_test_split(
    contractor_features_c_ids, test_size=0.15, random_state=SPLIT_SEED
)
contractor_features['val_mask'] = contractor_features['contractor_id'].isin(val_cids)
contractor_features['train_mask'] = contractor_features['contractor_id'].isin(train_cids)

# Rebind the name to a {contractor_id: {attribute: value}} dict, the shape expected by
# nx.set_node_attributes. contractor_id stays inside each attribute dict as well.
# NOTE: after this line `contractor_features` is a dict, so this cell cannot be re-run
# without re-running the cells that build the DataFrame.
contractor_features = (
    contractor_features
    .set_index('contractor_id', drop=False)
    .to_dict(orient="index")
)

## Формируем граф

In [None]:
# Keep only the edges whose two endpoints both have an entry in contractor_features.
known_contractor_ids = list(contractor_features.keys())
first_known = graph['contractor_id1'].isin(known_contractor_ids)
second_known = graph['contractor_id2'].isin(known_contractor_ids)
graph = graph[first_known & second_known]
graph.shape

(286795, 3)

In [None]:
# Undirected graph: one weighted edge per row of `graph`, with Distance as the weight.
G = nx.Graph()

# Walk the three columns in lockstep; ids and the distance are cast to plain Python ints.
edge_columns = zip(graph['contractor_id1'], graph['contractor_id2'], graph['Distance'])
pairs = [
    (int(first_id), int(second_id), int(distance))
    for first_id, second_id, distance in tqdm(edge_columns)
]

G.add_weighted_edges_from(pairs)

286795it [00:15, 18244.25it/s]


In [None]:
# Attach each contractor's attribute dict (keyed by contractor id) to the matching graph node.
nx.set_node_attributes(G, contractor_features)

In [None]:
from torch_geometric.utils import from_networkx

# Attributes that stay as separate fields of the PyG object; every other node attribute
# is passed as group_node_attrs (grouped into the node feature matrix).
to_keep_as_single = ['contractor_id', 'target', 'target_mask', 'val_mask', 'train_mask']

# The attribute names are read from the first node yielded by G.nodes.
first_node_attrs = next(iter(G.nodes(data=True)))[1]
feature_attr_names = [name for name in first_node_attrs.keys() if name not in to_keep_as_single]

g_data = from_networkx(G, group_node_attrs=feature_attr_names)

## Строим модель на основе графовой свёртки с механизмом внимания (attention) — GAT

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.nn import GATConv

import random
import os

In [None]:
class train_CFG:
    """Bare attribute container for training options.

    Calling an instance as ``cfg(name, value)`` stores ``value`` under the attribute ``name``.
    """

    def __init__(self):
        # Nothing is predefined; every option is added later through __call__.
        pass

    def __call__(self, attr, value):
        self.__setattr__(attr, value)

def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every random number generator below.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the running
    # process was fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms between runs, which
    # defeats the deterministic flag above; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False


set_seed(69)

### Архитектура более подробно представлена в презентации


In [None]:
class GATBlock(nn.Module):
    """Graph-attention layer with a linear skip connection.

    Computes ``ELU(dropout(GATConv(x, edge_index) + Linear(x)))``. The skip projection maps
    ``h_in -> h_out * n_heads`` so that it can be added to the convolution output.

    Args:
        h_in: size of the incoming node features.
        h_out: per-head output size of the attention layer.
        n_heads: number of attention heads.
        attn_dropout: dropout rate passed to ``GATConv``. Defaults to 0.0, the value that
            used to be hard-coded.
        dropout: rate of the feature dropout applied before the activation. Defaults to 0.0,
            the value that used to be hard-coded.
    """

    def __init__(self, h_in, h_out, n_heads, attn_dropout=0.0, dropout=0.0):

        super().__init__()

        # Creation order (skip, then conv) is kept so seeded weight initialisation is unchanged.
        self.skip = nn.Linear(h_in, h_out * n_heads)
        self.conv = GATConv(h_in, h_out, heads=n_heads, dropout=attn_dropout)
        self.act = nn.ELU(alpha=0.1)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, edge_index):
        """Return the activated sum of the attention output and the skip projection."""
        out = self.conv(x, edge_index) + self.skip(x)
        return self.act(self.dropout(out))

class TransductiveGAT(nn.Module):
    """MLP encoder -> stack of GATBlocks -> MLP decoder, applied to a whole graph at once.

    ``n_layers`` is the number of hidden widths, not the number of attention blocks: the
    widths start at ``hidden_dim``, double for the first ``n_layers // 2`` steps and halve
    afterwards, and one ``GATBlock`` is created per consecutive pair of widths. The model
    therefore holds ``n_layers - 1`` attention blocks (``n_layers=2`` -> one block).

    Args:
        n_in: size of the input node features.
        n_out: size of the per-node output.
        hidden_dim: base hidden width.
        head: number of attention heads of every GATBlock.
        n_layers: number of hidden widths (>= 1); with 1 the model is encoder + decoder only.

    Raises:
        ValueError: if ``n_layers`` is smaller than 1.
    """

    def __init__(
            self,
            n_in=128,
            n_out=128,
            hidden_dim=128,
            head=2,
            n_layers=1,
    ):

        super().__init__()

        if n_layers < 1:
            raise ValueError(f"n_layers must be >= 1, got {n_layers}")

        hidden_dims = []
        factor = 1
        for i in range(1, n_layers + 1):
            hidden_dims.append(int(hidden_dim * factor))
            if i <= n_layers // 2:
                factor *= 2
            else:
                factor /= 2

        # Head count of the tensor entering each block: 1 after the encoder, `head` afterwards.
        heads = [1] + [head] * n_layers

        self.encoder = nn.Sequential(
            nn.Linear(n_in, hidden_dims[0]),
            nn.LayerNorm(hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[0])
        )

        self.conv_layers = nn.ModuleList(
            GATBlock(h_in * n_head_in, h_out, n_head_out)
            for h_in, h_out, n_head_in, n_head_out in zip(hidden_dims, hidden_dims[1:], heads, heads[1:])
        )

        # Without attention blocks (n_layers == 1) the decoder receives the encoder output,
        # whose width is hidden_dims[0]. Sizing it as head * hidden_dims[-1] in that case made
        # the default configuration fail in forward() with a shape mismatch.
        if len(self.conv_layers) > 0:
            decoder_in = heads[-1] * hidden_dims[-1]
        else:
            decoder_in = hidden_dims[0]

        self.decoder = nn.Sequential(
            nn.Linear(decoder_in, hidden_dims[-1]),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Linear(hidden_dims[-1], n_out)
        )

    def forward(self, dataset, return_hidden=False):
        """Run the model on ``dataset`` (an object exposing ``x`` and ``edge_index``).

        Returns:
            The decoder output, or ``(decoder output, decoder input)`` when
            ``return_hidden`` is True.
        """

        x, edge_index = dataset.x, dataset.edge_index

        x = self.encoder(x)

        for layer in self.conv_layers:
            x = layer(x, edge_index)

        prediction = self.decoder(x)

        return (prediction, x) if return_hidden else prediction

### Функции для обучения

In [None]:
def train_one_epoch_transductive(dataset, model, optimizer, loss_fn, mask):
    """Run one full-graph optimisation step and report the loss and MSE on the masked nodes.

    Args:
        dataset: graph object passed to ``model``; must expose ``target`` and the attribute
            named by ``mask``.
        model: module called as ``model(dataset)``; its output is indexed per node.
        optimizer: optimizer over the parameters of ``model``.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        mask: name of the ``dataset`` attribute used to index the nodes that are trained on.

    Returns:
        ``(loss, mse)`` as Python floats, both computed before the parameter update.
    """

    model.train()

    node_mask = getattr(dataset, mask)

    out = model(dataset)

    predictions = out[node_mask].view(-1)
    targets = dataset.target[node_mask]

    train_loss = loss_fn(predictions, targets)

    # The metric is computed with torch on the tensors' own device: no numpy round trip and
    # no reliance on sklearn.metrics having been loaded by some other import.
    with torch.no_grad():
        train_mse = torch.mean((predictions - targets) ** 2).item()

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    return train_loss.item(), train_mse


def val_one_epoch_transductive(dataset, model, loss_fn, mask):
    """Evaluate the model on the masked nodes without updating it.

    Args:
        dataset: graph object passed to ``model``; must expose ``target`` and the attribute
            named by ``mask``.
        model: module called as ``model(dataset)``; it is switched to eval mode.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        mask: name of the ``dataset`` attribute used to index the nodes that are evaluated.

    Returns:
        ``(loss, mse)`` as Python floats.
    """

    model.eval()

    with torch.no_grad():

        node_mask = getattr(dataset, mask)

        out = model(dataset)

        predictions = out[node_mask].view(-1)
        targets = dataset.target[node_mask]

        val_loss = loss_fn(predictions, targets)
        # Computed with torch on the tensors' own device: no numpy round trip and no
        # reliance on sklearn.metrics having been loaded by some other import.
        val_mse = torch.mean((predictions - targets) ** 2).item()

    return val_loss.item(), val_mse

In [None]:
def train_transductive(
    dataset=None,
    model=None,
    optimizer=None,
    loss_fn=None,
    train_cfg=None,
    started_patience=None,
    earlystopper=None,
):
    """Full-graph training loop with optional early stopping on the validation MSE.

    Args:
        dataset: graph object handed to the per-epoch train / validation functions.
        model: model to optimise.
        optimizer: optimizer over the parameters of ``model``.
        loss_fn: loss callable.
        train_cfg: object with ``num_epochs``, ``train_mask`` and ``val_mask`` attributes and
            an optional ``verbose`` attribute (progress-bar refresh period in epochs;
            0 or missing disables the refresh).
        started_patience: early stopping is consulted only for epochs strictly greater than
            this value; ``None`` means from the first epoch on.
        earlystopper: optional callable ``earlystopper(val_mse, model)`` exposing an
            ``early_stop`` flag.
    """

    num_epochs = getattr(train_cfg, "num_epochs")
    verbose = getattr(train_cfg, "verbose", 0)
    # `epoch > None` raises TypeError, so the default None is mapped to "no warm-up".
    warmup_epochs = -1 if started_patience is None else started_patience

    stream = tqdm(range(num_epochs), desc="training")
    for epoch in stream:

        train_loss, train_mse = train_one_epoch_transductive(
            dataset, model, optimizer, loss_fn, train_cfg.train_mask
        )
        val_loss, val_mse = val_one_epoch_transductive(
            dataset, model, loss_fn, train_cfg.val_mask
        )

        if earlystopper is not None and epoch > warmup_epochs:

            earlystopper(val_mse, model)

            if earlystopper.early_stop:
                print(f"Early stopping at epoch {epoch}")
                break

        if verbose and epoch % verbose == 0:
            stream.set_description(f"train mse: {train_mse}, eval mse: {val_mse}")

In [None]:
class EarlyStoppingMSE:
    """Stop training when the validation MSE has not improved for ``patience`` calls.

    A call counts as an improvement when ``score <= best_score - delta`` (lower is better).
    On every improvement the model's ``state_dict`` is saved to ``<path>/<model_name>`` if
    ``use_checkpoints`` is True.

    Args:
        patience: number of consecutive non-improving calls after which ``early_stop`` is set.
        verbose: when True, progress messages are sent to ``trace_func``.
        delta: minimum decrease of the score that counts as an improvement.
        path: directory of the checkpoint file (joined with ``model_name``); created on demand.
        trace_func: callable used for messages.
        use_checkpoints: when False nothing is written to disk.
        model_name: file name of the checkpoint inside ``path``.

    The parameter / attribute names ``val_r2`` and ``val_r2_max`` are kept for backward
    compatibility; they hold MSE values (the best, i.e. lowest, one for ``val_r2_max``).
    """

    def __init__(
        self, patience=7, verbose=False, delta=0, path="checkpoint.pt", trace_func=print, use_checkpoints=True, model_name="best_model.pt",
    ):

        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # Lowest MSE seen so far. Starts at +inf because the metric is minimised; the former
        # `-np.Inf` had the wrong sign and `np.Inf` no longer exists in NumPy 2.
        self.val_r2_max = float("inf")
        self.delta = delta
        self.path = os.path.join(path, model_name)
        self.trace_func = trace_func
        self.use_checkpoints = use_checkpoints

    def __call__(self, val_r2, model):
        """Register a new validation MSE and update the counter / checkpoint accordingly."""

        score = val_r2

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_r2, model)
        elif score > self.best_score - self.delta:
            # Not better than the best score by at least delta.
            self.counter += 1
            if self.verbose:
                self.trace_func(
                    f"EarlyStopping counter: {self.counter} out of {self.patience}"
                )
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_r2, model)
            self.counter = 0

    def save_checkpoint(self, val_r2, model):
        """Report the improvement and, if enabled, persist the model's state_dict."""
        if self.verbose:
            self.trace_func(
                f"Val MSE decreased ({self.val_r2_max:.4f} --> {val_r2:.4f}).  Saving model ..."
            )
        if self.use_checkpoints:
            # torch.save does not create missing directories.
            checkpoint_dir = os.path.dirname(self.path)
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), self.path)
        self.val_r2_max = val_r2

### Настройки обучения

In [None]:
# Model size
hidden_dim = 16
n_layers = 2
n_head=2
# Optimisation
optimizer_name = "AdamW"
num_epochs = 1000
# Use the GPU when there is one; a hard-coded 'cuda' device fails on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Single-output model: one predicted target value per node.
model = TransductiveGAT(
    n_in=g_data.num_features,
    n_out=1,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    head=n_head,
).to(device)

#optimizer
optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=5e-3)

loss = torch.nn.MSELoss()

# Early stopping: an epoch counts as an improvement only if the validation MSE drops by at
# least stopper_delta; training stops after stopper_patience non-improving checks.
stopper_delta = 0.01
checkpoints_path = '/content/models/'
# torch.save does not create directories, so make sure the checkpoint folder exists.
os.makedirs(checkpoints_path, exist_ok=True)
stopper_patience = 50
earlystopper = EarlyStoppingMSE(
    patience=stopper_patience,
    verbose=True,
    delta=stopper_delta,
    path=checkpoints_path,
    trace_func=print,
    model_name="best_model_train.pt"
)

### Запускаем)

In [None]:
g_data = g_data.to(device)

# Training options; the two mask entries are the names of the mask attributes on g_data.
train_cfg = train_CFG()
for option, value in (
    ("num_epochs", num_epochs),
    ("verbose", 10),
    ("train_mask", "train_mask"),
    ("val_mask", "val_mask"),
):
    train_cfg(option, value)

# Early stopping is consulted only after epoch 50.
train_transductive(
    dataset=g_data,
    model=model,
    optimizer=optimizer,
    loss_fn=loss,
    train_cfg=train_cfg,
    started_patience=50,
    earlystopper=earlystopper,
)

# evaluation
# The second returned value is the validation MSE (the variable name notwithstanding); it is
# measured on `model` as left by the loop above, not on the saved checkpoint.
_, result_r2 = val_one_epoch_transductive(g_data, model, loss, "val_mask")
print(result_r2)

train mse: 0.4217981994152069, eval mse: 0.18685282766819:   6%|▌         | 58/1000 [00:01<00:20, 46.64it/s]  

Val MSE up from (-inf to 0.2310).  Saving model ...
Val MSE up from (0.2310 to 0.2186).  Saving model ...
EarlyStopping counter: 1 out of 50
Val MSE up from (0.2186 to 0.1905).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
EarlyStopping counter: 5 out of 50
EarlyStopping counter: 6 out of 50


train mse: 0.2363176941871643, eval mse: 0.1589498668909073:   7%|▋         | 68/1000 [00:01<00:20, 46.46it/s]

EarlyStopping counter: 7 out of 50
EarlyStopping counter: 8 out of 50
EarlyStopping counter: 9 out of 50
Val MSE up from (0.1905 to 0.1711).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
EarlyStopping counter: 5 out of 50
Val MSE up from (0.1711 to 0.1589).  Saving model ...


train mse: 0.1375093162059784, eval mse: 0.1327708512544632:   8%|▊         | 78/1000 [00:01<00:19, 46.13it/s]

Val MSE up from (0.1589 to 0.1486).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
EarlyStopping counter: 5 out of 50
Val MSE up from (0.1486 to 0.1369).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50


train mse: 0.06849499046802521, eval mse: 0.08660494536161423:   9%|▉         | 88/1000 [00:02<00:19, 46.44it/s]

EarlyStopping counter: 4 out of 50
Val MSE up from (0.1369 to 0.1238).  Saving model ...
EarlyStopping counter: 1 out of 50
Val MSE up from (0.1238 to 0.1137).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
Val MSE up from (0.1137 to 0.0980).  Saving model ...
Val MSE up from (0.0980 to 0.0866).  Saving model ...


train mse: 0.03990462049841881, eval mse: 0.0446569062769413:  10%|▉         | 98/1000 [00:02<00:19, 45.42it/s] 

EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
Val MSE up from (0.0866 to 0.0744).  Saving model ...
EarlyStopping counter: 1 out of 50
Val MSE up from (0.0744 to 0.0588).  Saving model ...
EarlyStopping counter: 1 out of 50
Val MSE up from (0.0588 to 0.0472).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50


train mse: 0.026989536359906197, eval mse: 0.0268817450851202:  11%|█         | 108/1000 [00:02<00:19, 45.77it/s]

EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
Val MSE up from (0.0472 to 0.0358).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
EarlyStopping counter: 5 out of 50
EarlyStopping counter: 6 out of 50
EarlyStopping counter: 7 out of 50


train mse: 0.04141328111290932, eval mse: 0.04944001883268356:  12%|█▏        | 118/1000 [00:02<00:19, 46.36it/s]

EarlyStopping counter: 8 out of 50
EarlyStopping counter: 9 out of 50
Val MSE up from (0.0358 to 0.0253).  Saving model ...
EarlyStopping counter: 1 out of 50
EarlyStopping counter: 2 out of 50
EarlyStopping counter: 3 out of 50
EarlyStopping counter: 4 out of 50
EarlyStopping counter: 5 out of 50
EarlyStopping counter: 6 out of 50
EarlyStopping counter: 7 out of 50


train mse: 0.024524133652448654, eval mse: 0.03233177587389946:  13%|█▎        | 128/1000 [00:02<00:18, 46.74it/s]

EarlyStopping counter: 8 out of 50
EarlyStopping counter: 9 out of 50
EarlyStopping counter: 10 out of 50
EarlyStopping counter: 11 out of 50
EarlyStopping counter: 12 out of 50
EarlyStopping counter: 13 out of 50
EarlyStopping counter: 14 out of 50
EarlyStopping counter: 15 out of 50
EarlyStopping counter: 16 out of 50
EarlyStopping counter: 17 out of 50


train mse: 0.0229596346616745, eval mse: 0.027352889999747276:  14%|█▍        | 138/1000 [00:03<00:18, 47.33it/s] 

EarlyStopping counter: 18 out of 50
EarlyStopping counter: 19 out of 50
EarlyStopping counter: 20 out of 50
EarlyStopping counter: 21 out of 50
EarlyStopping counter: 22 out of 50
EarlyStopping counter: 23 out of 50
EarlyStopping counter: 24 out of 50
EarlyStopping counter: 25 out of 50
EarlyStopping counter: 26 out of 50
EarlyStopping counter: 27 out of 50


train mse: 0.014770126901566982, eval mse: 0.02018081583082676:  15%|█▍        | 148/1000 [00:03<00:18, 46.70it/s]

EarlyStopping counter: 28 out of 50
EarlyStopping counter: 29 out of 50
EarlyStopping counter: 30 out of 50
EarlyStopping counter: 31 out of 50
EarlyStopping counter: 32 out of 50
EarlyStopping counter: 33 out of 50
EarlyStopping counter: 34 out of 50
EarlyStopping counter: 35 out of 50
EarlyStopping counter: 36 out of 50
EarlyStopping counter: 37 out of 50


train mse: 0.011638344265520573, eval mse: 0.017148897051811218:  16%|█▌        | 158/1000 [00:03<00:17, 46.93it/s]

EarlyStopping counter: 38 out of 50
EarlyStopping counter: 39 out of 50
EarlyStopping counter: 40 out of 50
EarlyStopping counter: 41 out of 50
EarlyStopping counter: 42 out of 50
EarlyStopping counter: 43 out of 50
EarlyStopping counter: 44 out of 50
EarlyStopping counter: 45 out of 50
EarlyStopping counter: 46 out of 50
EarlyStopping counter: 47 out of 50


train mse: 0.011638344265520573, eval mse: 0.017148897051811218:  16%|█▋        | 163/1000 [00:03<00:18, 45.54it/s]

EarlyStopping counter: 48 out of 50
EarlyStopping counter: 49 out of 50
EarlyStopping counter: 50 out of 50
Early stopping at epoch 163
0.016196711





### Подгружаем лучшую модель и формируем эмбеддинги + предсказания как фичи

In [None]:
# Re-create the architecture with the same hyper-parameters as the trained model, then load
# the weights of the best checkpoint written by the early stopper.
best_model = TransductiveGAT(
    n_in=g_data.num_features,
    n_out=1,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    head=n_head,
).to(device)

best_model_path = "/content/models/best_model_train.pt"
# map_location follows `device` instead of a hard-coded "cuda", so the checkpoint also loads
# on CPU-only runtimes. weights_only=True limits unpickling to tensors and plain containers,
# which is all a state_dict needs, and removes the torch.load warning.
best_model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))

  best_model.load_state_dict(torch.load(best_model_path, map_location="cuda"))


<All keys matched successfully>

In [None]:
# Inference only: eval mode and no gradient tracking.
best_model.eval()
with torch.no_grad():
    # `out` is the decoder output per node, `embeddings` the tensor that was fed to the decoder.
    out, embeddings = best_model(g_data, return_hidden=True)

In [None]:
# Append the prediction as the last column of the embedding matrix.
# NOTE: `embeddings` is rebound here, so re-running this cell appends `out` once more.
embeddings = torch.cat([embeddings, out], dim=-1)
# Contractor ids stored on the graph object, used to label the embedding rows.
c_ids = g_data.contractor_id

In [None]:
# Column names: embed_0 ... embed_{k-1} for the hidden state, graph_pred for the last column.
embedding_columns = [f'embed_{i}' for i in range(embeddings.shape[1] - 1)] + ['graph_pred']

ids_frame = pd.DataFrame({"contractor_id" : c_ids.cpu()})
values_frame = pd.DataFrame(embeddings.detach().cpu(), columns=embedding_columns)

# Side-by-side concatenation: the contractor id first, then its embedding and prediction.
embeds_df = pd.concat([ids_frame, values_frame], axis=1)

In [None]:
# Summary statistics of the id, embedding and prediction columns.
embeds_df.describe()

Unnamed: 0,contractor_id,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_55,embed_56,embed_57,embed_58,embed_59,embed_60,embed_61,embed_62,embed_63,graph_pred
count,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0,...,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0
mean,450.954188,0.508657,-0.030144,-0.041518,0.494229,0.017328,-0.063835,0.036087,-0.011363,-0.041121,...,0.694113,-0.014866,0.685957,0.440469,0.162186,-0.021367,-0.031245,0.215587,0.384167,0.146589
std,260.035095,0.108906,0.003113,0.011009,0.092109,0.072829,0.004866,0.084151,0.054862,0.006237,...,0.127288,0.005638,0.114893,0.102434,0.088474,0.005101,0.013651,0.167539,0.172072,0.923597
min,0.0,0.157289,-0.040239,-0.085705,0.173718,-0.039372,-0.082889,-0.036757,-0.038828,-0.054151,...,0.469457,-0.041527,0.359597,0.058592,-0.042242,-0.041001,-0.058929,-0.005754,-0.012078,-0.574433
25%,227.75,0.441918,-0.03201,-0.046806,0.474215,-0.01515,-0.066587,-0.008555,-0.026164,-0.045438,...,0.616042,-0.017889,0.634149,0.374902,0.110744,-0.024331,-0.038517,0.101209,0.316846,-0.381613
50%,452.5,0.490797,-0.030411,-0.041381,0.502902,-0.009243,-0.06314,-0.002942,-0.022157,-0.042369,...,0.67891,-0.015378,0.669476,0.451726,0.16659,-0.021785,-0.032683,0.186111,0.384265,-0.181514
75%,676.25,0.547856,-0.028835,-0.036113,0.541721,-0.0026,-0.06027,0.043343,-0.015254,-0.037499,...,0.744936,-0.011498,0.729281,0.514133,0.221985,-0.018385,-0.026164,0.276253,0.454402,0.263578
max,898.0,0.985769,-0.012789,-0.004469,0.966256,0.371162,-0.053248,0.368929,0.758097,-0.001925,...,1.499091,-0.000562,1.353791,0.650993,0.416019,0.010701,0.074666,0.779789,1.367316,8.606124


In [None]:
# Persist the node embeddings and predictions; the file name records the target that was used.
embeds_df.to_parquet('nodes_embeddings_target=contract_current_sum_mean.parquet', index=False)