In [1]:
# Download the pickled embeddings from Google Drive (saved as embeddings_2.pkl).
!gdown 1qR_Nl7zwabJuL-sqpGW8yaMvTtxh_SzL

Downloading...
From (original): https://drive.google.com/uc?id=1qR_Nl7zwabJuL-sqpGW8yaMvTtxh_SzL
From (redirected): https://drive.google.com/uc?id=1qR_Nl7zwabJuL-sqpGW8yaMvTtxh_SzL&confirm=t&uuid=090190fd-2600-4451-bac6-203eb8ae33a6
To: /workspace-SR003.nfs2/mera_dev/lm-evaluation-harness/auto_tests/embeddings_2.pkl
100%|████████████████████████████████████████| 609M/609M [00:09<00:00, 66.7MB/s]


In [2]:
# Download the graph-feature table from Google Drive (saved as df_final.parquet).
!gdown 1aaRsFW2WpWIcV9PPc9amnMu4Vp5bOvtz

Downloading...
From: https://drive.google.com/uc?id=1aaRsFW2WpWIcV9PPc9amnMu4Vp5bOvtz
To: /workspace-SR003.nfs2/mera_dev/lm-evaluation-harness/auto_tests/df_final.parquet
100%|██████████████████████████████████████| 41.4M/41.4M [00:00<00:00, 50.6MB/s]


In [3]:
# Download the graph edge list from Google Drive (saved as graph.edgelist).
# NOTE(review): no cell visible in this notebook reads graph.edgelist — confirm it is still needed.
!gdown 1Ln6S652bPn3S6YT8fycIa8f2iE65_44E

Downloading...
From (original): https://drive.google.com/uc?id=1Ln6S652bPn3S6YT8fycIa8f2iE65_44E
From (redirected): https://drive.google.com/uc?id=1Ln6S652bPn3S6YT8fycIa8f2iE65_44E&confirm=t&uuid=04b717c4-7f19-409d-bea7-cb53990dbe38
To: /workspace-SR003.nfs2/mera_dev/lm-evaluation-harness/auto_tests/graph.edgelist
100%|████████████████████████████████████████| 179M/179M [00:02<00:00, 79.5MB/s]


In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from tqdm.auto import tqdm

In [2]:
import gc


def check_if_integer(series):
    """
    Tell whether every value of a numeric series is a whole number.

    :param series: numeric column to inspect (pd.Series)
    :return: True when all values are finite and have no fractional part;
        NaN and +/-inf count as non-integer, an empty series yields True (bool)
    """
    # Vectorised check on a float64 copy. Unlike ``series.apply(float.is_integer)``
    # this does not raise TypeError for integer / unsigned / bool columns (whose
    # elements are not Python floats) and avoids one Python call per row.
    values = series.to_numpy(dtype="float64", na_value=np.nan)
    return bool(np.isfinite(values).all() and (values == np.floor(values)).all())


def reduce_mem_usage(
    df, int_cast=True, obj_to_category=False, subset=None, allow_float16=True
):
    """
    Downcast the columns of a dataframe to the narrowest dtype that holds their values.

    The dataframe is modified in place and is also returned. Memory usage before
    and after is printed.

    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: indicate if non-integer numeric columns that only hold whole
        numbers should be cast to an integer dtype (bool)
    :param obj_to_category: convert non-datetime related objects to category dtype (bool)
    :param subset: subset of columns to analyse (list)
    :param allow_float16: allow float columns to be stored as float16; only the value
        range is checked, so precision is reduced to what float16 can represent.
        Pass False to stop at float32 (bool)
    :return: dataset with the column dtypes adjusted (pd.DataFrame)
    """
    start_mem = df.memory_usage().sum() / 1024**2
    gc.collect()
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    cols = subset if subset is not None else df.columns.tolist()

    # Candidate dtypes, narrowest first; the first one whose range covers the
    # column wins. Signed comes before unsigned of the same width.
    int_candidates = (
        np.int8,
        np.uint8,
        np.int16,
        np.uint16,
        np.int32,
        np.uint32,
        np.int64,
        np.uint64,
    )
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in tqdm(cols):
        col_type = df[col].dtype

        # Boolean columns are already one byte per value; leave them untouched
        # instead of pushing them through the numeric range checks.
        if pd.api.types.is_bool_dtype(col_type):
            continue

        if (
            col_type != object  # noqa: E721
            and col_type.name != "category"
            and "datetime" not in col_type.name
        ):
            c_min = df[col].min()
            c_max = df[col].max()

            # Signed and unsigned integer columns are integers by construction;
            # other numeric columns qualify only if every value is whole.
            treat_as_int = pd.api.types.is_integer_dtype(col_type)
            if int_cast and not treat_as_int:
                treat_as_int = check_if_integer(df[col])

            if treat_as_int:
                for candidate in int_candidates:
                    bounds = np.iinfo(candidate)
                    # Inclusive bounds: a value equal to the dtype limit still
                    # fits (e.g. min == 0 is valid for unsigned types).
                    if bounds.min <= c_min and c_max <= bounds.max:
                        df[col] = df[col].astype(candidate)
                        break
                # If nothing matched (e.g. empty or out-of-range column) the
                # column keeps its current dtype.
            else:
                for candidate in float_candidates:
                    bounds = np.finfo(candidate)
                    if bounds.min <= c_min and c_max <= bounds.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Out of range for the narrower floats, or min/max is NaN.
                    df[col] = df[col].astype(np.float64)
        elif "datetime" not in col_type.name and obj_to_category:
            df[col] = df[col].astype("category")
    gc.collect()
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
    # Guard against a zero-sized starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decrease))

    return df

In [3]:
import pickle


# Load the embeddings mapping from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if its source is trusted.
with open("embeddings_2.pkl", "rb") as file:
    embeddings_dict = pickle.load(file)

In [4]:
# One row per key of the embeddings mapping, one column per vector component.
df_edges = pd.DataFrame.from_dict(embeddings_dict, orient="index")

# Name the components dim_0, dim_1, ...
df_edges.columns = [f"dim_{i}" for i in range(len(df_edges.columns))]

# Preview the first five rows.
df_edges.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_54,dim_55,dim_56,dim_57,dim_58,dim_59,dim_60,dim_61,dim_62,dim_63
0,-0.217443,-0.568524,-0.422515,-0.618696,-0.013863,0.028434,0.453128,0.222339,0.026562,-0.210084,...,0.075274,0.818999,-0.38828,-0.063575,0.032599,0.432131,-0.462828,-0.202175,-0.038196,0.380083
1,0.287422,-0.471464,0.161808,-0.5993,0.442818,-0.09101,0.406752,0.379089,0.770118,-0.550693,...,0.201038,0.142476,-0.444186,-0.169624,-0.642753,0.521336,-0.612131,0.063163,-0.382933,-0.008654
2,-0.626136,-0.930415,0.064052,-0.836702,0.102227,-0.117012,0.330341,-0.085465,0.296298,0.223422,...,0.247413,0.399915,-0.399215,-0.104124,-0.478118,0.261969,0.104018,0.153504,0.360642,0.421945
3,-0.280868,-0.018827,-0.317551,-0.504031,-0.354453,-0.154107,0.135648,0.179993,0.123914,-0.186927,...,0.220232,0.551399,-0.33147,-0.340688,-0.025831,0.102063,0.081065,-0.294166,0.242619,0.15635
4,0.140446,-0.365303,-0.089975,0.039649,-0.208843,0.009867,0.101015,0.004137,-0.004447,-0.051337,...,0.296469,0.249513,-0.11017,0.022214,-0.021258,0.098818,-0.204368,0.074579,-0.020991,0.178723


In [5]:
# Cast the index labels to int (the merge with the graph features below joins on the index).
df_edges.index = df_edges.index.astype(int)
# Downcast column dtypes to save memory; reduce_mem_usage mutates the frame in
# place and prints the before/after footprint.
# NOTE(review): the later dataframe display shows the embedding values slightly
# rounded, which suggests they end up as float16 — confirm the precision loss is acceptable.
df_edges = reduce_mem_usage(df_edges)

Memory usage of dataframe is 545.07 MB


  0%|          | 0/64 [00:00<?, ?it/s]

Memory usage after optimization is: 142.556 MB
Decreased by 73.8%


In [6]:
# Load the graph-feature table; it includes the AGE and node columns used/dropped in the next cell.
df_graph_features = pd.read_parquet("df_final.parquet", engine="pyarrow")

In [7]:
# Keep only rows whose AGE is below 100.
# NOTE(review): presumably this removes implausible ages — confirm the threshold.
df_graph_features = df_graph_features[df_graph_features["AGE"] < 100]

# Regression target, indexed like the feature table.
node_labels = df_graph_features["AGE"]

# Remove the target and the node column from the features. Reassigning the
# result instead of drop(..., inplace=True) avoids mutating a filtered
# selection in place (which can raise SettingWithCopyWarning).
# NOTE(review): the merge in the next cell joins on the index, not on "node" —
# confirm the index of this table carries the same ids as the embeddings index.
df_graph_features = df_graph_features.drop(columns=["AGE", "node"])

In [8]:
# Inner join on the index: only labels present in both the embeddings and the
# graph-feature table are kept.
df = df_edges.merge(df_graph_features, left_index=True, right_index=True)

In [9]:
# Display the merged feature table (embedding dimensions followed by graph features).
df

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,max_hight,min_weight,max_weight,neighbour_cnt_mean,SC_mean,AA_mean,vertex_cnt,edge_cnt,edge-vertex_cnt,density
0,-0.217407,-0.568359,-0.422607,-0.618652,-0.013863,0.028427,0.453125,0.222290,0.026566,-0.210083,...,185,0,88,1.833008,1.833008,0.534668,12,11,-1,0.166626
1,0.287354,-0.471436,0.161865,-0.599121,0.442871,-0.091003,0.406738,0.379150,0.770020,-0.550781,...,190,0,91,9.421875,9.421875,0.815918,38,179,141,0.254639
2,-0.625977,-0.930176,0.064026,-0.836914,0.102234,-0.117004,0.330322,-0.085449,0.296387,0.223389,...,195,0,109,2.000000,2.000000,0.554199,14,14,0,0.153809
3,-0.280762,-0.018829,-0.317627,-0.503906,-0.354492,-0.154053,0.135620,0.180054,0.123901,-0.186890,...,185,0,90,1.500000,1.500000,0.562500,4,3,-1,0.500000
4,0.140503,-0.365234,-0.089966,0.039642,-0.208862,0.009865,0.101013,0.004139,-0.004448,-0.051331,...,185,90,90,1.000000,1.000000,0.500000,2,1,-1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099111,-0.171021,-0.408691,-0.109497,0.004635,0.038910,-0.064819,-0.340332,0.103821,-0.150635,0.011116,...,160,56,56,1.000000,1.000000,0.500000,2,1,-1,1.000000
1099112,0.174805,-0.075500,-0.083557,0.030853,-0.290039,0.175903,-0.094055,0.050568,0.052795,-0.262695,...,165,58,58,1.000000,1.000000,0.500000,2,1,-1,1.000000
1099113,0.308350,0.003771,0.047607,-0.067749,-0.162109,0.142090,-0.142456,0.060944,0.035095,-0.180542,...,0,0,0,1.000000,1.000000,0.500000,2,1,-1,1.000000
1099117,-0.288818,0.045929,0.067444,-0.348877,0.229492,-0.129272,0.453369,-0.061127,0.250488,0.071045,...,0,0,0,1.000000,1.000000,0.500000,2,1,-1,1.000000


In [10]:
# Reproducible 90/10 split over the row labels of df.
#
# rng.permutation returns a shuffled *copy* of the labels. The previous
# np.random.shuffle(df.index.values) shuffled the array backing df.index in
# place, which can silently re-label the rows of df so that features no longer
# match their labels in node_labels; it was also unseeded.
SPLIT_SEED = 56
rng = np.random.default_rng(SPLIT_SEED)
ind = rng.permutation(df.index.to_numpy())

n_train = int(len(ind) * 0.9)
train_nids = ind[:n_train]
test_nids = ind[n_train:]

In [11]:
# Select features and targets by index label for each side of the split.
# .loc is used for the labels as well, so the lookup is explicitly label-based
# (plain Series[...] with an integer array is ambiguous between label and position).
train_df = df.loc[train_nids]
val_df = df.loc[test_nids]
train_labels = node_labels.loc[train_nids]
val_labels = node_labels.loc[test_nids]

In [12]:
# train_pool = Pool(
#     data=train_df,
#     label=train_labels
# )

# val_pool = Pool(
#     data=val_df,
#     label=val_labels
# )

In [13]:
# CatBoost hyper-parameters; unpacked into CatBoostRegressor(**params) below.
# NOTE(review): od_type / od_wait describe early stopping on a validation set,
# but the fit call in this notebook passes no eval_set — confirm whether the
# overfitting detector is actually active.
params = {
    "task_type": "CPU",  # Set to 'GPU' if a GPU is available
    "loss_function": "RMSE",  # Or another loss if the task is classification, etc.
    "iterations": 1500,  # 1000-3000 is reasonable; additionally use early stopping
    "learning_rate": 0.03,  # Classic step size (training neither too fast nor too slow)
    "depth": 6,  # Tree depth: 6-8 is a good trade-off
    "l2_leaf_reg": 3,  # Regularisation; the base value is 3
    "random_strength": 1,  # Strength of randomness applied to splits
    "bagging_temperature": 1,  # For stacking/bagging inside CatBoost
    "boosting_type": "Plain",  # 'Ordered' can be useful on small datasets
    "grow_policy": "SymmetricTree",  # Alternatives: 'Depthwise' (faster, but may overfit), 'Lossguide'
    "random_seed": 56,  # For reproducibility
    "thread_count": -1,  # Use all available CPU cores
    "od_type": "Iter",  # Early stopping by number of iterations
    "od_wait": 50,  # Iterations without improvement on validation before training stops
}

In [None]:
# Conventional aliases for the split; used consistently below.
X_train = train_df
X_val = val_df
y_train = train_labels
y_val = val_labels

# Candidate regressors, keyed by display name.
models = {
    "CatBoost": CatBoostRegressor(**params),
    #"Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    #"KNN": KNeighborsRegressor(),
    "SVR": SVR(verbose=True, max_iter=7777),
    #"GaussianProcess": GaussianProcessRegressor(),
    "ExtraTrees": ExtraTreesRegressor(random_state=42, verbose=True),
    "MLP": MLPRegressor(max_iter=500, random_state=42, verbose=True),
}

# Validation MAE of every model that was fitted and evaluated successfully.
results = {}

# Fit each model on the training split and score it on the validation split.
for name, model in models.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        # MAE
        mae = mean_absolute_error(y_val, y_pred)
        results[name] = mae
        print(f"Model: {name}, MAE: {mae}")
    except Exception as e:
        # Best effort: one failing model must not abort the whole comparison.
        print(f"Model: {name} failed with error: {e}")

# Best model = lowest validation MAE. Fail with a clear message if every model
# failed, instead of the opaque "min() arg is an empty sequence".
if not results:
    raise RuntimeError(
        "No model could be fitted and evaluated; see the errors printed above."
    )
best_model_name = min(results, key=results.get)
best_mae = results[best_model_name]
print(f"Best model: {best_model_name} with MAE: {best_mae}")

0:	learn: 8.9905730	total: 83.3ms	remaining: 2m 4s
1:	learn: 8.9905574	total: 117ms	remaining: 1m 27s
2:	learn: 8.9905328	total: 147ms	remaining: 1m 13s
3:	learn: 8.9905137	total: 178ms	remaining: 1m 6s
4:	learn: 8.9904844	total: 211ms	remaining: 1m 3s
5:	learn: 8.9904571	total: 244ms	remaining: 1m
6:	learn: 8.9904270	total: 274ms	remaining: 58.4s
7:	learn: 8.9903994	total: 306ms	remaining: 57s
8:	learn: 8.9903680	total: 337ms	remaining: 55.8s
9:	learn: 8.9903402	total: 367ms	remaining: 54.6s
10:	learn: 8.9903127	total: 399ms	remaining: 54s
11:	learn: 8.9902841	total: 429ms	remaining: 53.2s
12:	learn: 8.9902622	total: 458ms	remaining: 52.4s
13:	learn: 8.9902304	total: 491ms	remaining: 52.1s
14:	learn: 8.9901947	total: 524ms	remaining: 51.9s
15:	learn: 8.9901698	total: 555ms	remaining: 51.5s
16:	learn: 8.9901334	total: 586ms	remaining: 51.1s
17:	learn: 8.9901136	total: 616ms	remaining: 50.7s
18:	learn: 8.9900858	total: 648ms	remaining: 50.5s
19:	learn: 8.9900595	total: 681ms	remaining: 

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Model: Ridge, MAE: 6.820382277416155
Model: Lasso, MAE: 6.824426987783866
[LibSVM].......WARN: libsvm Solver reached max_iter
optimization finished, #iter = 7777
obj = -401830.574294, rho = -27.000172
nSV = 15552, nBSV = 15552




Model: SVR, MAE: 7.664648198160339


In [None]:
# The best model was already fitted on the training split by the comparison
# loop, so it is reused here instead of being refitted.
model = models[best_model_name]

# Tree ensembles expose feature_importances_; linear models expose coef_
# (absolute value used as a rough importance). Other models expose neither.
raw_importance = getattr(model, "feature_importances_", None)
if raw_importance is None:
    coefficients = getattr(model, "coef_", None)
    if coefficients is not None:
        raw_importance = np.abs(np.ravel(coefficients))

if raw_importance is None:
    print(f"{best_model_name} exposes neither feature_importances_ nor coef_")
    feature_importance = pd.Series(dtype="float64")
else:
    feature_importance = pd.Series(
        raw_importance, index=X_train.columns
    ).sort_values(ascending=False)

# Show the 20 most influential features (a bare zip object would display nothing useful).
feature_importance.head(20)

In [None]:
# NOTE(review): leftover scratch cell; it only evaluates the literal 1 and is a candidate for removal.
1