In [5]:
import networkx as nx
import matplotlib.pyplot as plt
import tarfile
import pandas as pd
import numpy as np
import torch
import numpy as np
from tqdm.auto import tqdm
from catboost import CatBoostRegressor, Pool

# Data processing

In [None]:
! wget https://snap.stanford.edu/data/soc-pokec-profiles.txt.gz

--2024-12-24 16:07:57--  https://snap.stanford.edu/data/soc-pokec-profiles.txt.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 435512787 (415M) [application/x-gzip]
Saving to: ‘soc-pokec-profiles.txt.gz.1’

pokec-profiles.txt.   0%[                    ]   4.01M  1.19MB/s    eta 10m 52s^C


In [None]:
!gdown 1Ln6S652bPn3S6YT8fycIa8f2iE65_44E

Downloading...
From (original): https://drive.google.com/uc?id=1Ln6S652bPn3S6YT8fycIa8f2iE65_44E
From (redirected): https://drive.google.com/uc?id=1Ln6S652bPn3S6YT8fycIa8f2iE65_44E&confirm=t&uuid=a62b5f32-9be8-4fd6-a562-774254680290
To: /workspace-SR003.nfs2/mera_dev/lm-evaluation-harness/auto_tests/graph.edgelist
100%|█████████████████████████████████████████| 179M/179M [00:01<00:00, 106MB/s]


In [None]:
#G = nx.read_edgelist("soc-pokec-relationships.txt.gz", create_using = nx.DiGraph())
G = nx.read_edgelist("graph.edgelist")

In [None]:
def preprocess_nodes(X, ego_net):
    # Собрать уникальные номера вершин из X
    unique_nodes = pd.concat([X['u'], X['v']]).unique()

    # Создать отображение узлов в индексы
    node_to_index = {node: idx for idx, node in enumerate(unique_nodes)}

    # Преобразовать u и v в индексы
    X['u'] = X['u'].map(node_to_index)
    X['v'] = X['v'].map(node_to_index)

    return X, node_to_index

def build_X(X, ego_net):
    # Количество узлов и рёбер в эго-графе
    vertex_cnt = ego_net.number_of_nodes()
    edge_cnt = ego_net.number_of_edges()

    # Добавляем метрики в X для всех узлов эго-графа
    X['vertex_cnt'] = vertex_cnt
    X['edge_cnt'] = edge_cnt
    X['edge-vertex_cnt'] = edge_cnt - vertex_cnt
    X['density'] = 2 * edge_cnt / vertex_cnt / (vertex_cnt - 1) if vertex_cnt > 1 else 0

    # Сортируем вершины и создаем матрицу смежности для эго-графа
    nodes = sorted(ego_net.nodes())
    g = nx.to_numpy_array(ego_net, nodelist=nodes)

    # Степени вершин
    degree = g.sum(axis=1)

    # Для каждого эго-графа (каждой вершины) считаем метрики
    ego_data = pd.DataFrame({'node': nodes})

    # Метрики по вершинам (агрегат по эго-графу)
    ego_data['neighbour_cnt'] = degree  # Количество соседей (степень вершины)

    # SC (структурная связность)
    sc = g.dot(g.T)  # Это матрица структурной связности
    ego_data['SC'] = sc.diagonal()  # Получаем значения только на диагонали (для каждой вершины)

    # Метрика AA (Adamic-Adar)
    degree_with_self = 1 + degree.reshape((-1, 1))  # Избегаем деления на 0
    aa = (g / degree_with_self).dot(g.T)
    ego_data['AA'] = aa.diagonal()  # Значения только на диагонали (для каждой вершины)

    # Теперь агрегация по всем меткам эго-графа (среднее значение для всех соседей)
    # Мы будем использовать данные для первой вершины (или центра эго-сети)
    # Для одного эго-графа вернем агрегированные метрики
    ego_data_aggregated = pd.DataFrame({
        'neighbour_cnt_mean': ego_data['neighbour_cnt'].mean(),
        'SC_mean': ego_data['SC'].mean(),
        'AA_mean': ego_data['AA'].mean(),
        'vertex_cnt': vertex_cnt,
        'edge_cnt': edge_cnt,
        'edge-vertex_cnt': edge_cnt - vertex_cnt,
        'density': 2 * edge_cnt / vertex_cnt / (vertex_cnt - 1) if vertex_cnt > 1 else 0
    }, index=[0])

    return ego_data_aggregated

In [None]:
df_graph = []

# Список всех вершин графа
nodes = list(G.nodes())

# Обрабатываем каждую вершину как эго
for ego_id in tqdm(nodes):
    # Извлекаем эго-сеть
    ego_net = nx.ego_graph(G, ego_id, radius=1, center=True)

    # Подготовка данных для текущей эго-сети
    X = pd.DataFrame({
        'u': [int(edge[0]) for edge in ego_net.edges()],
        'v': [int(edge[1]) for edge in ego_net.edges()]
    })

    # Подготовка данных
    X, node_to_index = preprocess_nodes(X, ego_net)

    # Строим признаки для эго-графа
    X_aggregated = build_X(X, ego_net)

    # Сохраняем результат для каждого эго-графа
    df_graph.append(X_aggregated)

# Собираем всё в единый DataFrame
df_graph = pd.concat(df_graph, ignore_index=True)

# Печатаем итоговый DataFrame
print(df_graph)

  0%|          | 0/1099121 [00:00<?, ?it/s]

         neighbour_cnt_mean   SC_mean   AA_mean  vertex_cnt  edge_cnt  \
0                  1.833333  1.833333  0.534722          12        11   
1                  9.421053  9.421053  0.816157          38       179   
2                  2.000000  2.000000  0.554422          14        14   
3                  1.500000  1.500000  0.562500           4         3   
4                  1.000000  1.000000  0.500000           2         1   
...                     ...       ...       ...         ...       ...   
1099116            1.000000  1.000000  0.500000           2         1   
1099117            1.000000  1.000000  0.500000           2         1   
1099118            1.000000  1.000000  0.500000           2         1   
1099119            1.000000  1.000000  0.500000           2         1   
1099120            1.000000  1.000000  0.500000           2         1   

         edge-vertex_cnt   density  
0                     -1  0.166667  
1                    141  0.254623  
2           

In [None]:
columns = [
            "public",
            "completion_percentage",
            "gender",
            "region",
            "last_login",
            "registration",
            "AGE",
            "body",
            "I_am_working_in_field",
            "spoken_languages",
            "hobbies",
            "I_most_enjoy_good_food",
            "pets",
            "body_type",
            "my_eyesight",
            "eye_color",
            "hair_color",
            "hair_type",
            "completed_level_of_education",
            "favourite_color",
            "relation_to_smoking",
            "relation_to_alcohol",
            "sign_in_zodiac",
            "on_pokec_i_am_looking_for",
            "love_is_for_me",
            "relation_to_casual_sex",
            "my_partner_should_be",
            "marital_status",
            "children",
            "relation_to_children",
            "I_like_movies",
            "I_like_watching_movie",
            "I_like_music",
            "I_mostly_like_listening_to_music",
            "the_idea_of_good_evening",
            "I_like_specialties_from_kitchen",
            "fun",
            "I_am_going_to_concerts",
            "my_active_sports",
            "my_passive_sports",
            "profession",
            "I_like_books",
            "life_style",
            "music",
            "cars",
            "politics",
            "relationships",
            "art_culture",
            "hobbies_interests",
            "science_technologies",
            "computers_internet",
            "education",
            "sport",
            "movies",
            "travelling",
            "health",
            "companies_brands",
            "more",
            ""
        ]

In [None]:
df_all = pd.read_csv('soc-pokec-profiles.txt.gz', compression='gzip', sep='\t', names=columns, encoding='utf-8', on_bad_lines='skip', index_col=None)

In [None]:
df_all.head()

Unnamed: 0,public,completion_percentage,gender,region,last_login,registration,AGE,body,I_am_working_in_field,spoken_languages,...,science_technologies,computers_internet,education,sport,movies,travelling,health,companies_brands,more,Unnamed: 21
1,1,14,1.0,"zilinsky kraj, zilina",2012-05-25 11:20:00.0,2005-04-03 00:00:00.0,26.0,"185 cm, 90 kg",it,anglicky,...,,,,,,,,,,
2,1,62,0.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:08:00.0,2007-11-30 00:00:00.0,0.0,"166 cm, 58 kg",,nemecky,...,,,,,,,,,,
16,1,64,1.0,"zilinsky kraj, kysucke nove mesto",2012-05-25 23:19:40.0,2008-05-18 00:00:00.0,23.0,"173 cm, 70 kg",najvatcsej firme na svete urad prace,no predsa svoj :d a najlepsie druhy,...,,,,,,,,,,
3,0,38,1.0,"bratislavsky kraj, bratislava - karlova ves",2012-05-10 18:05:00.0,2010-05-23 00:00:00.0,29.0,,"reklamy a medii, sluzieb a obchodu","anglicky, nemecky",...,,,,,,,,,,
4,1,12,0.0,"banskobystricky kraj, brezno",2011-12-29 12:25:00.0,2011-12-29 00:00:00.0,26.0,,,,...,,,,,,,,,,


In [None]:
from sklearn.preprocessing import LabelEncoder
import string
import math

df_known = df_all.copy()
df_known = df_known[~df_known.AGE.isnull()]
df_known = df_known[df_known.AGE != 0]

df_known.index = list(range(df_known.shape[0]))

df = df_known[["public", "completion_percentage", "gender", "region", "AGE", "last_login", "completed_level_of_education"]]

le = LabelEncoder()
df.loc[:, 'region'] = le.fit_transform(df['region'])
df.loc[:, 'region'] = df['region'].apply(lambda x : int(x))

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))

def get_words(text):
    res = text.translate(translator).split()
    #res = re.sub('['+string.punctuation+']', ' ', text).split()
    for i in range(0, len(res)):
        res[i] = res[i].strip()
    return res

def transform(val, transformer):
    if val is None or val == " " or (type(val) is float and math.isnan(val)):
        return 0
    return transformer(val)
def relable_transformer(val, keywords, no_hit_to_null=True):
    if val == 0:
        return 0
    val_clean = ' '.join(get_words(val))
    for keyword, repl in keywords:
        if keyword in val_clean:
            return repl
    if no_hit_to_null:
        return 0
    else:
        return val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [None]:
edu_keywords = [('stredoskolske', 1), ('zakladne', 1), # среднее и основное образование
                ('vysokoskolske', 2), # колледж
                ('ucnovske', 3), # стажировка
                ('studujem', 4), ('student', 3), # студент
                ('pracuje', 5), # практика
                ('bakalarske', 6)] # бакалавриат
df.loc[:, 'completed_level_of_education'] = df['completed_level_of_education'].apply(lambda x: transform(x, lambda v: relable_transformer(v, edu_keywords)))

In [None]:
import datetime
df.loc[:, "last_login"] = df["last_login"].apply(lambda x : int(datetime.datetime.strptime(x.split()[0],  '%Y-%m-%d').timestamp()))

In [None]:
import re
def transform_body(val):
    if val is None or (type(val) is float and math.isnan(val)):
        return 0, 0
    height_program = re.compile(r"[0-9]+(\s*)cm")
    weight_program = re.compile(r"[0-9]+(\s*)kg")
    h = re.search(height_program, val)
    if h is None:
        h = 0
    else:
        h = re.sub(r'(\s*)cm', '', h.group(0))
    w = re.search(weight_program, val)
    if w is None:
        w = 0
    else:
        w = re.sub(r'(\s*)kg', '', w.group(0))
    if int(h) > 10000:
        h = 0
    if int(w) > 10000:
        w = 0
    return int(h), int(w)

In [None]:
df.loc[:, "hight"] = df_known["body"].apply(lambda x : transform_body(x)[0])
df.loc[:, "weight"] = df_known["body"].apply(lambda x : transform_body(x)[1])
df = df.astype({'region': 'int64', 'completed_level_of_education': 'int64', 'hight' : 'int64', 'weight' : 'int64', 'last_login' : 'int64'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [None]:
def calculate_mode(series):
    return series.mode().iloc[0] if not series.empty else np.nan

# Вычисление агрегатов по соседям для каждой вершины
aggregates = []
for node in tqdm(G.nodes):
    neighbors = [int(neighbor) for neighbor in G.neighbors(node)]
    if neighbors:
        # Извлекаем признаки соседей
        neighbor_features = df.loc[neighbors]

        # Вычисляем агрегаты для каждого признака
        aggregates.append({
            'node': node,
            'mean_public': neighbor_features['public'].mode().iloc[0],  # Мода для категориальных данных
            'mean_completion_percentage': neighbor_features['completion_percentage'].mean(),  # Среднее для числовых данных
            'mean_gender': neighbor_features['gender'].mode().iloc[0],
            'mean_region': neighbor_features['region'].mode().iloc[0],
            'mean_AGE': neighbor_features['AGE'].mean(),
            'mean_last_login': neighbor_features['last_login'].mean(),
            'mean_completed_level_of_education': neighbor_features['completed_level_of_education'].mode().iloc[0],
            'mean_hight': neighbor_features['hight'].mean(),
            'mean_weight': neighbor_features['weight'].mean(),
            'min_completion_percentage': neighbor_features['completion_percentage'].min(),  # Мин для числовых данных
            'max_completion_percentage': neighbor_features['completion_percentage'].max(),
            'min_AGE': neighbor_features['AGE'].min(),
            'max_AGE': neighbor_features['AGE'].max(),
            'min_last_login': neighbor_features['last_login'].min(),
            'max_last_login': neighbor_features['last_login'].max(),
            'min_hight': neighbor_features['hight'].min(),
            'max_hight': neighbor_features['hight'].max(),
            'min_weight': neighbor_features['weight'].min(),
            'max_weight': neighbor_features['weight'].max()
        })
    else:
        # Если у вершины нет соседей, заполняем NaN
        aggregates.append({
            'node': node,
            'mean_public': np.nan,
            'mean_completion_percentage': np.nan,
            'mean_gender': np.nan,
            'mean_region': np.nan,
            'mean_AGE': np.nan,
            'mean_last_login': np.nan,
            'mean_completed_level_of_education': np.nan,
            'mean_hight': np.nan,
            'mean_weight': np.nan,
            'min_completion_percentage': np.nan,
            'max_completion_percentage': np.nan,
            'min_AGE': np.nan,
            'max_AGE': np.nan,
            'min_last_login': np.nan,
            'max_last_login': np.nan,
            'min_hight': np.nan,
            'max_hight': np.nan,
            'min_weight': np.nan,
            'max_weight': np.nan
        })

  0%|          | 0/1099121 [00:00<?, ?it/s]

In [None]:
aggregates_df = pd.DataFrame(aggregates)
aggregates_df['node'] = aggregates_df['node'].astype(int)
aggregates_df

Unnamed: 0,node,mean_public,mean_completion_percentage,mean_gender,mean_region,mean_AGE,mean_last_login,mean_completed_level_of_education,mean_hight,mean_weight,min_completion_percentage,max_completion_percentage,min_AGE,max_AGE,min_last_login,max_last_login,min_hight,max_hight,min_weight,max_weight
0,0,1,44.545455,1.0,171,24.818182,1.335549e+09,0,79.000000,38.454545,12,72,18.0,38.0,1325102400,1337889600,0,185,0,88
1,1,1,44.405405,0.0,2,21.729730,1.336054e+09,0,85.189189,19.837838,12,76,10.0,28.0,1316203200,1337889600,0,190,0,91
2,2,1,51.000000,1.0,171,23.000000,1.331662e+09,1,167.692308,53.461538,12,78,16.0,28.0,1312401600,1337889600,0,195,0,109
3,3,1,20.000000,1.0,69,24.333333,1.332072e+09,0,61.666667,30.000000,12,34,22.0,26.0,1328212800,1337889600,0,185,0,90
4,4,1,14.000000,1.0,184,26.000000,1.337890e+09,0,185.000000,90.000000,14,14,26.0,26.0,1337889600,1337889600,185,185,90,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099116,1099111,1,22.000000,1.0,119,39.000000,1.336766e+09,0,182.000000,89.000000,22,22,39.0,39.0,1336766400,1336766400,182,182,89,89
1099117,1099112,1,12.000000,0.0,15,22.000000,1.330546e+09,0,0.000000,0.000000,12,12,22.0,22.0,1330545600,1330545600,0,0,0,0
1099118,1099113,0,12.000000,1.0,11,11.000000,1.330546e+09,0,0.000000,0.000000,12,12,11.0,11.0,1330545600,1330545600,0,0,0,0
1099119,1099117,1,12.000000,0.0,2,14.000000,1.329595e+09,0,0.000000,0.000000,12,12,14.0,14.0,1329595200,1329595200,0,0,0,0


In [None]:
# df_final = pd.concat([node_features, df_graph], axis=1)
df_final = pd.merge(aggregates_df, df_graph, left_on='node', right_index=True)
df_final = pd.merge(df, df_final, left_index=True, right_index=True)

In [None]:
df_final

Unnamed: 0,public,completion_percentage,gender,region,AGE,last_login,completed_level_of_education,hight,weight,node,...,max_hight,min_weight,max_weight,neighbour_cnt_mean,SC_mean,AA_mean,vertex_cnt,edge_cnt,edge-vertex_cnt,density
0,1,14,1.0,184,26.0,1337889600,0,185,90,0,...,185,0,88,1.833333,1.833333,0.534722,12,11,-1,0.166667
1,1,64,1.0,171,23.0,1337889600,0,173,70,1,...,190,0,91,9.421053,9.421053,0.816157,38,179,141,0.254623
2,0,38,1.0,26,29.0,1336593600,0,0,0,2,...,195,0,109,2.000000,2.000000,0.554422,14,14,0,0.153846
3,1,12,0.0,2,26.0,1325102400,0,0,0,3,...,185,0,90,1.500000,1.500000,0.562500,4,3,-1,0.500000
4,1,47,0.0,174,27.0,1337889600,2,162,60,4,...,185,90,90,1.000000,1.000000,0.500000,2,1,-1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099116,1,12,0.0,2,14.0,1329595200,0,0,0,1099111,...,182,89,89,1.000000,1.000000,0.500000,2,1,-1,1.000000
1099117,1,12,1.0,13,11.0,1329768000,0,0,0,1099112,...,0,0,0,1.000000,1.000000,0.500000,2,1,-1,1.000000
1099118,1,67,1.0,89,33.0,1337976000,1,185,90,1099113,...,0,0,0,1.000000,1.000000,0.500000,2,1,-1,1.000000
1099119,1,12,1.0,116,33.0,1337025600,0,0,0,1099117,...,0,0,0,1.000000,1.000000,0.500000,2,1,-1,1.000000


In [None]:
import gc

def reduce_mem_usage(df, int_cast=True, obj_to_category=False, subset=None):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    :param df: dataframe to reduce (pd.DataFrame)
    :param int_cast: indicate if columns should be tried to be casted to int (bool)
    :param obj_to_category: convert non-datetime related objects to category dtype (bool)
    :param subset: subset of columns to analyse (list)
    :return: dataset with the column dtypes adjusted (pd.DataFrame)
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2;
    gc.collect()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    cols = subset if subset is not None else df.columns.tolist()

    for col in tqdm(cols):
        col_type = df[col].dtype

        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()

            # test if column can be converted to an integer
            treat_as_int = str(col_type)[:3] == 'int'
            # if int_cast and not treat_as_int:
            #     treat_as_int = check_if_integer(df[col])

            if treat_as_int:
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif 'datetime' not in col_type.name and obj_to_category:
            df[col] = df[col].astype('category')
    gc.collect()
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
df_final = reduce_mem_usage(df_final)

Memory usage of dataframe is 310.27 MB


  0%|          | 0/36 [00:00<?, ?it/s]

Memory usage after optimization is: 87.001 MB
Decreased by 72.0%


In [None]:
df_final.to_parquet('df_final.parquet', engine='pyarrow', index=False)

In [None]:
# df_final = pd.read_parquet('df_final.parquet', engine='pyarrow')

In [8]:
df_final = df_final[df_final['AGE'] < 100]
node_labels = df_final['AGE']
df_final.drop(columns=["AGE", 'node'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.drop(columns=["AGE", 'node'], inplace=True)


In [9]:
ind = df_final.index.values
np.random.shuffle(ind)
train_nids = ind[0:int(len(ind)*0.75)]
test_nids = ind[int(len(ind)*0.75):]

In [10]:
train_df = df_final.loc[train_nids]
val_df = df_final.loc[test_nids]
train_labels = node_labels[train_nids]
val_labels = node_labels[test_nids]

# Catboost

In [None]:
train_pool = Pool(
    data=train_df,
    label=train_labels
)

val_pool = Pool(
    data=val_df,
    label=val_labels
)

In [None]:
params = {
    'task_type': 'CPU',              # Если есть GPU, можно поставить 'GPU'
    'loss_function': 'RMSE',         # Или другая метрика, если нужна классификация/др.
    'iterations': 1500,             # Можно поставить 1000-3000; дополнительно использовать раннюю остановку
    'learning_rate': 0.03,          # Классический размер шага (обучение «не слишком быстрое, не слишком медленное»)
    'depth': 6,                     # Глубина деревьев: 6-8 — хороший компромисс
    'l2_leaf_reg': 3,               # Регуляризация; базовое значение — 3
    'random_strength': 1,           # Сила случайности при расщеплениях
    'bagging_temperature': 1,       # Для стекинга/бэггинга внутри CatBoost
    'boosting_type': 'Plain',       # 'Ordered' может быть полезно при небольших датасетах
    'grow_policy': 'SymmetricTree', # Альтернативы: 'Depthwise' (быстрее, но возможный оверфит), 'Lossguide'
    'random_seed': 56,              # Для воспроизводимости
    'thread_count': -1,             # Использовать все доступные ядра CPU
    'od_type': 'Iter',              # early_stopping по количеству итераций
    'od_wait': 50,                  # Количество итераций без улучшения на валидации, после которых обучение останавливается
}


In [None]:
model_cb = CatBoostRegressor(**params)
model_cb.fit(train_pool, eval_set=val_pool, plot=False, verbose=True, use_best_model=True)

0:	learn: 7.8645546	test: 11.8762397	best: 11.8762397 (0)	total: 27.6ms	remaining: 41.3s
1:	learn: 7.8385629	test: 11.8525156	best: 11.8525156 (1)	total: 51.6ms	remaining: 38.6s
2:	learn: 7.8139292	test: 11.8289731	best: 11.8289731 (2)	total: 79.7ms	remaining: 39.8s
3:	learn: 7.7907052	test: 11.8068985	best: 11.8068985 (3)	total: 104ms	remaining: 38.8s
4:	learn: 7.7685586	test: 11.7853463	best: 11.7853463 (4)	total: 128ms	remaining: 38.4s
5:	learn: 7.7475747	test: 11.7655970	best: 11.7655970 (5)	total: 153ms	remaining: 38.2s
6:	learn: 7.7275448	test: 11.7459552	best: 11.7459552 (6)	total: 177ms	remaining: 37.8s
7:	learn: 7.7059699	test: 11.7257916	best: 11.7257916 (7)	total: 202ms	remaining: 37.7s
8:	learn: 7.6878101	test: 11.7079582	best: 11.7079582 (8)	total: 226ms	remaining: 37.5s
9:	learn: 7.6702504	test: 11.6897096	best: 11.6897096 (9)	total: 250ms	remaining: 37.3s
10:	learn: 7.6496116	test: 11.6718135	best: 11.6718135 (10)	total: 274ms	remaining: 37.1s
11:	learn: 7.6340385	test: 

<catboost.core.CatBoostRegressor at 0x7f5c0a92de10>

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
test_preds = model_cb.predict(val_df)

mae = mean_absolute_error(val_labels, test_preds)
r2 = r2_score(val_labels, test_preds)
print(f'MAE на тестовом наборе: {mae}')

MAE на тестовом наборе: 7.58500788690543


In [None]:
model_cb.get_feature_importance(prettified=True).head()

Unnamed: 0,Feature Id,Importances
0,weight,35.613251
1,completion_percentage,13.017465
2,hight,9.9826
3,completed_level_of_education,8.087805
4,region,7.929263


In [None]:
np.min(model_cb.evals_result_['validation']['RMSE'])

10.129232073546214

In [None]:
node_labels[node_labels > 100].shape

(0,)

In [None]:
model_cb.save_model('catboost_model.cbm')

# Linreg

In [11]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(train_df, train_labels)

In [13]:
from sklearn.metrics import mean_absolute_error, r2_score
test_preds = lr.predict(val_df)

mae = mean_absolute_error(val_labels, test_preds)
r2 = r2_score(val_labels, test_preds)
print(f'MAE на тестовом наборе: {mae}')

MAE на тестовом наборе: 8.592285367525175
