In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
friends = pd.read_csv('/content/drive/MyDrive/vkcup-ml/otbor/friends.csv')
X = pd.read_csv('/content/drive/MyDrive/vkcup-ml/otbor/train.csv')
edu = pd.read_csv('/content/drive/MyDrive/vkcup-ml/otbor/trainEducationFeatures.csv')
groups = pd.read_csv('/content/drive/MyDrive/vkcup-ml/otbor/trainGroups.csv')

In [None]:
friends

Unnamed: 0,uid,fuid
0,105728,104361
1,2026,38044
2,107090,6391
3,88157,6391
4,49843,6391
...,...,...
4109872,23805,36888
4109873,68932,51548
4109874,112906,5612
4109875,73731,35997


In [None]:
G = nx.from_pandas_edgelist(friends, 'uid', 'fuid')

In [None]:
X = pd.merge(X, edu, 'left')

In [None]:
# Store each training user's school-graduation year ("school") and
# registration year ("reg") on their graph node. The mappings are built from X,
# so only nodes whose uid appears in X receive these attributes; the values
# themselves may still be NaN (school_education is missing for many users).
nx.set_node_attributes(G, X[['uid', 'school_education']].set_index('uid').to_dict()['school_education'], "school")
nx.set_node_attributes(G, X[['uid', 'registered_year']].set_index('uid').to_dict()['registered_year'], "reg")

In [None]:
degrees = pd.DataFrame(G.degree())
degrees = degrees.rename(columns = {0: 'uid', 1: 'degree'})

In [None]:
X = pd.merge(X, degrees, 'left')

In [None]:
def get_friend_school(G, df):
    """Add mean and spread of the friends' school-graduation years to ``df``.

    For every ``uid`` in ``df`` the ``'school'`` node attribute of each
    neighbour in ``G`` is collected (neighbours without the attribute are
    skipped) and summarised with NaN-aware statistics.

    Parameters
    ----------
    G : graph-like
        Must support ``u in G.nodes``, ``G[u]`` (iterable of neighbours) and
        ``G.nodes[v]`` (attribute mapping), as a ``networkx.Graph`` does.
    df : pandas.DataFrame
        Must contain a ``'uid'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with two extra columns: ``friend_school`` (NaN-aware mean)
        and ``friend_school_std`` (NaN-aware standard deviation, numpy's
        default ``ddof=0``). Both are NaN when the user is not in the graph or
        when no neighbour carries a non-NaN ``'school'`` value.
    """
    mean_col = []
    std_col = []
    for u in df['uid'].values:
        if u not in G.nodes:
            mean_col.append(np.nan)
            std_col.append(np.nan)
            continue

        values = np.asarray(
            [G.nodes[v]['school'] for v in G[u] if 'school' in G.nodes[v]],
            dtype=float,
        )
        if values.size == 0 or np.isnan(values).all():
            # np.nanmean / np.nanstd would also yield NaN here, but only after
            # emitting a RuntimeWarning per user ("Mean of empty slice",
            # "Degrees of freedom <= 0"), which floods the notebook output.
            mean_col.append(np.nan)
            std_col.append(np.nan)
        else:
            mean_col.append(np.nanmean(values))
            std_col.append(np.nanstd(values))

    return df.assign(friend_school=mean_col, friend_school_std=std_col)

In [None]:
def get_ego_school(G, df):
    """Add a degree-weighted sum of the friends' school years to ``df``.

    For every ``uid`` in ``df`` that is a node of ``G``, the ego graph
    ``nx.ego_graph(G, uid)`` is built. Each neighbour ``v`` that has a
    ``'school'`` attribute contributes ``school(v) * deg_E(v) / (2 * |E|)``,
    where ``deg_E`` is the degree inside the ego graph and ``|E|`` is the ego
    graph's edge count. The contributions are added up ignoring NaNs.

    Parameters
    ----------
    G : networkx.Graph
        Friendship graph whose nodes may carry a ``'school'`` attribute.
    df : pandas.DataFrame
        Must contain a ``'uid'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with a ``friend_school`` column. The value is NaN when the
        user is not in the graph or when no neighbour has a non-NaN
        ``'school'`` value (a plain ``np.nansum`` would report ``0.0`` there,
        which is indistinguishable from a real feature value).

    Notes
    -----
    * The column name ``friend_school`` is the same one ``get_friend_school``
      writes, so applying both functions to one frame overwrites the first
      result.
    * NOTE(review): the neighbours' weights sum to less than 1 (the ego node's
      own degree and neighbours without data are part of the normaliser), so
      the result is not on the scale of a calendar year - confirm this is the
      intended feature.
    """
    mean_col = []
    for u in df['uid'].values:
        if u not in G.nodes:
            mean_col.append(np.nan)
            continue

        E = nx.ego_graph(G, u)
        # Twice the edge count equals the sum of all degrees in the ego graph.
        norm = 2 * E.number_of_edges()
        d = E.degree()
        weighted = np.asarray(
            [
                G.nodes[v]['school'] * d[v] / norm
                for v in G[u]
                if 'school' in G.nodes[v]
            ],
            dtype=float,
        )
        if weighted.size == 0 or np.isnan(weighted).all():
            # No usable school year among the neighbours: report "missing"
            # instead of a misleading 0.0.
            mean_col.append(np.nan)
        else:
            mean_col.append(np.nansum(weighted))

    return df.assign(friend_school=mean_col)

In [None]:
def get_friend_reg(G, df):
    """Add mean and spread of the friends' registration years to ``df``.

    For every ``uid`` in ``df`` the ``'reg'`` node attribute of each neighbour
    in ``G`` is collected (neighbours without the attribute are skipped) and
    summarised with NaN-aware statistics.

    Parameters
    ----------
    G : graph-like
        Must support ``u in G.nodes``, ``G[u]`` (iterable of neighbours) and
        ``G.nodes[v]`` (attribute mapping), as a ``networkx.Graph`` does.
    df : pandas.DataFrame
        Must contain a ``'uid'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with two extra columns: ``friend_reg`` (NaN-aware mean) and
        ``friend_reg_std`` (NaN-aware standard deviation, numpy's default
        ``ddof=0``). Both are NaN when the user is not in the graph or when no
        neighbour carries a non-NaN ``'reg'`` value.
    """
    mean_col = []
    std_col = []
    for u in df['uid'].values:
        if u not in G.nodes:
            mean_col.append(np.nan)
            std_col.append(np.nan)
            continue

        values = np.asarray(
            [G.nodes[v]['reg'] for v in G[u] if 'reg' in G.nodes[v]],
            dtype=float,
        )
        if values.size == 0 or np.isnan(values).all():
            # Same result as np.nanmean / np.nanstd on an empty or all-NaN
            # input, but without the RuntimeWarning each of them would emit.
            mean_col.append(np.nan)
            std_col.append(np.nan)
        else:
            mean_col.append(np.nanmean(values))
            std_col.append(np.nanstd(values))

    return df.assign(friend_reg=mean_col, friend_reg_std=std_col)

In [None]:
#X = get_friend_school(G, X)

In [None]:
X = get_ego_school(G, X)

In [None]:
X = get_friend_reg(G, X)

  # Remove the CWD from sys.path while we load stuff.
  keepdims=keepdims)


In [None]:
groups

Unnamed: 0,uid,gid
0,34673,110624
1,119303,108687
2,89035,234084
3,7322,351543
4,102327,304142
...,...,...
1086086,41186,90661
1086087,80927,348122
1086088,68207,213837
1086089,112909,342369


In [None]:
# Count the group memberships (gid rows) of every user and left-join that
# count onto X as the `groups` column. Users with no row in `groups` come out
# of the join as NaN.
X = pd.merge(X, groups.groupby('uid', as_index=False).count().rename(columns={'gid': 'groups'}), 'left')

In [None]:
X['groups'] = X['groups'].fillna(0)

In [None]:
X

Unnamed: 0,uid,age,registered_year,school_education,graduation_1,graduation_2,graduation_3,graduation_4,graduation_5,graduation_6,graduation_7,degree,friend_school,friend_reg,friend_reg_std,groups
0,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0
1,19171,30,2009,,,,,,,,,92.0,235.386544,2011.500000,3.106445,100.0
2,7780,51,2012,,,,,,,,,26.0,106.595745,2009.333333,1.247219,0.0
3,14998,17,2019,2019.0,,,,,,,,1.0,0.000000,,,0.0
4,94913,33,2010,2003.0,,,,,,,,19.0,108.027027,2009.166667,1.067187,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29464,12575,34,2011,,,,,,,,,44.0,192.876923,2013.692308,3.171620,0.0
29465,96062,39,2012,,,,,,,,,21.0,111.277778,2012.400000,3.006659,0.0
29466,33676,44,2012,,,,,,,,,17.0,0.000000,2012.666667,2.624669,0.0
29467,24043,38,2011,,,,,,,,,81.0,224.647490,2013.000000,3.741657,10.0


In [None]:
# Expand X to one row per (user, group) membership so user features can be
# aggregated per group. The outer join keeps rows from both sides: users
# without any group stay in the frame with gid = NaN.
df_groups = pd.merge(X, groups, how='outer', on='uid')

In [None]:
df_groups

Unnamed: 0,uid,age,registered_year,school_education,graduation_1,graduation_2,graduation_3,graduation_4,graduation_5,graduation_6,graduation_7,degree,friend_school,friend_reg,friend_reg_std,groups,gid
0,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,320494.0
1,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,177386.0
2,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,220078.0
3,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,354929.0
4,19171,30,2009,,,,,,,,,92.0,235.386544,2011.500000,3.106445,100.0,282498.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094962,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,65571.0
1094963,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,356707.0
1094964,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,140215.0
1094965,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,228298.0


In [None]:
# Per-group mean of the members' school-graduation year and registration year.
# The two columns are selected before aggregating: the result is the same
# frame, but the mean of every other feature column is no longer computed
# only to be thrown away.
groups_school = df_groups.groupby('gid')[['school_education', 'registered_year']].mean()

In [None]:
df_groups = pd.merge(df_groups, groups_school, how='left', on='gid')

In [None]:
df_groups

Unnamed: 0,uid,age,registered_year_x,school_education_x,graduation_1,graduation_2,graduation_3,graduation_4,graduation_5,graduation_6,graduation_7,degree,friend_school,friend_reg,friend_reg_std,groups,gid,school_education_y,registered_year_y
0,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,320494.0,1998.553571,2013.158730
1,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,177386.0,1998.461957,2012.677778
2,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,220078.0,,2020.000000
3,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,354929.0,2000.945472,2012.407797
4,19171,30,2009,,,,,,,,,92.0,235.386544,2011.500000,3.106445,100.0,282498.0,1987.920000,2013.388060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094962,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,65571.0,2007.500000,2009.000000
1094963,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,356707.0,2008.183099,2011.991266
1094964,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,140215.0,2006.000000,2009.000000
1094965,105719,32,2008,2009.0,,2016.0,,,,,,150.0,169.835774,2010.657143,2.682673,100.0,228298.0,2008.571429,2011.074074


In [None]:
# Collapse back to one row per user: average the per-group means (the `_y`
# columns added by the previous merge) over all groups the user belongs to,
# and expose them as `group_school` / `group_reg`.
df_groups = df_groups.groupby('uid', as_index=False).agg({'school_education_y': 'mean', 'registered_year_y': 'mean'}).rename(columns={'school_education_y': 'group_school', 'registered_year_y': 'group_reg'})

In [None]:
df_groups

Unnamed: 0,uid,group_school,group_reg
0,1,2004.306215,2011.835472
1,4,2004.676584,2011.802238
2,5,,
3,8,2005.266381,2012.264732
4,10,2003.380309,2013.188015
...,...,...,...
29464,120047,,
29465,120048,2017.272767,2017.053870
29466,120049,2002.756353,2012.144023
29467,120050,2004.661517,2011.017149


In [None]:
X = pd.merge(X, df_groups, 'left')

In [None]:
X

Unnamed: 0,uid,age,registered_year,school_education,graduation_1,graduation_2,graduation_3,graduation_4,graduation_5,graduation_6,graduation_7,degree,friend_school,friend_reg,friend_reg_std,groups,group_school,group_reg
0,85788,32,2020,,,,,,,,,20.0,233.300000,2012.222222,3.520662,4.0,1999.320333,2014.561076
1,19171,30,2009,,,,,,,,,92.0,235.386544,2011.500000,3.106445,100.0,2002.485901,2012.087726
2,7780,51,2012,,,,,,,,,26.0,106.595745,2009.333333,1.247219,0.0,,
3,14998,17,2019,2019.0,,,,,,,,1.0,0.000000,,,0.0,,
4,94913,33,2010,2003.0,,,,,,,,19.0,108.027027,2009.166667,1.067187,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29464,12575,34,2011,,,,,,,,,44.0,192.876923,2013.692308,3.171620,0.0,,
29465,96062,39,2012,,,,,,,,,21.0,111.277778,2012.400000,3.006659,0.0,,
29466,33676,44,2012,,,,,,,,,17.0,0.000000,2012.666667,2.624669,0.0,,
29467,24043,38,2011,,,,,,,,,81.0,224.647490,2013.000000,3.741657,10.0,2004.364283,2011.920498


In [None]:
y = X['age']
X = X.drop(columns=['uid', 'age'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
X_train

Unnamed: 0,registered_year,school_education,graduation_1,graduation_2,graduation_3,graduation_4,graduation_5,graduation_6,graduation_7,degree,friend_school,friend_reg,friend_reg_std,groups,group_school,group_reg
15654,2012,,,,,,,,,161.0,150.224951,2013.027778,3.113466,0.0,,
10646,2019,,,,,,,,,2.0,0.000000,2016.000000,0.000000,0.0,,
29179,2010,,,,,,,,,198.0,134.315457,2012.235294,3.058824,30.0,2004.701598,2011.734959
2701,2015,,,,,,,,,2.0,665.666667,2008.000000,0.000000,4.0,2003.000000,2013.625000
16808,2013,,,,,,,,,101.0,72.153132,2014.037037,3.882358,100.0,2003.908913,2012.294559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,2009,,,,,,,,,17.0,0.000000,2014.333333,4.496913,0.0,,
5390,2012,1999.0,,,,,,,,95.0,132.743191,2012.363636,3.587167,100.0,2001.822920,2011.657980
860,2009,,,,,,,,,19.0,115.269231,2013.000000,3.000000,100.0,1998.556023,2012.321275
15795,2016,,,,,,,,,14.0,99.800000,2015.000000,2.828427,3.0,1998.547351,2011.781708


In [None]:
y_train

15654    34
10646    22
29179    35
2701     40
16808    27
         ..
21575    41
5390     39
860      65
15795    50
23654    29
Name: age, Length: 25048, dtype: int64

In [None]:
from catboost import CatBoostRegressor, Pool, metrics, cv

In [None]:
model = CatBoostRegressor(
    iterations=5000,
    loss_function='RMSE',
    random_seed=42,
    early_stopping_rounds=40,
    verbose=False,
    logging_level=None
)

In [None]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [None]:
grid = {'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

In [None]:
grid_search_result = model.grid_search(grid, train_pool, cv=5)

Stopped by overfitting detector  (40 iterations wait)

bestTest = 8.985077855
bestIteration = 1411

0:	loss: 8.9850779	best: 8.9850779 (0)	total: 4.48s	remaining: 1m 2s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 8.987375474
bestIteration = 1384

1:	loss: 8.9873755	best: 8.9850779 (0)	total: 8.71s	remaining: 56.6s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 8.977304008
bestIteration = 1917

2:	loss: 8.9773040	best: 8.9773040 (2)	total: 14.5s	remaining: 58.1s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 8.992974386
bestIteration = 1315

3:	loss: 8.9929744	best: 8.9773040 (2)	total: 18.6s	remaining: 51.1s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 8.989217117
bestIteration = 1570

4:	loss: 8.9892171	best: 8.9773040 (2)	total: 23.4s	remaining: 46.7s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 8.9550242
bestIteration = 823

5:	loss: 8.9550242	best: 8.9550242 (5)	total: 27.2s	remain

In [None]:
grid_search_result['params']

{'depth': 6, 'l2_leaf_reg': 1}

In [None]:
grid_search_result['cv_results']

defaultdict(list,
            {'iterations': [0,
              1,
              2,
              3,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              11,
              12,
              13,
              14,
              15,
              16,
              17,
              18,
              19,
              20,
              21,
              22,
              23,
              24,
              25,
              26,
              27,
              28,
              29,
              30,
              31,
              32,
              33,
              34,
              35,
              36,
              37,
              38,
              39,
              40,
              41,
              42,
              43,
              44,
              45,
              46,
              47,
              48,
              49,
              50,
              51,
              52,
              53,
      

In [None]:
# Truncate the fitted model to its first 696 trees.
# NOTE(review): 696 is hard-coded; presumably it was read off the grid-search
# CV results above - confirm, and derive it from `grid_search_result` so a
# re-run with different data or seed does not silently use a stale value.
model.shrink(696)

In [None]:
model.get_all_params()

{'auto_class_weights': 'None',
 'bayesian_matrix_reg': 0.10000000149011612,
 'best_model_min_trees': 1,
 'boost_from_average': True,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'border_count': 254,
 'classes_count': 0,
 'depth': 6,
 'eval_metric': 'RMSE',
 'feature_border_type': 'GreedyLogSum',
 'grow_policy': 'SymmetricTree',
 'iterations': 5000,
 'l2_leaf_reg': 1,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'leaf_estimation_iterations': 1,
 'leaf_estimation_method': 'Newton',
 'learning_rate': 0.029999999329447743,
 'loss_function': 'RMSE',
 'max_leaves': 64,
 'min_data_in_leaf': 1,
 'model_shrink_mode': 'Constant',
 'model_shrink_rate': 0,
 'model_size_reg': 0.5,
 'nan_mode': 'Min',
 'od_pval': 0,
 'od_type': 'Iter',
 'od_wait': 40,
 'penalties_coefficient': 1,
 'pool_metainfo_options': {'tags': {}},
 'posterior_sampling': False,
 'random_seed': 42,
 'random_strength': 1,
 'rsm': 1,
 'sampling_frequency': 'PerTree',
 'score_function': 'Cosine',
 'sparse_features_co

In [None]:
model.get_best_score()

{'learn': {'RMSE': 7.401161534179478}}

In [None]:
len(model.eval_metrics(test_pool, 'RMSE')['RMSE'])

696

In [None]:
model.eval_metrics(test_pool, 'RMSE')['RMSE'][-1]

8.84147860434964

In [None]:
pd.Series(model.predict(test_pool))

0       66.085294
1       38.002589
2       33.115240
3       19.350851
4       41.504934
          ...    
4416    42.484889
4417    36.373582
4418    22.337442
4419    29.742267
4420    31.969597
Length: 4421, dtype: float64

In [None]:
res = X_test

In [None]:
X_test.assign(age=model.predict(test_pool))

Unnamed: 0,registered_year,school_education,graduation_1,graduation_2,graduation_3,graduation_4,graduation_5,graduation_6,graduation_7,degree,friend_school,friend_school_std,friend_reg,friend_reg_std,groups,group_school,group_reg,age
14492,2012,1969.0,,,,,,,,15.0,1980.000000,0.000000,2012.500000,3.840573,58.0,1996.103219,2012.819052,66.085294
13664,2013,2000.0,,,,,,,,106.0,1999.875000,10.647036,2012.347826,3.357130,68.0,2001.425783,2012.851591,38.002589
12102,2010,2006.0,,,,,,,,125.0,2006.111111,2.960647,2012.212121,3.235762,100.0,2002.133616,2012.107836,33.115240
11453,2013,,,,,,,,,13.0,,,2013.000000,2.000000,100.0,2015.142276,2014.078070,19.350851
14888,2008,,,,,,,,,29.0,2001.333333,10.739336,2010.500000,2.549510,29.0,2001.618640,2011.378284,41.504934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3537,2015,,,,,,,,,30.0,2001.750000,2.861381,2011.714286,1.665986,29.0,1999.824617,2013.104906,42.484889
15980,2018,,,,,,,,,,,,,,0.0,,,36.373582
9750,2020,,,,,,,,,92.0,1997.857143,10.161954,2013.555556,3.744955,10.0,2010.630852,2015.290003,22.337442
20520,2021,,,,,,,,,37.0,2004.000000,6.976150,2016.000000,5.259911,1.0,2002.922129,2013.054199,29.742267


In [None]:
model.save_model("model_group_reg_cv")

In [None]:
model.get_feature_importance()

In [None]:
# Load a previously saved CatBoost model from disk.
# NOTE(review): this path ("/content/model_friend_school") is not the file
# written by `model.save_model("model_group_reg_cv")` earlier in this
# notebook, so this cell depends on an artifact from another session - confirm
# it exists before a fresh Restart & Run All.
# NOTE(review): this import sits far from the other imports; consider moving
# it to the top import cell.
from catboost import CatBoost
new_model = CatBoost()
new_model.load_model("/content/model_friend_school")

<catboost.core.CatBoost at 0x7f6b536161d0>

In [None]:
new_model.get_all_params()

{'auto_class_weights': 'None',
 'bayesian_matrix_reg': 0.1000000015,
 'best_model_min_trees': 1,
 'boost_from_average': True,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'border_count': 254,
 'classes_count': 0,
 'depth': 6,
 'eval_metric': 'RMSE',
 'feature_border_type': 'GreedyLogSum',
 'grow_policy': 'SymmetricTree',
 'iterations': 5000,
 'l2_leaf_reg': 3,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'leaf_estimation_iterations': 1,
 'leaf_estimation_method': 'Newton',
 'learning_rate': 0.02999999933,
 'loss_function': 'RMSE',
 'max_leaves': 64,
 'min_data_in_leaf': 1,
 'model_shrink_mode': 'Constant',
 'model_shrink_rate': 0,
 'model_size_reg': 0.5,
 'nan_mode': 'Min',
 'od_pval': 0,
 'od_type': 'Iter',
 'od_wait': 40,
 'penalties_coefficient': 1,
 'pool_metainfo_options': {'tags': {}},
 'posterior_sampling': False,
 'random_seed': 42,
 'random_strength': 1,
 'rsm': 1,
 'sampling_frequency': 'PerTree',
 'score_function': 'Cosine',
 'sparse_features_conflict_fractio

In [None]:
new_model.shrink(588)

In [None]:
new_model.predict(X_test)

array([67.51916481, 38.35857808, 32.05387566, ..., 36.21660321,
       31.99233181, 33.71063802])

In [None]:
new_model.get_all_params()

{'auto_class_weights': 'None',
 'bayesian_matrix_reg': 0.1000000015,
 'best_model_min_trees': 1,
 'boost_from_average': True,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'border_count': 254,
 'classes_count': 0,
 'depth': 6,
 'eval_metric': 'RMSE',
 'feature_border_type': 'GreedyLogSum',
 'grow_policy': 'SymmetricTree',
 'iterations': 5000,
 'l2_leaf_reg': 3,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'leaf_estimation_iterations': 1,
 'leaf_estimation_method': 'Newton',
 'learning_rate': 0.02999999933,
 'loss_function': 'RMSE',
 'max_leaves': 64,
 'min_data_in_leaf': 1,
 'model_shrink_mode': 'Constant',
 'model_shrink_rate': 0,
 'model_size_reg': 0.5,
 'nan_mode': 'Min',
 'od_pval': 0,
 'od_type': 'Iter',
 'od_wait': 40,
 'penalties_coefficient': 1,
 'pool_metainfo_options': {'tags': {}},
 'posterior_sampling': False,
 'random_seed': 42,
 'random_strength': 1,
 'rsm': 1,
 'sampling_frequency': 'PerTree',
 'score_function': 'Cosine',
 'sparse_features_conflict_fractio

In [None]:
nx.__version__

'2.5.1'

In [None]:
res = edu[['uid', 'school_education']]

In [None]:
res

Unnamed: 0,uid,school_education
0,356,
1,26356,
2,97,
3,319,
4,30288,2008.0
...,...,...
29464,33331,1987.0
29465,119788,
29466,97581,2002.0
29467,104038,1969.0


In [None]:
# Rule-based baseline: age = (2021 - school graduation year) + 18.
# Rows without a graduation year get age = NaN here.
# NOTE(review): presumably 2021 is the reference year and 18 the assumed age
# at graduation - confirm both constants.
# NOTE(review): the `res` table displayed a few cells further down (e.g.
# 2008 -> 13.0) matches `2021 - school_education` without the +18 term, so the
# saved outputs look stale relative to this formula.
res = res.assign(age=2021-res.school_education+18)

In [None]:
res.describe()

Unnamed: 0,uid,school_education,age
count,29469.0,9114.0,9114.0
mean,59836.123791,2000.950955,38.049045
std,34764.131475,12.326599,12.326599
min,1.0,1941.0,11.0
25%,29577.0,1994.0,30.0
50%,59799.0,2003.0,36.0
75%,89901.0,2009.0,45.0
max,120055.0,2028.0,98.0


In [None]:
res.describe()

Unnamed: 0,uid,school_education,age
count,29469.0,9114.0,9114.0
mean,59836.123791,2000.950955,38.049045
std,34764.131475,12.326599,12.326599
min,1.0,1941.0,11.0
25%,29577.0,1994.0,30.0
50%,59799.0,2003.0,36.0
75%,89901.0,2009.0,45.0
max,120055.0,2028.0,98.0


In [None]:
# Fall back to a constant age for users without a graduation year.
# Only `age` is filled: a blanket fillna(35) would also write a bogus
# "year 35" into school_education.
# NOTE(review): 35 is hard-coded; presumably chosen close to the median of the
# estimated ages reported by describe() above - confirm.
res = res.fillna({'age': 35})

In [None]:
res

Unnamed: 0,uid,school_education,age
0,356,35.0,35.0
1,26356,35.0,35.0
2,97,35.0,35.0
3,319,35.0,35.0
4,30288,2008.0,13.0
...,...,...,...
29464,33331,1987.0,34.0
29465,119788,35.0,35.0
29466,97581,2002.0,19.0
29467,104038,1969.0,52.0


In [None]:
res = res.drop(columns=['school_education'])

In [None]:
res

Unnamed: 0,uid,age
0,356,35.0
1,26356,35.0
2,97,35.0
3,319,35.0
4,30288,13.0
...,...,...
29464,33331,34.0
29465,119788,35.0
29466,97581,19.0
29467,104038,52.0
