In [1]:
import gc
import json
import os
import random
import time

import pandas
import xgboost as xgb
from elasticsearch import Elasticsearch
from sklearn import metrics

In [2]:
# Load the borrower/snapshot feature table. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pandas.read_csv(filepath_or_buffer='data/dataset_400features.csv')
# Bare last expression -> rich (truncated) preview of the frame.
df

Unnamed: 0,borrower,snapshot,project,2u_defix,2v_04,2w_1,2a_03u_defix,2a_03v_04,2a_03w_1,2b_05,...,2s_t_9ff58f4ffb29fa2,2s_t_ffc97d72e13e010,2s_t_a06bc25b5805d5f,2s_t_6c3c78838c761c6,2s_a_04v_02,2s_a_02t_3b484b82567a09e,2a_03t_35a18000230da77,2s_a_03t_ae7ab96520de3a1,origin,label
0,0xe8215f5aee29a1db273c80c9d269e8fa44b39126,2021-02,Aave,77.0,39.0,80.0,16.0,11.0,18.0,11.0,...,,,,,,,,,True,0
1,0xf8219bd4d474a8ed9e4041d6cae01467f3ee1e56,2022-05,Aave,22.0,15.0,50.0,2.0,2.0,7.0,1.0,...,,,,,,,,,True,0
2,0xf9a6ed52c4988580954672aa91a1d5eb229e5c78,2021-10,Aave,16.0,42.0,78.0,3.0,5.0,8.0,9.0,...,,,,,,,,,True,0
3,0x8032244b3b41b487d8de22363bc436e869e6859f,2022-06,Aave,26.0,16.0,60.0,2.0,3.0,13.0,4.0,...,,,,,,,,,False,1
4,0x8032244b3b41b487d8de22363bc436e869e6859f,2022-03,Aave,20.0,4.0,28.0,2.0,2.0,2.0,1.0,...,,,,,,,,,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204730,0x0524fe637b77a6f5f0b3a024f7fd9fe1e688a291,2022-07,Compound,385.0,155.0,331.0,86.0,38.0,59.0,11.0,...,,,,,,,,,False,0
204731,0x0524fe637b77a6f5f0b3a024f7fd9fe1e688a291,2022-08,Compound,398.0,158.0,343.0,87.0,38.0,60.0,11.0,...,,,,,,,,,False,0
204732,0x0524fe637b77a6f5f0b3a024f7fd9fe1e688a291,2022-09,Compound,409.0,161.0,353.0,91.0,39.0,63.0,11.0,...,,,,,,,,,False,0
204733,0x5f75fb6104ac8ce72879347db1041adf2f7745d6,2021-07,Compound,28.0,14.0,45.0,8.0,3.0,11.0,3.0,...,,1.0,,1.0,,,,,True,0


In [3]:
# Columns that must not be fed to the model: the row identifiers
# (borrower, snapshot, project), the 'origin' column and the target 'label'.
not_features = ['borrower', 'snapshot', 'project', 'origin', 'label']

In [4]:
# Feature columns: everything in df that is not listed in not_features,
# kept in the frame's original column order.
fs = [
    column
    for column in df.columns
    if column not in not_features
]
# Sanity check: number of model inputs.
len(fs)

400

In [5]:
# Distinct borrower addresses over the whole dataset (all projects).
# Despite the name, this is a set.
list_address = {*df['borrower'].unique()}
len(list_address)

58232

In [6]:
# Distinct borrower addresses that appear in at least one Aave row.
list_address_aave = set(
    df.loc[df['project'] == 'Aave', 'borrower'].unique()
)
len(list_address_aave)

42619

In [7]:
# Distinct borrower addresses that appear in at least one Compound row.
list_address_compound = set(
    df.loc[df['project'] == 'Compound', 'borrower'].unique()
)
len(list_address_compound)

20652

In [8]:
# Borrowers present on both Aave and Compound. Kept as a list; the element
# order is arbitrary because it comes from iterating a set.
list_address_common = list(list_address_aave & list_address_compound)
len(list_address_common)

5039

In [9]:
# Training set: Aave rows strictly before the 2022-05 snapshot, with every
# borrower that also appears on Compound removed.
# 'snapshot' is compared as a 'YYYY-MM' string, so '<' is lexicographic.
df_train = df[
    (df['project'] == 'Aave')
    & (df['snapshot'] < '2022-05')
    & (~df['borrower'].isin(list_address_common))
]
len(df_train)

80784

In [10]:
# Validation set: all Aave rows of the 2022-05 snapshot, i.e. the month right
# after the training window. Borrowers shared with Compound are NOT removed here.
df_valid = df[
    (df['snapshot'] == '2022-05')
    & (df['project'] == 'Aave')
]
len(df_valid)

9575

In [11]:
# Wrap both splits in XGBoost's DMatrix container: the fs columns are the
# model inputs and the 'label' column is the target.
dtrain = xgb.DMatrix(data=df_train[fs], label=df_train['label'])
dvalid = xgb.DMatrix(data=df_valid[fs], label=df_valid['label'])


In [12]:
# (DMatrix, display name) pairs whose metric is reported during training.
eval_list = [
    (dtrain, 'train'),
    (dvalid, 'valid'),
]

In [13]:
# Booster hyper-parameters for a binary classifier scored with AUC.
params = dict(
    seed=123,                      # fixed seed so reruns are repeatable
    max_depth=6,                   # maximum depth of each tree
    eta=0.01,                      # learning rate (shrinkage per round)
    min_child_weight=50,           # minimum summed instance weight in a child
    eval_metric='auc',
    objective='binary:logistic',
)
# Number of boosting rounds passed to xgb.train.
num_round = 250

In [14]:
# Train the booster on the training DMatrix for num_round rounds. The metric
# for every entry in eval_list is printed every 100 rounds and after the last.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=eval_list,
    verbose_eval=100,
)

[0]	train-auc:0.60062	valid-auc:0.60195
[100]	train-auc:0.69594	valid-auc:0.65987
[200]	train-auc:0.72511	valid-auc:0.67672
[249]	train-auc:0.73391	valid-auc:0.69092


In [15]:
# Persist the trained booster under models/.
# Create the target folder first so a fresh checkout (where models/ may not
# exist yet) does not fail at the very last step; this is a no-op when the
# folder is already there.
# NOTE(review): the path has no file extension, so the on-disk format is
# whatever the installed XGBoost version chooses by default -- confirm before
# renaming, since downstream loaders may rely on this exact path.
os.makedirs('models', exist_ok=True)
bst.save_model('models/xgb400_aave_only')