In [1]:
# Expose only GPU index 0 to CUDA for this kernel process.
import os
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [None]:
from sklearn.model_selection import train_test_split
import openml
import category_encoders as ce
import numpy as np
import sklearn

# Fetch OpenML dataset 46933 together with its qualities and feature metadata.
# NOTE(review): the original comment called this the "splice" dataset, but the
# previewed columns are named molecule_structure_property_* - confirm the dataset id.
dataset = openml.datasets.get_dataset(
    46933,
    download_data=True,
    download_qualities=True,
    download_features_meta_data=True,
)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)
# Positions of the columns that OpenML flags as categorical.
categorical_feature_indices = [
    position
    for position, is_categorical in enumerate(categorical_indicator)
    if is_categorical
]

# First hold out 20% of all rows as the test split ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... then carve 20% of the remaining rows out as the validation split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_valid.shape[0])
print("Test set size:", X_test.shape[0])

# Preview the first training rows as the cell output.
X_train.head()

Training set size: 2460
Validation set size: 616
Test set size: 769


Unnamed: 0,molecule_structure_property_1,molecule_structure_property_2,molecule_structure_property_3,molecule_structure_property_4,molecule_structure_property_5,molecule_structure_property_6,molecule_structure_property_7,molecule_structure_property_8,molecule_structure_property_9,molecule_structure_property_10,...,molecule_structure_property_1608,molecule_structure_property_1609,molecule_structure_property_1610,molecule_structure_property_1611,molecule_structure_property_1612,molecule_structure_property_1613,molecule_structure_property_1614,molecule_structure_property_1615,molecule_structure_property_1616,molecule_structure_property_1617
1484,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2157,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3546,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from GRANDE import GRANDE

# Hyperparameters handed to GRANDE; later cells also read params['problem_type'].
params = dict(
    # Model structure
    depth=5,
    n_estimators=1024,

    # Learning rates (one per parameter group, as named by the keys)
    learning_rate_weights=0.001,
    learning_rate_index=0.01,
    learning_rate_values=0.01,
    learning_rate_leaf=0.05,
    learning_rate_embedding=0.01,

    # Embedding / categorical handling options
    use_category_embeddings=True,
    embedding_dim_cat=8,
    use_numeric_embeddings=False,
    embedding_dim_num=8,
    embedding_threshold=1,
    loo_cardinality=10,

    # Sampling-related switches
    dropout=0.2,
    selected_variables=0.8,
    data_subset_fraction=1.0,
    bootstrap=False,
    missing_values=False,

    # Optimisation
    optimizer='adam',  # options noted by the original author: nadam, radam, adamw, adam
    cosine_decay_restarts=False,
    reduce_on_plateau_scheduler=True,
    label_smoothing=0.0,
    use_class_weights=False,
    focal_loss=False,
    swa=False,
    # if True use AUC for binary, MSE for regression, val_loss for multiclass
    es_metric=True,

    # Training loop
    epochs=250,
    batch_size=256,
    early_stopping_epochs=50,

    # Preprocessing flags and task type
    use_freq_enc=False,
    use_robust_scale_smoothing=False,
    problem_type='multiclass',

    random_seed=42,
    verbose=2,
)

model_grande = GRANDE(params=params)

# Train on the training split, passing the validation split alongside it.
model_grande.fit(
    X=X_train,
    y=y_train,
    X_val=X_valid,
    y_val=y_valid,
)

# Prediction output of predict_proba on the held-out test split.
preds_grande = model_grande.predict_proba(X_test)

self.params {'depth': 5, 'n_estimators': 1024, 'learning_rate_weights': 0.001, 'learning_rate_index': 0.01, 'learning_rate_values': 0.01, 'learning_rate_leaf': 0.05, 'learning_rate_embedding': 0.01, 'use_category_embeddings': True, 'embedding_dim_cat': 8, 'use_numeric_embeddings': False, 'embedding_dim_num': 8, 'embedding_threshold': 1, 'loo_cardinality': 10, 'dropout': 0.2, 'selected_variables': 0.8, 'data_subset_fraction': 1.0, 'bootstrap': False, 'missing_values': False, 'optimizer': 'adam', 'cosine_decay_restarts': False, 'reduce_on_plateau_scheduler': True, 'label_smoothing': 0.0, 'use_class_weights': False, 'focal_loss': False, 'swa': False, 'es_metric': True, 'epochs': 250, 'batch_size': 256, 'early_stopping_epochs': 50, 'use_freq_enc': False, 'use_robust_scale_smoothing': False, 'problem_type': 'multiclass', 'random_seed': 42, 'verbose': 2, 'device': 'cuda:0', 'objective': 'multiclass'}
X_train shape before embeddings (2460, 1617)
number_of_variables 1 1617
number_of_variables 

In [4]:
def calculate_sample_weights(y_data):
    """Return per-sample weights that balance the classes found in ``y_data``.

    Thin wrapper around ``sklearn.utils.class_weight.compute_sample_weight``
    called with ``class_weight='balanced'``.

    Args:
        y_data: Array-like of class labels, one entry per sample.

    Returns:
        The array produced by ``compute_sample_weight`` for ``y_data``.
    """
    # The previous version also called compute_class_weight but discarded the
    # result; only the per-sample weights are needed.
    return sklearn.utils.class_weight.compute_sample_weight(class_weight='balanced', y=y_data)

In [5]:
def _labels_as_float(target):
    """Convert one target vector to a float64 array.

    Uses the integer category codes when the underlying values expose a
    ``codes`` attribute (pandas categoricals); otherwise casts the raw values.
    Objects without a ``.values`` attribute (e.g. an ndarray produced by an
    earlier run of this cell) are used as-is, so re-running the cell is safe.
    """
    values = getattr(target, 'values', target)
    codes = getattr(values, 'codes', None)
    return (values if codes is None else codes).astype(np.float64)


# Explicit attribute checks replace the former bare ``except:``, which hid
# unrelated errors and failed on re-execution.
y_train = _labels_as_float(y_train)
y_valid = _labels_as_float(y_valid)
y_test = _labels_as_float(y_test)

In [6]:
# Partition the feature columns by type and cardinality so that each group can
# be given its own encoder later:
#   binary_indices           - categorical columns with at most 2 distinct values
#   low_cardinality_indices  - categorical columns with 3 or 4 distinct values
#   high_cardinality_indices - categorical columns with 5 or more distinct values
#   num_columns              - every column not flagged as categorical
binary_indices = []
low_cardinality_indices = []
high_cardinality_indices = []
num_columns = []

# Set lookup instead of scanning the index list once per column.
categorical_positions = set(categorical_feature_indices)

for column_index, column in enumerate(X_train.columns):
    if column_index not in categorical_positions:
        num_columns.append(column)
        continue

    # Count distinct values once (NaN counts as a value, as with len(unique())).
    n_unique = len(X_train.iloc[:, column_index].unique())
    if n_unique <= 2:
        binary_indices.append(column)
    elif n_unique < 5:
        # ``elif`` keeps the groups disjoint; previously binary columns were
        # also added here and so were encoded twice downstream.
        low_cardinality_indices.append(column)
    else:
        high_cardinality_indices.append(column)

numeric_names = set(num_columns)
cat_columns = [col for col in X_train.columns if col not in numeric_names]

In [7]:
# Impute missing values with statistics computed on the training split only,
# then apply the same fill values to the validation and test splits.
if len(num_columns) > 0:
    # Series of per-column training means. The previous ``.iloc[0]`` reduced
    # this to the mean of the first numeric column and used that single scalar
    # for every numeric column.
    mean_train_num = X_train[num_columns].mean(axis=0)
    X_train[num_columns] = X_train[num_columns].fillna(mean_train_num)
    X_valid[num_columns] = X_valid[num_columns].fillna(mean_train_num)
    X_test[num_columns] = X_test[num_columns].fillna(mean_train_num)
if len(cat_columns) > 0:
    # First row of ``mode`` holds one most-frequent value per column.
    mode_train_cat = X_train[cat_columns].mode(axis=0).iloc[0]
    X_train[cat_columns] = X_train[cat_columns].fillna(mode_train_cat)
    X_valid[cat_columns] = X_valid[cat_columns].fillna(mode_train_cat)
    X_test[cat_columns] = X_test[cat_columns].fillna(mode_train_cat)

# Snapshots of the imputed but not yet encoded frames.
X_train_raw = X_train.copy()
X_valid_raw = X_valid.copy()
X_test_raw = X_test.copy()

In [8]:
# Encode the categorical columns in three passes; every encoder is fitted on the
# training split only and then applied to all three splits.

# 1) Binary columns -> ordinal codes.
encoder_ordinal = ce.OrdinalEncoder(cols=binary_indices)
encoder_ordinal.fit(X_train)
X_train = encoder_ordinal.transform(X_train)
X_valid = encoder_ordinal.transform(X_valid)
X_test = encoder_ordinal.transform(X_test)

# 2) High-cardinality columns -> leave-one-out target encoding.
encoder = ce.LeaveOneOutEncoder(cols=high_cardinality_indices)
encoder.fit(X_train, y_train)
# Training rows must be transformed *with* their targets so each row's own
# label is left out; without ``y`` the plain category mean is used, which leaks
# the target into the training features. Validation/test are transformed
# without ``y``.
X_train = encoder.transform(X_train, y_train)
X_valid = encoder.transform(X_valid)
X_test = encoder.transform(X_test)

# 3) Low-cardinality columns -> one-hot indicator columns.
encoder = ce.OneHotEncoder(cols=low_cardinality_indices)
encoder.fit(X_train)
X_train = encoder.transform(X_train)
X_valid = encoder.transform(X_valid)
X_test = encoder.transform(X_test)

# Cast the fully encoded frames to float32.
X_train = X_train.astype(np.float32)
X_valid = X_valid.astype(np.float32)
X_test = X_test.astype(np.float32)

In [9]:
# XGBoost baseline on the encoded features: a regressor for regression tasks,
# a classifier otherwise. Both use 1000 estimators with early stopping after
# 20 rounds, evaluated on the validation split.
if params['problem_type'] == 'regression':
    from xgboost import XGBRegressor
    model_xgb = XGBRegressor(n_estimators=1000, early_stopping_rounds=20)
else:
    from xgboost import XGBClassifier
    model_xgb = XGBClassifier(n_estimators=1000, early_stopping_rounds=20)

# Optional class balancing (disabled): sample_weight=calculate_sample_weights(y_train)
# and sample_weight_eval_set=[calculate_sample_weights(y_valid)].
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
)

# Point predictions for regression, class probabilities for classification.
preds_xgb = (
    model_xgb.predict(X_test)
    if params['problem_type'] == 'regression'
    else model_xgb.predict_proba(X_test)
)

[0]	validation_0-mlogloss:0.74545
[1]	validation_0-mlogloss:0.54790
[2]	validation_0-mlogloss:0.42437
[3]	validation_0-mlogloss:0.34345
[4]	validation_0-mlogloss:0.29068
[5]	validation_0-mlogloss:0.25474
[6]	validation_0-mlogloss:0.23194
[7]	validation_0-mlogloss:0.21750
[8]	validation_0-mlogloss:0.20821
[9]	validation_0-mlogloss:0.20224
[10]	validation_0-mlogloss:0.19929
[11]	validation_0-mlogloss:0.19864
[12]	validation_0-mlogloss:0.19976
[13]	validation_0-mlogloss:0.20059
[14]	validation_0-mlogloss:0.20224
[15]	validation_0-mlogloss:0.20517
[16]	validation_0-mlogloss:0.20575
[17]	validation_0-mlogloss:0.20623
[18]	validation_0-mlogloss:0.20773
[19]	validation_0-mlogloss:0.20963
[20]	validation_0-mlogloss:0.21130
[21]	validation_0-mlogloss:0.21192
[22]	validation_0-mlogloss:0.21233
[23]	validation_0-mlogloss:0.21344
[24]	validation_0-mlogloss:0.21629
[25]	validation_0-mlogloss:0.21784
[26]	validation_0-mlogloss:0.21867
[27]	validation_0-mlogloss:0.21993
[28]	validation_0-mlogloss:0.2

In [10]:
# CatBoost baseline on the imputed, un-encoded frames; the categorical column
# positions are passed to CatBoost through the Pool objects.
if params['problem_type'] == 'regression':
    from catboost import CatBoostRegressor, Pool

    model_catboost = CatBoostRegressor(n_estimators=1000,
                                       early_stopping_rounds=20)
else:
    from catboost import CatBoostClassifier, Pool

    model_catboost = CatBoostClassifier(n_estimators=1000,
                                        early_stopping_rounds=20)

# Optional class balancing (disabled): weight=calculate_sample_weights(y_train)
train_data = Pool(
    data=X_train_raw,
    label=y_train,
    cat_features=categorical_feature_indices,
)

# Optional class balancing (disabled): weight=calculate_sample_weights(y_valid)
eval_data = Pool(
    data=X_valid_raw,
    label=y_valid,
    cat_features=categorical_feature_indices,
)

model_catboost.fit(X=train_data, eval_set=eval_data)

# Point predictions for regression, class probabilities for classification.
if params['problem_type'] == 'regression':
    preds_catboost = model_catboost.predict(X_test_raw)
else:
    preds_catboost = model_catboost.predict_proba(X_test_raw)


Learning rate set to 0.109938
0:	learn: 0.9182557	test: 0.9188767	best: 0.9188767 (0)	total: 130ms	remaining: 2m 10s
1:	learn: 0.7869023	test: 0.7895294	best: 0.7895294 (1)	total: 184ms	remaining: 1m 31s
2:	learn: 0.6835755	test: 0.6883721	best: 0.6883721 (2)	total: 271ms	remaining: 1m 30s
3:	learn: 0.6018571	test: 0.6085947	best: 0.6085947 (3)	total: 333ms	remaining: 1m 22s
4:	learn: 0.5353552	test: 0.5434583	best: 0.5434583 (4)	total: 389ms	remaining: 1m 17s
5:	learn: 0.4800435	test: 0.4894480	best: 0.4894480 (5)	total: 458ms	remaining: 1m 15s
6:	learn: 0.4340411	test: 0.4449823	best: 0.4449823 (6)	total: 524ms	remaining: 1m 14s
7:	learn: 0.3956256	test: 0.4077597	best: 0.4077597 (7)	total: 585ms	remaining: 1m 12s
8:	learn: 0.3620414	test: 0.3752527	best: 0.3752527 (8)	total: 660ms	remaining: 1m 12s
9:	learn: 0.3337102	test: 0.3481237	best: 0.3481237 (9)	total: 721ms	remaining: 1m 11s
10:	learn: 0.3094077	test: 0.3252190	best: 0.3252190 (10)	total: 816ms	remaining: 1m 13s
11:	learn: 

In [11]:
# Score every model on the held-out test split with the metrics that match
# params['problem_type'], printing one block of results per model.
model_predictions = {
    'GRANDE': preds_grande,
    'XGB': preds_xgb,
    'CatBoost': preds_catboost,
}

for model_name, model_preds in model_predictions.items():
    if params['problem_type'] == 'binary':
        # Column 1 holds the positive-class probability; rounding it yields hard labels.
        hard_labels = np.round(model_preds[:, 1])
        accuracy = sklearn.metrics.accuracy_score(y_test, hard_labels)
        f1_score = sklearn.metrics.f1_score(y_test, hard_labels, average='macro')
        roc_auc = sklearn.metrics.roc_auc_score(
            y_test, model_preds[:, 1], average='macro', multi_class='ovo'
        )
    elif params['problem_type'] == 'multiclass':
        hard_labels = np.argmax(model_preds, axis=1)
        accuracy = sklearn.metrics.accuracy_score(y_test, hard_labels)
        f1_score = sklearn.metrics.f1_score(y_test, hard_labels, average='macro')
        # Label set is derived from this model's own probability matrix
        # (previously every model reused preds_grande's column count).
        roc_auc = sklearn.metrics.roc_auc_score(
            y_test,
            model_preds,
            average='macro',
            multi_class='ovo',
            labels=list(range(model_preds.shape[1])),
        )
    else:
        # Regression: score the raw predictions. They were previously rounded
        # to the nearest integer first, which distorts MAE and R2.
        mean_absolute_error = sklearn.metrics.mean_absolute_error(y_test, model_preds)
        r2_score = sklearn.metrics.r2_score(y_test, model_preds)

        print(f'MAE {model_name}:', mean_absolute_error)
        print(f'R2 Score {model_name}:', r2_score)
        print('\n')
        continue

    print(f'Accuracy {model_name}:', accuracy)
    print(f'F1 Score {model_name}:', f1_score)
    print(f'ROC AUC {model_name}:', roc_auc)
    print('\n')

Accuracy GRANDE: 0.9570871261378413
F1 Score GRANDE: 0.32602436323366557
ROC AUC GRANDE: 0.487305056710775


Accuracy XGB: 0.9570871261378413
F1 Score XGB: 0.32602436323366557
ROC AUC XGB: 0.47009195809703846


Accuracy CatBoost: 0.9570871261378413
F1 Score CatBoost: 0.32602436323366557
ROC AUC CatBoost: 0.4693427063642092


