In [1]:
from pmlb import fetch_data
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from autofeat import AutoFeatClassifier
from sklearn.feature_selection import VarianceThreshold

# Random seed handed to autofeat_benchmark, which uses it as the
# random_state of the train/validation split.
SEED = 42

In [2]:
def autofeat_benchmark(dataset_name, seed, train_size=0.8, feateng_steps=2, n_jobs=1, verbose=1):
    """Benchmark AutoFeatClassifier on one PMLB dataset and return hold-out accuracy.

    The dataset is fetched with ``pmlb.fetch_data``, split into a stratified
    train/validation pair, filtered with ``VarianceThreshold(threshold=0.0)``
    (fitted on the training split only), and then used to fit an
    ``AutoFeatClassifier``. The fitted model is scored on the validation split.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``fetch_data``; the returned frame must contain a
        ``"target"`` column.
    seed : int
        ``random_state`` for ``train_test_split``. It is not forwarded to
        AutoFeat itself.
    train_size : float or int, default 0.8
        Forwarded to ``train_test_split``.
    feateng_steps : int, default 2
        Forwarded to ``AutoFeatClassifier``.
    n_jobs : int, default 1
        Forwarded to ``AutoFeatClassifier``. Exposed because the feature
        selection runs dominate the runtime when executed in a single process.
    verbose : int, default 1
        Forwarded to ``AutoFeatClassifier``.

    Returns
    -------
    float
        ``accuracy_score`` of the model's predictions on the validation split.
    """
    df = fetch_data(dataset_name)
    features = df.drop(columns=["target"])
    target = df["target"]

    X_train, X_valid, y_train, y_valid = train_test_split(
        features,
        target,
        train_size=train_size,
        stratify=target,
        random_state=seed,
    )

    # Fit the variance filter on the training split only, then apply the same
    # column mask to the validation split so no validation data leaks in.
    # Note: the filter returns plain arrays, so AutoFeat sees positional
    # feature names (x000, x001, ...) rather than the original column names.
    var_thresh = VarianceThreshold(threshold=0.0)
    X_train_var = var_thresh.fit_transform(X_train)
    X_valid_var = var_thresh.transform(X_valid)

    # Engineer/select features and train the final classifier on the training data.
    af_model = AutoFeatClassifier(verbose=verbose, feateng_steps=feateng_steps, n_jobs=n_jobs)
    af_model.fit(X_train_var, y_train)

    # Build the engineered features for the validation split and score them.
    X_valid_feat = af_model.transform(X_valid_var)
    y_pred = af_model.predict(X_valid_feat)
    return accuracy_score(y_valid, y_pred)

In [3]:
# Dataset names handed to autofeat_benchmark, one benchmark run per entry.
datasets = [
    "allbp",
    "Hill_Valley_with_noise",
    "Hill_Valley_without_noise",
    "adult",
    "allhyper",
    "breast_cancer",
]

# Accuracies are appended one at a time (same order as `datasets`) so that
# results already computed remain available if a later run fails.
autofeat_acc = []
for dataset_name in datasets:
    accuracy = autofeat_benchmark(dataset_name, SEED)
    autofeat_acc.append(accuracy)

# Report every dataset next to its accuracy once all runs have finished.
for dataset_name, accuracy in zip(datasets, autofeat_acc):
    print(f"Accuracy of autofeat on {dataset_name}:{accuracy}")

2025-12-26 16:06:10,364 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-12-26 16:06:10,365 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 17955 features.
2025-12-26 16:06:10,365 INFO: [AutoFeat] With 3017 data points this new feature matrix would use about 0.22 gb of space.
2025-12-26 16:06:10,369 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             27 features transformed

2025-12-26 16:06:12,963 INFO: [feateng] Generated 26 transformed features from 27 original features - done.
2025-12-26 16:06:12,981 INFO: [feateng] Step 2: first combination of features


[feateng]            1000/           1378 feature tuples combined

2025-12-26 16:06:13,816 INFO: [feateng] Generated 1288 feature combinations from 1378 original feature tuples - done.
2025-12-26 16:06:13,860 INFO: [feateng] Generated altogether 1404 new features in 2 steps
2025-12-26 16:06:13,861 INFO: [feateng] Removing correlated features, as well as additions at the highest level


[feateng]            1300/           1378 feature tuples combined

2025-12-26 16:06:13,986 INFO: [feateng] Generated a total of 1075 additional features


[featsel] Scaling data...

2025-12-26 16:06:15,694 INFO: [featsel] Feature selection run 1/5


done.


2025-12-26 16:13:08,244 INFO: [featsel] Feature selection run 2/5
2025-12-26 16:19:42,865 INFO: [featsel] Feature selection run 3/5
2025-12-26 16:26:21,312 INFO: [featsel] Feature selection run 4/5
2025-12-26 16:32:43,522 INFO: [featsel] Feature selection run 5/5
2025-12-26 16:39:10,762 INFO: [featsel] 11 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-12-26 16:39:10,766 INFO: [featsel] 7 features after correlation filtering
2025-12-26 16:39:13,938 INFO: [featsel] 5 features after noise filtering
2025-12-26 16:39:13,939 INFO: [AutoFeat] Computing 5 new features.


[AutoFeat]     3/    5 new features

2025-12-26 16:39:14,528 INFO: [AutoFeat]     5/    5 new features ...done.
2025-12-26 16:39:14,531 INFO: [AutoFeat] Final dataframe with 32 feature columns (5 new).
2025-12-26 16:39:14,531 INFO: [AutoFeat] Training final classification model.


[AutoFeat]     4/    5 new features

2025-12-26 16:39:15,805 INFO: [AutoFeat] Trained model: largest coefficients:
2025-12-26 16:39:15,806 INFO: [ 18.12918919 -15.35214854  -2.77704065]
2025-12-26 16:39:15,806 INFO: 1.255702 * x020*sqrt(x023)
2025-12-26 16:39:15,807 INFO: 0.728980 * sqrt(x023)*sqrt(x026)
2025-12-26 16:39:15,807 INFO: 0.151521 * sqrt(x019)*sqrt(x023)
2025-12-26 16:39:15,808 INFO: 0.147016 * x018*x019
2025-12-26 16:39:15,808 INFO: 0.033479 * sqrt(x000)*sqrt(x023)
2025-12-26 16:39:15,811 INFO: [AutoFeat] Final score: 0.8893
2025-12-26 16:39:15,813 INFO: [AutoFeat] Computing 5 new features.
2025-12-26 16:39:15,819 INFO: [AutoFeat]     5/    5 new features ...done.


[AutoFeat]     4/    5 new features

2025-12-26 16:39:17,001 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-12-26 16:39:17,002 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 245350 features.
2025-12-26 16:39:17,003 INFO: [AutoFeat] With 969 data points this new feature matrix would use about 0.95 gb of space.
2025-12-26 16:39:17,004 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/            100 features transformed

2025-12-26 16:39:15,728 INFO: [feateng] Generated 400 transformed features from 100 original features - done.
2025-12-26 16:39:15,731 INFO: [feateng] Step 2: first combination of features


[feateng]           46400/         124750 feature tuples combined

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[feateng]          124600/         124750 feature tuples combined

2025-12-26 16:39:50,415 INFO: [feateng] Generated 124550 feature combinations from 124750 original feature tuples - done.


[feateng]          124700/         124750 feature tuples combined

2025-12-26 16:39:51,053 INFO: [feateng] Generated altogether 124951 new features in 2 steps
2025-12-26 16:39:51,054 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-12-26 16:39:52,781 INFO: [feateng] Generated a total of 60066 additional features


[featsel] Scaling data...

2025-12-26 16:39:53,427 INFO: [featsel] Feature selection run 1/5


done.


2025-12-26 17:11:22,256 INFO: [featsel] Feature selection run 2/5
2025-12-26 17:43:50,046 INFO: [featsel] Feature selection run 3/5
2025-12-26 18:14:05,827 INFO: [featsel] Feature selection run 4/5
2025-12-26 18:45:06,383 INFO: [featsel] Feature selection run 5/5
2025-12-26 19:15:47,608 INFO: [featsel] 17 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-12-26 19:15:47,626 INFO: [featsel] 15 features after correlation filtering
2025-12-26 19:15:48,423 INFO: [featsel] 11 features after noise filtering
2025-12-26 19:15:48,445 INFO: [AutoFeat] Computing 11 new features.


[AutoFeat]    10/   11 new features

2025-12-26 19:15:50,011 INFO: [AutoFeat]    11/   11 new features ...done.
2025-12-26 19:15:50,061 INFO: [AutoFeat] Final dataframe with 111 feature columns (11 new).
2025-12-26 19:15:50,062 INFO: [AutoFeat] Training final classification model.
2025-12-26 19:15:50,292 INFO: [AutoFeat] Trained model: largest coefficients:
2025-12-26 19:15:50,294 INFO: [-25.60678152]
2025-12-26 19:15:50,302 INFO: 4.028814 * x014/x001
2025-12-26 19:15:50,308 INFO: 3.732620 * x012/x002
2025-12-26 19:15:50,310 INFO: 3.394317 * x085/x094
2025-12-26 19:15:50,311 INFO: 2.822139 * x014/x004
2025-12-26 19:15:50,313 INFO: 2.735492 * x010/x005
2025-12-26 19:15:50,314 INFO: 2.625879 * x085/x092
2025-12-26 19:15:50,315 INFO: 2.614704 * x087/x095
2025-12-26 19:15:50,316 INFO: 2.475281 * x095/x088
2025-12-26 19:15:50,317 INFO: 2.296650 * x012/x005
2025-12-26 19:15:50,318 INFO: 1.923587 * x086/x095
2025-12-26 19:15:50,318 INFO: 1.883476 * x086/x096
2025-12-26 19:15:50,325 INFO: [AutoFeat] Final score: 0.7079
2025-12-26

[AutoFeat]    10/   11 new features

2025-12-26 19:15:51,706 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-12-26 19:15:51,708 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 245350 features.
2025-12-26 19:15:51,708 INFO: [AutoFeat] With 969 data points this new feature matrix would use about 0.95 gb of space.
2025-12-26 19:15:51,712 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/            100 features transformed

2025-12-26 19:15:52,794 INFO: [feateng] Generated 400 transformed features from 100 original features - done.
2025-12-26 19:15:52,799 INFO: [feateng] Step 2: first combination of features


[feateng]           46500/         124750 feature tuples combined

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[feateng]          124600/         124750 feature tuples combined

2025-12-26 19:16:28,905 INFO: [feateng] Generated 124550 feature combinations from 124750 original feature tuples - done.


[feateng]          124700/         124750 feature tuples combined

2025-12-26 19:16:29,489 INFO: [feateng] Generated altogether 124951 new features in 2 steps
2025-12-26 19:16:29,490 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-12-26 19:16:31,442 INFO: [feateng] Generated a total of 54228 additional features


[featsel] Scaling data...

2025-12-26 19:16:32,157 INFO: [featsel] Feature selection run 1/5


done.


2025-12-26 19:48:15,513 INFO: [featsel] Feature selection run 2/5
2025-12-26 20:19:59,408 INFO: [featsel] Feature selection run 3/5
2025-12-26 20:53:33,598 INFO: [featsel] Feature selection run 4/5
2025-12-26 21:27:21,246 INFO: [featsel] Feature selection run 5/5
2025-12-26 22:00:37,627 INFO: [featsel] 392 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-12-26 22:00:37,857 INFO: [featsel] 15 features after correlation filtering
2025-12-26 22:00:38,821 INFO: [featsel] 15 features after noise filtering
2025-12-26 22:00:38,824 INFO: [AutoFeat] Computing 15 new features.


[AutoFeat]    14/   15 new features

2025-12-26 22:00:40,935 INFO: [AutoFeat]    15/   15 new features ...done.
2025-12-26 22:00:40,950 INFO: [AutoFeat] Final dataframe with 115 feature columns (15 new).
2025-12-26 22:00:40,951 INFO: [AutoFeat] Training final classification model.
2025-12-26 22:00:59,456 INFO: [AutoFeat] Trained model: largest coefficients:
2025-12-26 22:00:59,459 INFO: [-147.20274387]
2025-12-26 22:00:59,460 INFO: 20.533633 * x014/x000
2025-12-26 22:00:59,461 INFO: 18.590764 * x087/x099
2025-12-26 22:00:59,462 INFO: 12.935489 * x080/x099
2025-12-26 22:00:59,463 INFO: 10.931631 * x073/x098
2025-12-26 22:00:59,463 INFO: 10.026369 * x057/x098
2025-12-26 22:00:59,464 INFO: 9.559851 * x050/x098
2025-12-26 22:00:59,465 INFO: 8.982105 * x022/x000
2025-12-26 22:00:59,466 INFO: 8.844553 * x063/x098
2025-12-26 22:00:59,466 INFO: 8.785330 * x036/x099
2025-12-26 22:00:59,467 INFO: 8.637372 * x045/x098
2025-12-26 22:00:59,468 INFO: 8.432497 * x028/x098
2025-12-26 22:00:59,469 INFO: 7.319209 * x032/x099
2025-12-26 22:

[AutoFeat]    14/   15 new features

2025-12-26 22:01:00,817 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-12-26 22:01:00,819 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 4851 features.
2025-12-26 22:01:00,819 INFO: [AutoFeat] With 39073 data points this new feature matrix would use about 0.76 gb of space.
2025-12-26 22:01:00,821 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             14 features transformed

2025-12-26 22:01:02,001 INFO: [feateng] Generated 47 transformed features from 14 original features - done.
2025-12-26 22:01:02,013 INFO: [feateng] Step 2: first combination of features


[feateng]            1500/           1830 feature tuples combined

2025-12-26 22:01:04,173 INFO: [feateng] Generated 1798 feature combinations from 1830 original feature tuples - done.


[feateng]            1800/           1830 feature tuples combined

2025-12-26 22:01:04,676 INFO: [feateng] Generated altogether 1868 new features in 2 steps
2025-12-26 22:01:04,677 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-12-26 22:01:05,816 INFO: [feateng] Generated a total of 1279 additional features


[featsel] Scaling data...

2025-12-26 22:01:06,730 INFO: [featsel] Feature selection run 1/5


done.


2025-12-26 23:38:06,652 INFO: [featsel] Feature selection run 2/5
2025-12-27 01:14:57,733 INFO: [featsel] Feature selection run 3/5
2025-12-27 02:43:50,081 INFO: [featsel] Feature selection run 4/5
2025-12-27 04:17:36,758 INFO: [featsel] Feature selection run 5/5
2025-12-27 05:51:59,898 INFO: [featsel] 301 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-12-27 05:52:22,502 INFO: [featsel] 164 features after correlation filtering
2025-12-27 06:00:30,507 INFO: [featsel] 142 features after noise filtering
2025-12-27 06:00:30,532 INFO: [AutoFeat] Computing 139 new features.


[AutoFeat]   138/  139 new features

2025-12-27 06:00:47,126 INFO: [AutoFeat]   139/  139 new features ...done.
2025-12-27 06:00:47,176 INFO: [AutoFeat] Final dataframe with 153 feature columns (139 new).
2025-12-27 06:00:47,176 INFO: [AutoFeat] Training final classification model.
2025-12-27 06:01:26,137 INFO: [AutoFeat] Trained model: largest coefficients:
2025-12-27 06:01:26,139 INFO: [1.5746338e-20]
2025-12-27 06:01:26,163 INFO: [AutoFeat] Final score: 0.6948
2025-12-27 06:01:26,219 INFO: [AutoFeat] Computing 139 new features.
2025-12-27 06:01:26,327 INFO: [AutoFeat]   139/  139 new features ...done.


[AutoFeat]   138/  139 new features

2025-12-27 06:01:27,519 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-12-27 06:01:27,519 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 17955 features.
2025-12-27 06:01:27,520 INFO: [AutoFeat] With 3016 data points this new feature matrix would use about 0.22 gb of space.
2025-12-27 06:01:27,522 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             27 features transformed

2025-12-27 06:01:28,680 INFO: [feateng] Generated 26 transformed features from 27 original features - done.
2025-12-27 06:01:28,682 INFO: [feateng] Step 2: first combination of features


[feateng]             600/           1378 feature tuples combined

2025-12-27 06:01:29,315 INFO: [feateng] Generated 1304 feature combinations from 1378 original feature tuples - done.


[feateng]            1300/           1378 feature tuples combined

2025-12-27 06:01:29,325 INFO: [feateng] Generated altogether 1404 new features in 2 steps
2025-12-27 06:01:29,326 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-12-27 06:01:29,420 INFO: [feateng] Generated a total of 1076 additional features
2025-12-27 06:01:29,448 INFO: [featsel] Feature selection run 1/5


[featsel] Scaling data...done.


2025-12-27 06:09:01,534 INFO: [featsel] Feature selection run 2/5
2025-12-27 06:16:33,965 INFO: [featsel] Feature selection run 3/5
2025-12-27 06:24:10,050 INFO: [featsel] Feature selection run 4/5
2025-12-27 06:31:54,120 INFO: [featsel] Feature selection run 5/5
2025-12-27 06:39:30,438 INFO: [featsel] 14 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-12-27 06:39:30,446 INFO: [featsel] 13 features after correlation filtering
2025-12-27 06:39:36,446 INFO: [featsel] 3 features after noise filtering
2025-12-27 06:39:36,449 INFO: [AutoFeat] Computing 3 new features.


[AutoFeat]     1/    3 new features

2025-12-27 06:39:36,861 INFO: [AutoFeat]     3/    3 new features ...done.
2025-12-27 06:39:36,864 INFO: [AutoFeat] Final dataframe with 30 feature columns (3 new).
2025-12-27 06:39:36,865 INFO: [AutoFeat] Training final classification model.


[AutoFeat]     2/    3 new features

2025-12-27 06:39:37,379 INFO: [AutoFeat] Trained model: largest coefficients:
2025-12-27 06:39:37,380 INFO: [ 0.42167047 -0.15081509  0.18576533 -0.45664699]
2025-12-27 06:39:37,381 INFO: 0.527459 * x013*x018
2025-12-27 06:39:37,381 INFO: 0.143733 * x016*sqrt(x017)
2025-12-27 06:39:37,385 INFO: [AutoFeat] Final score: 0.9045
2025-12-27 06:39:37,388 INFO: [AutoFeat] Computing 3 new features.
2025-12-27 06:39:37,392 INFO: [AutoFeat]     3/    3 new features ...done.


[AutoFeat]     2/    3 new features

2025-12-27 06:39:38,410 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-12-27 06:39:38,411 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 2016 features.
2025-12-27 06:39:38,411 INFO: [AutoFeat] With 228 data points this new feature matrix would use about 0.00 gb of space.
2025-12-27 06:39:38,412 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/              9 features transformed

2025-12-27 06:39:37,853 INFO: [feateng] Generated 29 transformed features from 9 original features - done.
2025-12-27 06:39:37,855 INFO: [feateng] Step 2: first combination of features
2025-12-27 06:39:38,240 INFO: [feateng] Generated 693 feature combinations from 703 original feature tuples - done.
2025-12-27 06:39:38,243 INFO: [feateng] Generated altogether 729 new features in 2 steps
2025-12-27 06:39:38,244 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-12-27 06:39:38,264 INFO: [feateng] Generated a total of 556 additional features
2025-12-27 06:39:38,270 INFO: [featsel] Feature selection run 1/5


[featsel] Scaling data...done.        703 feature tuples combined


2025-12-27 06:39:43,841 INFO: [featsel] Feature selection run 2/5
2025-12-27 06:39:49,080 INFO: [featsel] Feature selection run 3/5
2025-12-27 06:39:54,399 INFO: [featsel] Feature selection run 4/5
2025-12-27 06:39:59,631 INFO: [featsel] Feature selection run 5/5
2025-12-27 06:40:05,078 INFO: [featsel] 9 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-12-27 06:40:05,081 INFO: [featsel] 6 features after correlation filtering
2025-12-27 06:40:05,156 INFO: [featsel] 2 features after noise filtering
2025-12-27 06:40:05,157 INFO: [AutoFeat] Computing 1 new features.


[AutoFeat]     0/    1 new features

2025-12-27 06:40:05,646 INFO: [AutoFeat]     1/    1 new features ...done.
2025-12-27 06:40:05,653 INFO: [AutoFeat] Final dataframe with 10 feature columns (1 new).
2025-12-27 06:40:05,654 INFO: [AutoFeat] Training final classification model.
2025-12-27 06:40:05,708 INFO: [AutoFeat] Trained model: largest coefficients:
2025-12-27 06:40:05,708 INFO: [-0.57036614]
2025-12-27 06:40:05,709 INFO: 0.031672 * x004*x005**3
2025-12-27 06:40:05,710 INFO: 0.004075 * x003
2025-12-27 06:40:05,711 INFO: [AutoFeat] Final score: 0.7412
2025-12-27 06:40:05,713 INFO: [AutoFeat] Computing 1 new features.
2025-12-27 06:40:05,715 INFO: [AutoFeat]     1/    1 new features ...done.


Accuracy of autofeat on allbp:0.8900662251655629
Accuracy of autofeat on Hill_Valley_with_noise:0.7037037037037037
Accuracy of autofeat on Hill_Valley_without_noise:1.0
Accuracy of autofeat on adult:0.693213225509264
Accuracy of autofeat on allhyper:0.9139072847682119
Accuracy of autofeat on breast_cancer:0.6724137931034483
