In [28]:
import pandas as pd
from datacleaner import autoclean
from concurrent.futures import ProcessPoolExecutor

from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor

In [33]:
def automl(url, target, metric, baseline):
    header = 0 if isinstance(target, str) else None
    df = pd.read_csv(url, sep=None, engine='python', header=header)
    df = autoclean(df)

    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    try:
        model = TPOTClassifier(generations=3, population_size=50, verbosity=0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    except:
        model = TPOTRegressor(generations=3, population_size=50, verbosity=0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    metric_func = accuracy_score if metric == 'acc' else f1_score 
    try:
        result = metric_func(y_test, y_pred)
    except:
        result = metric_func(y_test, y_pred, average='micro')
    
    name = url.split('/')[-1]
    print(f'dataset: {name} - baseline: {baseline} - metric: {metric} - automl result: {result}')

In [35]:
datasets = [
    ('https://www.openml.org/data/get_csv/50/dataset_50_tic-tac-toe.arff', 'Class', 'f1', 1),
    ('https://www.openml.org/data/get_csv/1586223/php50jXam', 'Class', 'f1', 0.9964),
    ('https://www.openml.org/data/get_csv/16787463/phpZrCzJR', 'Target', 'f1', 0.9882),
    ('https://www.openml.org/data/get_csv/53247/vineyard.arff', 'binaryClass', 'f1', 0.8655),
    ('https://www.openml.org/data/get_csv/61/dataset_61_iris.arff', 'class', 'f1', 0.9599),
    ('http://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data', 0, 'acc', 0.903),
    ('http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', 13, 'acc', 0.900),
    ('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data', 34, 'acc', 0.987),
    ('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', 10, 'acc', 0.983),
    ('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', 0, 'acc', 0.762),
]

with ProcessPoolExecutor(max_workers=3) as pool:
    for url, target, metric, baseline in datasets:
        pool.submit(automl, url, target, metric, baseline)

dataset: php50jXam - baseline: 0.9964 - metric: f1 - automl result: 1.0
dataset: dataset_50_tic-tac-toe.arff - baseline: 1 - metric: f1 - automl result: 1.0
dataset: vineyard.arff - baseline: 0.8655 - metric: f1 - automl result: 1.0
dataset: dataset_61_iris.arff - baseline: 0.9599 - metric: f1 - automl result: 1.0
dataset: hepatitis.data - baseline: 0.903 - metric: acc - automl result: 0.8064516129032258
dataset: processed.cleveland.data - baseline: 0.9 - metric: acc - automl result: 0.5573770491803278
dataset: breast-cancer-wisconsin.data - baseline: 0.983 - metric: acc - automl result: 0.9714285714285714
dataset: breast-cancer.data - baseline: 0.762 - metric: acc - automl result: 0.6896551724137931
dataset: phpZrCzJR - baseline: 0.9882 - metric: f1 - automl result: 0.996512207274539


In [36]:
automl('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data', 34, 'acc', 0.987)

dataset: ionosphere.data - baseline: 0.987 - metric: acc - automl result: 0.9295774647887324
