In [1]:
from pypekit import Task
import pandas as pd


class IrisLoader(Task):
    """Pipeline entry task that produces the Iris dataset as a DataFrame.

    The class-level ``input_types`` / ``output_types`` tags are read by
    pypekit (presumably to decide which tasks may be chained together).
    """
    input_types = ["source"]
    output_types = ["raw"]

    def run(self, _):
        """Load Iris and return its features plus an integer ``target`` column.

        The positional argument is ignored; this task has no upstream data.
        """
        from sklearn.datasets import load_iris

        bunch = load_iris()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        # Append the class labels as the last column.
        return features.assign(target=bunch.target)


class TrainTestSplitter(Task):
    """Mark each row of the raw frame as training (1) or test (0) data.

    The two partitions are concatenated back into a single frame with a fresh
    0..n-1 index and an added ``train`` indicator column, so downstream tasks
    can fit on ``train == 1`` rows and evaluate on ``train == 0`` rows.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Forwarded to ``sklearn.model_selection.train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to make the split
        reproducible across kernel restarts; ``None`` keeps the original
        unseeded behaviour.
    """
    input_types = ["raw"]
    output_types = ["split"]

    def __init__(self, test_size=0.2, random_state=None):
        self.test_size = test_size
        self.random_state = random_state

    def run(self, df):
        """Return ``df`` with rows shuffled into train/test and a ``train`` flag."""
        from sklearn.model_selection import train_test_split

        train_df, test_df = train_test_split(
            df, test_size=self.test_size, random_state=self.random_state)
        # .assign() builds new frames instead of writing a column into the
        # objects returned by train_test_split, which may be views of `df`
        # and would otherwise trigger pandas' SettingWithCopyWarning.
        return pd.concat(
            [train_df.assign(train=1), test_df.assign(train=0)],
            ignore_index=True,
        )


class Scaler(Task):
    """Base task that rescales feature columns using a train-fitted scaler.

    Subclasses supply the concrete scaler through :meth:`get_scaler`. Every
    column except ``target`` and ``train`` is treated as a feature.
    """
    input_types = ["split"]
    output_types = ["processed"]

    def run(self, df):
        """Fit the scaler on training rows only, then transform all rows.

        Returns a new DataFrame holding the scaled features (original column
        names) followed by the untouched ``target`` and ``train`` columns.
        """
        X = df.drop(columns=['target', 'train'])
        # Fit on the training partition only so test rows do not leak into
        # the scaling statistics.
        X_train = X[df['train'] == 1]

        scaler = self.get_scaler()
        scaler.fit(X_train)

        # Keep the input's index: the label columns below are attached by
        # index alignment, so a fresh RangeIndex would yield NaN/misassigned
        # labels whenever `df` does not carry a default 0..n-1 index.
        scaled_df = pd.DataFrame(
            data=scaler.transform(X), columns=X.columns, index=X.index)
        scaled_df['target'] = df['target']
        scaled_df['train'] = df['train']

        return scaled_df

    def get_scaler(self):
        """Return an unfitted scaler exposing ``fit`` and ``transform``."""
        raise NotImplementedError("Subclasses should implement this method.")


class MinMaxScaler(Scaler):
    """Scaler task backed by scikit-learn's ``MinMaxScaler``."""

    def get_scaler(self):
        """Return a new, unfitted ``sklearn.preprocessing.MinMaxScaler``."""
        # Accessed through the module so the sklearn class is not confused
        # with this task class of the same name.
        from sklearn import preprocessing
        return preprocessing.MinMaxScaler()


class StandardScaler(Scaler):
    """Scaler task backed by scikit-learn's ``StandardScaler``."""

    def get_scaler(self):
        """Return a new, unfitted ``sklearn.preprocessing.StandardScaler``."""
        # Accessed through the module so the sklearn class is not confused
        # with this task class of the same name.
        from sklearn import preprocessing
        return preprocessing.StandardScaler()


class PCA(Task):
    """Task that replaces the feature columns with principal components.

    Keyword arguments given to the constructor are forwarded unchanged to
    ``sklearn.decomposition.PCA``. Every column except ``target`` and
    ``train`` is treated as a feature.
    """
    input_types = ["split", "processed"]
    output_types = ["processed"]

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def run(self, df):
        """Fit PCA on training rows only, then project all rows.

        Returns a new DataFrame with columns ``PC1``, ``PC2``, ... followed by
        the untouched ``target`` and ``train`` columns.
        """
        from sklearn.decomposition import PCA

        X = df.drop(columns=['target', 'train'])
        # Fit on the training partition only so test rows do not influence
        # the learned components.
        X_train = X[df['train'] == 1]

        pca = PCA(**self.kwargs)
        pca.fit(X_train)

        X_pca = pca.transform(X)
        component_names = [f'PC{i+1}' for i in range(X_pca.shape[1])]
        # Keep the input's index: the label columns below are attached by
        # index alignment, so a fresh RangeIndex would yield NaN/misassigned
        # labels whenever `df` does not carry a default 0..n-1 index.
        pca_df = pd.DataFrame(
            data=X_pca, columns=component_names, index=X.index)
        pca_df['target'] = df['target']
        pca_df['train'] = df['train']

        return pca_df


class Classifier(Task):
    """Base task that fits a classifier on training rows and predicts all rows.

    Subclasses supply the estimator through :meth:`get_classifier`. Every
    column except ``target`` and ``train`` is used as a feature; that includes
    a ``predicted`` column left behind by an upstream classifier, if present.
    """
    input_types = ["split", "processed"]
    output_types = ["processed", "predicted"]

    def run(self, df):
        """Return a copy of ``df`` whose ``predicted`` column holds the predictions.

        The input frame is left unmodified. Writing into it in place would
        leak this classifier's predictions into any other consumer of the same
        object (for example, if an executor reuses one upstream output for
        several pipelines).
        """
        X = df.drop(columns=['target', 'train'])
        y = df['target']
        is_train = df['train'] == 1
        X_train = X[is_train]
        y_train = y[is_train]

        classifier = self.get_classifier()
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X)
        # assign() adds the column (or overwrites an existing one in place in
        # the column order) on a new DataFrame rather than on the caller's.
        return df.assign(predicted=y_pred)

    def get_classifier(self):
        """Return an unfitted estimator exposing ``fit`` and ``predict``."""
        # This hook was previously misnamed ``get_scaler``, so the base class
        # never declared the method that run() actually calls.
        raise NotImplementedError("Subclasses should implement this method.")


class LogisticRegression(Classifier):
    """Classifier task backed by scikit-learn's ``LogisticRegression``."""

    def __init__(self, **kwargs):
        # Stored verbatim; handed to the estimator when it is constructed.
        self.kwargs = kwargs

    def get_classifier(self):
        """Build a new, unfitted estimator from the stored keyword arguments."""
        from sklearn import linear_model
        estimator = linear_model.LogisticRegression(**self.kwargs)
        return estimator


class RandomForestClassifier(Classifier):
    """Classifier task backed by scikit-learn's ``RandomForestClassifier``."""

    def __init__(self, **kwargs):
        # Stored verbatim; handed to the estimator when it is constructed.
        self.kwargs = kwargs

    def get_classifier(self):
        """Build a new, unfitted estimator from the stored keyword arguments."""
        from sklearn import ensemble
        estimator = ensemble.RandomForestClassifier(**self.kwargs)
        return estimator


class SVC(Classifier):
    """Classifier task backed by scikit-learn's ``SVC``."""

    def __init__(self, **kwargs):
        # Stored verbatim; handed to the estimator when it is constructed.
        self.kwargs = kwargs

    def get_classifier(self):
        """Build a new, unfitted estimator from the stored keyword arguments."""
        from sklearn import svm
        estimator = svm.SVC(**self.kwargs)
        return estimator


class Evaluator(Task):
    """Terminal task that scores predictions on the held-out rows."""
    input_types = ["predicted"]
    output_types = ["sink"]

    def run(self, df):
        """Return the share of test rows whose ``predicted`` equals ``target``.

        Test rows are those with ``train == 0``.
        """
        is_test = df['train'] == 0
        held_out = df[is_test]
        hits = held_out['target'] == held_out['predicted']
        # Mean of a boolean Series is the fraction of True values (accuracy).
        return hits.mean()

In [2]:
from pypekit import Repository, CachedExecutor

# Register each task instance under a short name. These names are what show up
# in the `tasks` list of every pipeline printed below.
repository = Repository([
    ("data_loader", IrisLoader()),
    ("train_test_splitter", TrainTestSplitter()),
    ("minmax_scaler", MinMaxScaler()),
    ("standard_scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("logistic_regression", LogisticRegression(max_iter=200)),
    ("random_forest", RandomForestClassifier(n_estimators=100)),
    ("svc", SVC(kernel='linear')),
    ("evaluator", Evaluator()),
])

# NOTE(review): presumably build_pipelines() enumerates every task chain that
# the input_types/output_types tags allow — confirm against pypekit docs.
# `pipeline_dict` is reused by the executor cell further down.
pipeline_dict = repository.build_pipelines()
for pipeline in pipeline_dict.values():
    print(pipeline)

Pipeline(id=436faec5de2c46f0861a55e47ba4ea6d, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator'])
Pipeline(id=47fd0e67368b4384ac1697b2002a771a, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator'])
Pipeline(id=786e5e03fcd74d2e8e961d86d157782b, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator'])
Pipeline(id=bb3f1c5f01564596825143ae9e5afb6f, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator'])
Pipeline(id=d8c05822ae2f4070943665dc510d9c02, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'evaluator'])
Pipeline(id=8d2fe2f1a6fe4e18a33699dc3d063f49, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'random_forest', 'logistic_regression', 'svc', 'evalu

In [3]:
# Execute every pipeline built above. With verbose=True the executor
# presumably emits the per-pipeline "completed" lines shown in the output.
# `results` is consumed by the reporting cell that follows.
executor = CachedExecutor(pipeline_dict, verbose=True)
results = executor.run()

Pipeline 436faec5de2c46f0861a55e47ba4ea6d completed.
Pipeline 47fd0e67368b4384ac1697b2002a771a completed.
Pipeline 786e5e03fcd74d2e8e961d86d157782b completed.
Pipeline bb3f1c5f01564596825143ae9e5afb6f completed.
Pipeline d8c05822ae2f4070943665dc510d9c02 completed.
Pipeline 8d2fe2f1a6fe4e18a33699dc3d063f49 completed.
Pipeline e9d03a4552e74c99a9c09bd5bb45bb9d completed.
Pipeline 8b9c276567e246f0b57727da4428d624 completed.
Pipeline a127428762344a45b032941180054b92 completed.
Pipeline 5b667608169246a9866b618789c9e732 completed.
Pipeline ccc8251d41e94a5c98140731c48e7d36 completed.
Pipeline 0240efbeffce4afd86657b094fa6d51f completed.
Pipeline c4f8eb1f81ad49b48fdb6f6aeb2e0fe9 completed.
Pipeline f98540880e404162a912da65d2b0e16b completed.
Pipeline a68d2fac00bc430c8346ffea61876577 completed.
Pipeline 2292da5ed69d42b49212a961285e10a3 completed.
Pipeline 12451ce147584d5494a2275298122959 completed.
Pipeline 528bb8747257432e880260ba56a674f6 completed.
Pipeline ad3ec0cd9d6a46408190d6ac21db9e42 comp

In [5]:
# Print one record per pipeline; as the output shows, each record carries
# 'pipeline_id', 'output' (the Evaluator's accuracy), 'runtime' and 'tasks'.
for result in results.values():
    print(result)

{'pipeline_id': '436faec5de2c46f0861a55e47ba4ea6d', 'output': np.float64(0.9), 'runtime': 2.896575700000001, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator']}
{'pipeline_id': '47fd0e67368b4384ac1697b2002a771a', 'output': np.float64(0.9), 'runtime': 2.8932545000000007, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator']}
{'pipeline_id': '786e5e03fcd74d2e8e961d86d157782b', 'output': np.float64(0.9), 'runtime': 2.071262300000001, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator']}
{'pipeline_id': 'bb3f1c5f01564596825143ae9e5afb6f', 'output': np.float64(0.9), 'runtime': 1.9994754000000001, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator']}
{'pipeline_id': 'd8c05822ae2f4070943665dc510d9c02', 'o