In [24]:
from pypekit import Task
import pandas as pd


class IrisLoader(Task):
    """Entry-point task that loads the Iris dataset as a DataFrame.

    Declares ``"source"`` as its input type and ``"raw"`` as its output type.
    """

    input_types = ["source"]
    output_types = ["raw"]

    def run(self, _):
        """Return the Iris features plus a ``target`` column.

        The single positional argument is ignored: this task generates its
        data itself instead of transforming an upstream result.
        """
        from sklearn.datasets import load_iris

        bunch = load_iris()
        features = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
        # Append the class labels as the last column.
        return features.assign(target=bunch.target)


class TrainTestSplitter(Task):
    """Mark each row as belonging to the train or the test partition.

    Consumes a ``"raw"`` frame and emits a ``"split"`` frame: the same rows
    (shuffled by the split, re-indexed from 0) with an extra integer ``train``
    column that is 1 for training rows and 0 for test rows.
    """

    input_types = ["raw"]
    output_types = ["split"]

    def run(self, df):
        """Split ``df`` 80/20 with a fixed seed and return the flagged frame.

        Args:
            df: DataFrame to partition; it is not modified.

        Returns:
            A new DataFrame with the training rows first, then the test rows,
            a fresh 0..n-1 index, and the added ``train`` flag column.
        """
        from sklearn.model_selection import train_test_split

        train_df, test_df = train_test_split(
            df, test_size=0.2, random_state=42)
        # The two parts are selections taken from `df`; writing a column onto
        # them directly can raise pandas' SettingWithCopyWarning. `assign`
        # builds new frames instead, leaving the selections and `df` untouched.
        train_df = train_df.assign(train=1)
        test_df = test_df.assign(train=0)
        return pd.concat([train_df, test_df], ignore_index=True)


class Scaler(Task):
    """Base task that rescales the feature columns of a ``"split"`` frame.

    The scaler is fitted on the training rows only (``train == 1``) and then
    applied to every row, so test rows do not influence the fitted statistics.
    Subclasses supply the concrete scaler through :meth:`get_scaler`.
    """

    input_types = ["split"]
    output_types = ["processed"]

    def run(self, df):
        """Return a new frame with scaled features plus ``target``/``train``.

        Args:
            df: DataFrame containing feature columns together with ``target``
                and ``train`` columns.

        Returns:
            A DataFrame with the same feature column names and the same index
            as ``df``, followed by the ``target`` and ``train`` columns.
        """
        X = df.drop(columns=['target', 'train'])
        X_train = X[df['train'] == 1]

        scaler = self.get_scaler()
        scaler.fit(X_train)

        X_scaled = scaler.transform(X)
        # Keep the input's index: the column assignments below align on index
        # labels, so a fresh 0..n-1 index would silently produce NaN (or
        # mismatched) targets whenever `df` is not default-indexed.
        scaled_df = pd.DataFrame(
            data=X_scaled, columns=X.columns, index=X.index)
        scaled_df['target'] = df['target']
        scaled_df['train'] = df['train']

        return scaled_df

    def get_scaler(self):
        """Return an unfitted scaler exposing ``fit`` and ``transform``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")


class MinMaxScaler(Scaler):
    """Scaler task backed by scikit-learn's ``MinMaxScaler``."""

    def get_scaler(self):
        """Return a new, unfitted scikit-learn ``MinMaxScaler``."""
        # Aliased so the library class is not confused with this task class,
        # which carries the same name.
        from sklearn.preprocessing import MinMaxScaler as SkMinMaxScaler

        scaler = SkMinMaxScaler()
        return scaler


class StandardScaler(Scaler):
    """Scaler task backed by scikit-learn's ``StandardScaler``."""

    def get_scaler(self):
        """Return a new, unfitted scikit-learn ``StandardScaler``."""
        # Aliased so the library class is not confused with this task class,
        # which carries the same name.
        from sklearn.preprocessing import StandardScaler as SkStandardScaler

        scaler = SkStandardScaler()
        return scaler


class PCA(Task):
    """Task that replaces the feature columns with principal components.

    Accepts ``"split"`` or ``"processed"`` frames and emits a ``"processed"``
    frame. The projection is fitted on the training rows only
    (``train == 1``) and then applied to every row.
    """

    input_types = ["split", "processed"]
    output_types = ["processed"]

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to scikit-learn's ``PCA``."""
        self.kwargs = kwargs

    def run(self, df):
        """Return a new frame of ``PC1..PCk`` columns plus ``target``/``train``.

        Args:
            df: DataFrame containing feature columns together with ``target``
                and ``train`` columns.

        Returns:
            A DataFrame indexed like ``df`` whose features are the projected
            components, followed by the ``target`` and ``train`` columns.
        """
        X = df.drop(columns=['target', 'train'])
        X_train = X[df['train'] == 1]

        # Aliased so the library class is not confused with this task class.
        from sklearn.decomposition import PCA as SkPCA
        pca = SkPCA(**self.kwargs)
        pca.fit(X_train)

        X_pca = pca.transform(X)
        component_names = [f'PC{i+1}' for i in range(X_pca.shape[1])]
        # Keep the input's index: the column assignments below align on index
        # labels, so a fresh 0..n-1 index would silently produce NaN (or
        # mismatched) targets whenever `df` is not default-indexed.
        pca_df = pd.DataFrame(
            data=X_pca, columns=component_names, index=X.index)
        pca_df['target'] = df['target']
        pca_df['train'] = df['train']

        return pca_df


class Classifier(Task):
    """Base task that fits a classifier and appends its predictions.

    Accepts ``"split"`` or ``"processed"`` frames. The model is fitted on the
    training rows only (``train == 1``) and then predicts every row; the
    predictions are stored in a ``predicted`` column. Subclasses supply the
    concrete estimator through :meth:`get_classifier`.
    """

    input_types = ["split", "processed"]
    output_types = ["processed", "predicted"]

    def run(self, df):
        """Return a copy of ``df`` with a ``predicted`` column.

        Every column other than ``target`` and ``train`` is used as a feature,
        including a ``predicted`` column left by an earlier classifier.

        Args:
            df: DataFrame containing feature columns together with ``target``
                and ``train`` columns. It is not modified.

        Returns:
            A new DataFrame equal to ``df`` with ``predicted`` added, or
            overwritten if it already existed.
        """
        X = df.drop(columns=['target', 'train'])
        y = df['target']
        is_train = df['train'] == 1

        classifier = self.get_classifier()
        classifier.fit(X[is_train], y[is_train])

        y_pred = classifier.predict(X)
        # Build a new frame instead of writing the column into `df`: the input
        # belongs to the caller and may be reused for other pipelines, so
        # mutating it would leak this model's predictions into those runs.
        return df.assign(predicted=y_pred)

    def get_classifier(self):
        """Return an unfitted estimator exposing ``fit`` and ``predict``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")


class LogisticRegression(Classifier):
    """Classifier task backed by scikit-learn's ``LogisticRegression``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted estimator built from the stored kwargs."""
        # Aliased so the library class is not confused with this task class.
        from sklearn.linear_model import (
            LogisticRegression as SkLogisticRegression,
        )

        estimator = SkLogisticRegression(**self.kwargs)
        return estimator


class RandomForestClassifier(Classifier):
    """Classifier task backed by scikit-learn's ``RandomForestClassifier``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted estimator built from the stored kwargs."""
        # Aliased so the library class is not confused with this task class.
        from sklearn.ensemble import (
            RandomForestClassifier as SkRandomForestClassifier,
        )

        estimator = SkRandomForestClassifier(**self.kwargs)
        return estimator


class SVC(Classifier):
    """Classifier task backed by scikit-learn's ``SVC``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted estimator built from the stored kwargs."""
        # Aliased so the library class is not confused with this task class.
        from sklearn.svm import SVC as SkSVC

        estimator = SkSVC(**self.kwargs)
        return estimator


class Evaluator(Task):
    """Terminal task that scores predictions on the held-out rows.

    Consumes a ``"predicted"`` frame and emits a ``"sink"`` result.
    """

    input_types = ["predicted"]
    output_types = ["sink"]

    def run(self, df):
        """Return the accuracy over the test rows (``train == 0``).

        Accuracy is the fraction of test rows whose ``predicted`` value equals
        their ``target`` value.
        """
        is_test = df['train'] == 0
        held_out = df[is_test]
        hits = held_out['target'] == held_out['predicted']
        return hits.mean()

In [25]:
from pypekit import Repository, CachedExecutor

# Register every task under a unique name. The pipelines printed below refer
# to tasks by these names.
repository = Repository([
    ("data_loader", IrisLoader()),
    ("train_test_splitter", TrainTestSplitter()),
    ("minmax_scaler", MinMaxScaler()),
    ("standard_scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("logistic_regression", LogisticRegression(max_iter=200)),
    # Seed the forest so its accuracy is reproducible across re-runs, matching
    # the fixed seed already used for the train/test split.
    ("random_forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("svc", SVC(kernel='linear')),
    ("evaluator", Evaluator()),
])

# build_pipelines() returns a mapping whose values are the Pipeline objects
# assembled from the registered tasks; print each one for inspection.
pipeline_dict = repository.build_pipelines()
for pipeline in pipeline_dict.values():
    print(pipeline)

Pipeline(id=d3e1b06fddad4443af19c038458474e7, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator'])
Pipeline(id=dd782600c44f4055982cc4cb6d88b50b, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator'])
Pipeline(id=d999a5d962af4110a92ff5371575cf2b, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator'])
Pipeline(id=8fddcc2b13894bc895f12918626ddfcd, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator'])
Pipeline(id=5e9aaa29eabc4195bc816d53580182e3, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'evaluator'])
Pipeline(id=e77a4661b0204ec8b00fedd8040336db, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'random_forest', 'logistic_regression', 'svc', 'evalu

In [26]:
# Execute every pipeline in `pipeline_dict`. With verbose=True the executor
# reports each pipeline as it completes.
executor = CachedExecutor(
    pipeline_dict,
    verbose=True,
)
results = executor.run()

Pipeline d3e1b06fddad4443af19c038458474e7 completed.
Pipeline dd782600c44f4055982cc4cb6d88b50b completed.
Pipeline d999a5d962af4110a92ff5371575cf2b completed.
Pipeline 8fddcc2b13894bc895f12918626ddfcd completed.
Pipeline 5e9aaa29eabc4195bc816d53580182e3 completed.
Pipeline e77a4661b0204ec8b00fedd8040336db completed.
Pipeline 7a74ee546ef242b1a37f2e6d7924e3de completed.
Pipeline b637f49f64f04c8e95def5ab3fd3560e completed.
Pipeline 92201c86e2c14da59931fe8ff85a8854 completed.
Pipeline 0ca59e9770524f7897f4e3a4efb9111d completed.
Pipeline 3275bba6727f42aeb536cb6034963f87 completed.
Pipeline 98c1f64c9c2842c7a86dbe92181fccda completed.
Pipeline 7553f040840944bea362b441a028af75 completed.
Pipeline 10c05ef32494406eaaa803d3bc9ed6e7 completed.
Pipeline ffb70ff25c0842d5a37765dcc310e4d8 completed.
Pipeline fc07505a01f7458095c430e7791e5d66 completed.
Pipeline 9f0a97c45e434fbaa6fe57fb5133fb5a completed.
Pipeline 0625a70014444040a23b75031ef801f2 completed.
Pipeline 1339c823b1894455bf76993e2ee49720 comp

In [27]:
# One summary line per executed pipeline: the evaluator's accuracy followed by
# the ordered task names that produced it.
for result in results.values():
    accuracy = result['output']
    task_names = result['tasks']
    print(f"Accuracy: {accuracy}, Tasks: {task_names}")

Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator']
Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator']
Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator']
Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator']
Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'evaluator']
Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'random_forest', 'logistic_regression', 'svc', 'evaluator']
Accuracy: 0.9666666666666667, Tasks: ['data_loader', 'train_test_splitter', 'minmax_scal