In [None]:
from pypekit import Task
import pandas as pd


class IrisLoader(Task):
    """Pipeline entry task: loads the iris dataset as a single DataFrame."""
    input_types = ["source"]
    output_types = ["raw"]

    def run(self, _):
        """Return the iris features plus a ``target`` column.

        The positional argument is ignored; the data comes from
        ``sklearn.datasets.load_iris``.
        """
        from sklearn import datasets

        bunch = datasets.load_iris()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        # Append the class labels as the last column.
        return features.assign(target=bunch.target)


class TrainTestSplitter(Task):
    """Marks each row as train or test and returns one combined frame.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Forwarded to ``sklearn.model_selection.train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to make the split
        reproducible; ``None`` keeps the original unseeded behaviour.
    """
    input_types = ["raw"]
    output_types = ["split"]

    def __init__(self, test_size=0.2, random_state=None):
        self.test_size = test_size
        self.random_state = random_state

    def run(self, df):
        """Split ``df`` and return it with a ``train`` flag column.

        The returned frame holds the training rows (``train == 1``) followed
        by the test rows (``train == 0``) under a fresh 0..n-1 index.
        """
        from sklearn.model_selection import train_test_split

        train_df, test_df = train_test_split(
            df, test_size=self.test_size, random_state=self.random_state
        )
        # assign() returns new frames; writing a column directly onto the
        # frames returned by train_test_split can trigger pandas'
        # SettingWithCopyWarning.
        return pd.concat(
            [train_df.assign(train=1), test_df.assign(train=0)],
            ignore_index=True,
        )


class Scaler(Task):
    """Base task that scales the feature columns of a split frame.

    The scaler is fitted on the training rows only (``train == 1``) and then
    applied to every row. Subclasses supply the scaler via ``get_scaler``.
    """
    input_types = ["split"]
    output_types = ["processed"]

    def run(self, df):
        """Return a new frame with scaled features plus ``target`` and ``train``.

        ``df`` must contain ``target`` and ``train`` columns; every other
        column is treated as a feature.
        """
        X = df.drop(columns=['target', 'train'])
        X_train = X[df['train'] == 1]

        scaler = self.get_scaler()
        scaler.fit(X_train)

        # Keep the input's index: the label columns below are copied by index
        # alignment, so a fresh RangeIndex would misalign them (or produce
        # NaN) whenever ``df`` is not indexed 0..n-1.
        scaled_df = pd.DataFrame(
            data=scaler.transform(X), columns=X.columns, index=X.index
        )
        scaled_df['target'] = df['target']
        scaled_df['train'] = df['train']

        return scaled_df

    def get_scaler(self):
        """Return an unfitted scaler exposing ``fit`` and ``transform``."""
        raise NotImplementedError("Subclasses should implement this method.")


class MinMaxScaler(Scaler):
    """Scaler task backed by ``sklearn.preprocessing.MinMaxScaler``."""

    def get_scaler(self):
        """Return a fresh, unfitted sklearn ``MinMaxScaler``."""
        # Referenced through the module because this class shares its name
        # with the sklearn estimator.
        from sklearn import preprocessing

        return preprocessing.MinMaxScaler()


class StandardScaler(Scaler):
    """Scaler task backed by ``sklearn.preprocessing.StandardScaler``."""

    def get_scaler(self):
        """Return a fresh, unfitted sklearn ``StandardScaler``."""
        # Referenced through the module because this class shares its name
        # with the sklearn estimator.
        from sklearn import preprocessing

        return preprocessing.StandardScaler()


class PCA(Task):
    """Task that replaces the feature columns with principal components.

    Keyword arguments given to the constructor are forwarded unchanged to
    ``sklearn.decomposition.PCA``.
    """
    input_types = ["split", "processed"]
    output_types = ["processed"]

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def run(self, df):
        """Return a frame of ``PC1..PCk`` columns plus ``target`` and ``train``.

        The projection is fitted on the training rows only (``train == 1``)
        and then applied to every row.
        """
        # Imported locally: this class shares its name with the sklearn one.
        from sklearn.decomposition import PCA

        X = df.drop(columns=['target', 'train'])
        X_train = X[df['train'] == 1]

        pca = PCA(**self.kwargs)
        pca.fit(X_train)

        X_pca = pca.transform(X)
        # Keep the input's index: the label columns below are copied by index
        # alignment, so a fresh RangeIndex would misalign them (or produce
        # NaN) whenever ``df`` is not indexed 0..n-1.
        pca_df = pd.DataFrame(
            data=X_pca,
            columns=[f'PC{i+1}' for i in range(X_pca.shape[1])],
            index=X.index,
        )
        pca_df['target'] = df['target']
        pca_df['train'] = df['train']

        return pca_df


class Classifier(Task):
    """Base task that fits a classifier on the training rows and predicts all rows.

    Subclasses supply the estimator via ``get_classifier``.
    """
    input_types = ["split", "processed"]
    output_types = ["processed", "predicted"]

    def run(self, df):
        """Return a copy of ``df`` with a ``predicted`` column.

        ``df`` must contain ``target`` and ``train`` columns. The classifier is
        fitted on rows where ``train == 1`` and predicts every row. An existing
        ``predicted`` column is overwritten in the returned frame.
        """
        # NOTE(review): only 'target' and 'train' are dropped, so a 'predicted'
        # column left by an upstream classifier is used as a feature here --
        # confirm that this stacking behaviour is intended.
        X = df.drop(columns=['target', 'train'])
        is_train = df['train'] == 1

        classifier = self.get_classifier()
        classifier.fit(X[is_train], df.loc[is_train, 'target'])

        # assign() builds a new frame instead of writing into the caller's
        # frame, so an upstream result that is reused elsewhere (for example
        # by a caching executor) is not silently modified.
        return df.assign(predicted=classifier.predict(X))

    def get_classifier(self):
        """Return an unfitted estimator exposing ``fit`` and ``predict``."""
        raise NotImplementedError("Subclasses should implement this method.")


class LogisticRegression(Classifier):
    """Classifier task backed by ``sklearn.linear_model.LogisticRegression``."""

    def __init__(self, **kwargs):
        # Stored as given and forwarded to the sklearn constructor on demand.
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted sklearn ``LogisticRegression``."""
        # Referenced through the module because this class shares its name
        # with the sklearn estimator.
        from sklearn import linear_model

        return linear_model.LogisticRegression(**self.kwargs)


class RandomForestClassifier(Classifier):
    """Classifier task backed by ``sklearn.ensemble.RandomForestClassifier``."""

    def __init__(self, **kwargs):
        # Stored as given and forwarded to the sklearn constructor on demand.
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted sklearn ``RandomForestClassifier``."""
        # Referenced through the module because this class shares its name
        # with the sklearn estimator.
        from sklearn import ensemble

        return ensemble.RandomForestClassifier(**self.kwargs)


class SVC(Classifier):
    """Classifier task backed by ``sklearn.svm.SVC``."""

    def __init__(self, **kwargs):
        # Stored as given and forwarded to the sklearn constructor on demand.
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted sklearn ``SVC``."""
        # Referenced through the module because this class shares its name
        # with the sklearn estimator.
        from sklearn import svm

        return svm.SVC(**self.kwargs)


class Evaluator(Task):
    """Terminal task: scores predictions on the held-out rows."""
    input_types = ["predicted"]
    output_types = ["sink"]

    def run(self, df):
        """Return the fraction of test rows whose prediction equals the target.

        Only rows with ``train == 0`` are scored.
        """
        held_out = df.loc[df['train'].eq(0)]
        is_correct = held_out['target'].eq(held_out['predicted'])
        # Mean of a boolean Series is the share of True values, i.e. accuracy.
        return is_correct.mean()

In [None]:
from pypekit import Repository, CachedExecutor

# Register one instance of every task under a short name; these names are what
# appear in each pipeline's `tasks` list in the output below.
repository = Repository([
    ("data_loader", IrisLoader()),
    ("train_test_splitter", TrainTestSplitter()),
    ("minmax_scaler", MinMaxScaler()),
    ("standard_scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("logistic_regression", LogisticRegression(max_iter=200)),
    ("random_forest", RandomForestClassifier(n_estimators=100)),
    ("svc", SVC(kernel='linear')),
    ("evaluator", Evaluator()),
])

# Presumably enumerates every task chain whose input/output types line up --
# TODO confirm against the pypekit documentation.
pipeline_dict = repository.build_pipelines()
# Print each generated pipeline (its id and ordered task names).
for pipeline in pipeline_dict.values():
    print(pipeline)

Pipeline(id=5a8deb1b5e84441b9f298f65f5b83acc, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator'])
Pipeline(id=aa8a8f279aba45889fb03f1c286983fe, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator'])
Pipeline(id=3ac58002889446e2ad36a2aa4708bdb6, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator'])
Pipeline(id=6d44d7600b274c3b98da501864c26ceb, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator'])
Pipeline(id=212ab338781741488bc9af875dcc5e08, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'evaluator'])
Pipeline(id=34311650a6174f3f8913eb12e1e19d14, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'random_forest', 'logistic_regression', 'svc', 'evalu

In [3]:
# Run every pipeline built above; with verbose=True a progress line is printed
# per pipeline (see the output below).
# NOTE(review): the train/test split is unseeded, so scores will differ
# between runs.
executor = CachedExecutor(pipeline_dict, verbose=True)
results = executor.run()

Ran pipeline 5a8deb1b5e84441b9f298f65f5b83acc. Runtime: 2.45s. 1/144 pipelines completed.
Ran pipeline aa8a8f279aba45889fb03f1c286983fe. Runtime: 2.44s. 2/144 pipelines completed.
Ran pipeline 3ac58002889446e2ad36a2aa4708bdb6. Runtime: 1.72s. 3/144 pipelines completed.
Ran pipeline 6d44d7600b274c3b98da501864c26ceb. Runtime: 1.66s. 4/144 pipelines completed.
Ran pipeline 212ab338781741488bc9af875dcc5e08. Runtime: 1.66s. 5/144 pipelines completed.
Ran pipeline 34311650a6174f3f8913eb12e1e19d14. Runtime: 1.73s. 6/144 pipelines completed.
Ran pipeline 53974fe2c8034de79263458d8ce4bc80. Runtime: 1.69s. 7/144 pipelines completed.
Ran pipeline a8c7aa61fa4c4a22850506983e7d7e46. Runtime: 1.69s. 8/144 pipelines completed.
Ran pipeline 7a497fdfed8a4b0594eec74d49fa8242. Runtime: 1.69s. 9/144 pipelines completed.
Ran pipeline 046c38eccfe64e2a97978b46fb192cac. Runtime: 1.68s. 10/144 pipelines completed.
Ran pipeline 5147f9a9f1a74d3bb759b54926270f41. Runtime: 2.55s. 11/144 pipelines completed.
Ran pipe

In [4]:
# Print one result record per pipeline; per the output below each record holds
# the pipeline id, the evaluator's output, the runtime and the task names.
for result in results.values():
    print(result)

{'pipeline_id': '5a8deb1b5e84441b9f298f65f5b83acc', 'output': np.float64(0.9333333333333333), 'runtime': 2.4475462, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator']}
{'pipeline_id': 'aa8a8f279aba45889fb03f1c286983fe', 'output': np.float64(0.9333333333333333), 'runtime': 2.4445745999999997, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator']}
{'pipeline_id': '3ac58002889446e2ad36a2aa4708bdb6', 'output': np.float64(0.9333333333333333), 'runtime': 1.7210162999999987, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator']}
{'pipeline_id': '6d44d7600b274c3b98da501864c26ceb', 'output': np.float64(0.9333333333333333), 'runtime': 1.6625276999999992, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator']}
{