In [1]:
from pypekit import Task
import pandas as pd


class IrisLoader(Task):
    """Source task that produces the Iris dataset as a pandas DataFrame.

    Accepts the ``"source"`` type and emits ``"raw"``.
    """

    input_types = ["source"]
    output_types = ["raw"]

    def run(self, _):
        """Load Iris and return its features together with a ``target`` column.

        The single positional argument is ignored. The returned frame has one
        column per entry of the dataset's ``feature_names`` followed by
        ``target``.
        """
        # scikit-learn is imported on first use rather than at cell level.
        from sklearn import datasets

        bunch = datasets.load_iris()
        features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        features["target"] = bunch.target
        return features


class TrainTestSplitter(Task):
    """Randomly split a frame and tag every row with a ``train`` flag.

    Accepts ``"raw"`` and emits ``"split"``. The output is a single frame
    (training rows first, then test rows, with a fresh 0..n-1 index) carrying
    an extra integer column ``train`` that is 1 for training rows and 0 for
    test rows.

    Class attributes (override on a subclass or instance to tune the split):
        test_size: forwarded to ``train_test_split``; defaults to 0.2.
        random_state: forwarded to ``train_test_split``. ``None`` (default)
            keeps the split unseeded, so it differs between runs; set an int
            to make the split reproducible.
    """

    input_types = ["raw"]
    output_types = ["split"]

    test_size = 0.2
    random_state = None

    def run(self, df):
        """Return ``df`` re-ordered into train/test rows with a ``train`` column.

        Args:
            df: DataFrame to split; it is not modified.

        Returns:
            A new DataFrame containing all rows of ``df`` plus ``train``.
        """
        from sklearn.model_selection import train_test_split

        train_df, test_df = train_test_split(
            df, test_size=self.test_size, random_state=self.random_state
        )
        # ``assign`` returns new frames, so nothing is written into the objects
        # handed back by the splitter. Writing a column into those can raise
        # SettingWithCopyWarning on some pandas/scikit-learn versions.
        return pd.concat(
            [train_df.assign(train=1), test_df.assign(train=0)],
            ignore_index=True,
        )


class Scaler(Task):
    """Base task that rescales feature columns using a train-fitted scaler.

    Accepts ``"split"`` and emits ``"processed"``. Subclasses provide the
    concrete scaler through :meth:`get_scaler`.
    """

    input_types = ["split"]
    output_types = ["processed"]

    def run(self, df):
        """Scale every column except ``target`` and ``train``.

        The scaler is fitted on rows where ``train == 1`` only and then applied
        to all rows.

        Args:
            df: DataFrame with ``target`` and ``train`` columns; every other
                column is treated as a feature. It is not modified.

        Returns:
            A new DataFrame with the scaled feature columns (same names and
            row index as ``df``) followed by ``target`` and ``train``.
        """
        X = df.drop(columns=['target', 'train'])
        X_train = X[df['train'] == 1]

        scaler = self.get_scaler()
        scaler.fit(X_train)

        X_scaled = scaler.transform(X)
        # Reuse the input index: the label columns below are assigned by index
        # alignment, so a fresh RangeIndex would silently produce NaN or
        # misaligned values whenever ``df`` is not indexed 0..n-1.
        scaled_df = pd.DataFrame(
            data=X_scaled, columns=X.columns, index=df.index)
        scaled_df['target'] = df['target']
        scaled_df['train'] = df['train']

        return scaled_df

    def get_scaler(self):
        """Return an unfitted scaler exposing ``fit`` and ``transform``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")


class MinMaxScaler(Scaler):
    """Scaler task backed by scikit-learn's ``MinMaxScaler``."""

    def get_scaler(self):
        """Return a new, unfitted scikit-learn ``MinMaxScaler``."""
        # Module-qualified access avoids shadowing this class's own name.
        import sklearn.preprocessing as preprocessing

        return preprocessing.MinMaxScaler()


class StandardScaler(Scaler):
    """Scaler task backed by scikit-learn's ``StandardScaler``."""

    def get_scaler(self):
        """Return a new, unfitted scikit-learn ``StandardScaler``."""
        # Module-qualified access avoids shadowing this class's own name.
        import sklearn.preprocessing as preprocessing

        return preprocessing.StandardScaler()


class PCA(Task):
    """Task that replaces the feature columns with principal components.

    Accepts ``"split"`` or ``"processed"`` and emits ``"processed"``.
    """

    input_types = ["split", "processed"]
    output_types = ["processed"]

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to scikit-learn's ``PCA``."""
        self.kwargs = kwargs

    def run(self, df):
        """Project every column except ``target`` and ``train`` with PCA.

        The decomposition is fitted on rows where ``train == 1`` only and then
        applied to all rows.

        Args:
            df: DataFrame with ``target`` and ``train`` columns; every other
                column is treated as a feature. It is not modified.

        Returns:
            A new DataFrame with columns ``PC1``, ``PC2``, ... followed by
            ``target`` and ``train``, sharing the row index of ``df``.
        """
        X = df.drop(columns=['target', 'train'])
        X_train = X[df['train'] == 1]

        from sklearn.decomposition import PCA
        pca = PCA(**self.kwargs)
        pca.fit(X_train)

        X_pca = pca.transform(X)
        component_names = [f'PC{i+1}' for i in range(X_pca.shape[1])]
        # Reuse the input index: the label columns below are assigned by index
        # alignment, so a fresh RangeIndex would silently produce NaN or
        # misaligned values whenever ``df`` is not indexed 0..n-1.
        pca_df = pd.DataFrame(
            data=X_pca, columns=component_names, index=df.index)
        pca_df['target'] = df['target']
        pca_df['train'] = df['train']

        return pca_df


class Classifier(Task):
    """Base task that fits a classifier on training rows and predicts all rows.

    Accepts ``"split"`` or ``"processed"`` and emits ``"processed"`` or
    ``"predicted"``. Subclasses provide the estimator through
    :meth:`get_classifier`.
    """

    input_types = ["split", "processed"]
    output_types = ["processed", "predicted"]

    def run(self, df):
        """Fit on rows with ``train == 1`` and add a ``predicted`` column.

        Every column other than ``target`` and ``train`` is used as a feature.
        That includes a ``predicted`` column already present in ``df``, whose
        values are then replaced in the result.

        Args:
            df: DataFrame with ``target`` and ``train`` columns. It is not
                modified.

        Returns:
            A new DataFrame equal to ``df`` plus the ``predicted`` column.
        """
        X = df.drop(columns=['target', 'train'])
        y = df['target']
        is_train = df['train'] == 1

        classifier = self.get_classifier()
        classifier.fit(X[is_train], y[is_train])

        # Return a new frame rather than writing into ``df``. The input may be
        # shared with other consumers (e.g. reused by a caching executor), and
        # an in-place column would leak this model's predictions into them.
        return df.assign(predicted=classifier.predict(X))

    def get_classifier(self):
        """Return an unfitted estimator exposing ``fit`` and ``predict``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method.")


class LogisticRegression(Classifier):
    """Classifier task backed by scikit-learn's ``LogisticRegression``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator constructor."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted estimator built from the stored kwargs."""
        # Module-qualified access avoids shadowing this class's own name.
        import sklearn.linear_model as linear_model

        return linear_model.LogisticRegression(**self.kwargs)


class RandomForestClassifier(Classifier):
    """Classifier task backed by scikit-learn's ``RandomForestClassifier``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator constructor."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted estimator built from the stored kwargs."""
        # Module-qualified access avoids shadowing this class's own name.
        import sklearn.ensemble as ensemble

        return ensemble.RandomForestClassifier(**self.kwargs)


class SVC(Classifier):
    """Classifier task backed by scikit-learn's ``SVC``."""

    def __init__(self, **kwargs):
        """Store keyword arguments to forward to the estimator constructor."""
        self.kwargs = kwargs

    def get_classifier(self):
        """Return a new, unfitted estimator built from the stored kwargs."""
        # Module-qualified access avoids shadowing this class's own name.
        import sklearn.svm as svm

        return svm.SVC(**self.kwargs)


class Evaluator(Task):
    """Sink task that scores predictions on the held-out rows.

    Accepts ``"predicted"`` and emits ``"sink"``.
    """

    input_types = ["predicted"]
    output_types = ["sink"]

    def run(self, df):
        """Return the share of test rows whose prediction equals the target.

        Only rows with ``train == 0`` are considered; the result is the mean
        of the element-wise ``target == predicted`` comparison.
        """
        held_out = df.loc[df['train'] == 0]
        hits = held_out['target'] == held_out['predicted']
        return hits.mean()

In [2]:
from pypekit import Repository, CachedExecutor

# Register each task instance under a short name; these names are what show up
# in every pipeline's `tasks` list in the output below.
repository = Repository([
    ("data_loader", IrisLoader()),
    ("train_test_splitter", TrainTestSplitter()),
    ("minmax_scaler", MinMaxScaler()),
    ("standard_scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("logistic_regression", LogisticRegression(max_iter=200)),
    ("random_forest", RandomForestClassifier(n_estimators=100)),
    ("svc", SVC(kernel='linear')),
    ("evaluator", Evaluator()),
])

# Build the pipelines from the registered tasks and print one line per pipeline.
# NOTE(review): presumably pipelines are formed by chaining tasks whose
# output_types match the next task's input_types — confirm in the pypekit docs.
pipeline_dict = repository.build_pipelines()
for pipeline in pipeline_dict.values():
    print(pipeline)

Pipeline(id=ac269be357104c5891a06946dda286fe, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator'])
Pipeline(id=a42b491cb89e40dbbaf209bd062ec82f, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator'])
Pipeline(id=0645c19c3cf04e21a9d1db63df46207a, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator'])
Pipeline(id=35b6eb918b814563a5002ee1477a7e45, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator'])
Pipeline(id=822d48c09c114f2ea114eade982ee3b8, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'evaluator'])
Pipeline(id=bab7bf55cac74ff1ac31c94c3b73777a, tasks=['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'random_forest', 'logistic_regression', 'svc', 'evalu

In [3]:
# Run every pipeline built above. With verbose=True the executor prints a
# "Pipeline <id> completed." line per pipeline, as shown in the cell output.
executor = CachedExecutor(pipeline_dict, verbose=True)
results = executor.run()

Pipeline ac269be357104c5891a06946dda286fe completed.
Pipeline a42b491cb89e40dbbaf209bd062ec82f completed.
Pipeline 0645c19c3cf04e21a9d1db63df46207a completed.
Pipeline 35b6eb918b814563a5002ee1477a7e45 completed.
Pipeline 822d48c09c114f2ea114eade982ee3b8 completed.
Pipeline bab7bf55cac74ff1ac31c94c3b73777a completed.
Pipeline a138deac453048958c1a49cf222ce835 completed.
Pipeline 1c72528edff54f468d3b9f89a60702c0 completed.
Pipeline 41888bc2335049e4a92bbc2d5ade9cb8 completed.
Pipeline b747969eb50445549d504ac0fec11aae completed.
Pipeline 67a7ecef3a444be0b804bee6f783e20c completed.
Pipeline 78096945368446299bd1d2e9b918c6f9 completed.
Pipeline 9867345334be4f8fa82570bc1b835693 completed.
Pipeline 23647f24d3184ab5b23deb33f6ffb35e completed.
Pipeline b7794c92728f4c0b97ad9492bd332844 completed.
Pipeline a86a4c09abb24e8587d935af76849a50 completed.
Pipeline 4fe1fb508a2d4fc4a6ddcb6261799974 completed.
Pipeline c10f992b2901410396f43142a16d557e completed.
Pipeline c736661b71024ae397ac339f6b105d1b comp

In [4]:
# Print one result per pipeline; in the cell output each is a dict with
# 'pipeline_id', 'output', 'runtime' and 'tasks' keys.
for result in results.values():
    print(result)

{'pipeline_id': 'ac269be357104c5891a06946dda286fe', 'output': np.float64(0.9), 'runtime': 2.478000100000001, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'svc', 'evaluator']}
{'pipeline_id': 'a42b491cb89e40dbbaf209bd062ec82f', 'output': np.float64(0.9), 'runtime': 2.4761547000000004, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'random_forest', 'evaluator']}
{'pipeline_id': '0645c19c3cf04e21a9d1db63df46207a', 'output': np.float64(0.9), 'runtime': 1.6895932000000007, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'random_forest', 'evaluator']}
{'pipeline_id': '35b6eb918b814563a5002ee1477a7e45', 'output': np.float64(0.9), 'runtime': 1.6341876000000004, 'tasks': ['data_loader', 'train_test_splitter', 'minmax_scaler', 'pca', 'logistic_regression', 'svc', 'evaluator']}
{'pipeline_id': '822d48c09c114f2ea114eade982ee3b8', '