# Getting Started

## Installation

```
pip install pypekit
```

## Define Tasks

In [1]:
from pypekit import Task

class Source(Task):
    """Entry task of the example.

    Declares ``"source"`` as its input type and ``"a"`` as its output type.
    """

    input_types = ["source"]
    output_types = ["a"]

    def run(self, _):
        """Print a progress line and return the seed string.

        The single positional argument is ignored.
        """
        print("Running Source")
        seed = "source"
        return seed

class Transform1(Task):
    """Example transform declaring input type ``"a"`` and output type ``"b"``."""

    input_types = ["a"]
    output_types = ["b"]

    def run(self, x):
        """Print a progress line and return ``x`` with ``"_transformed-1"`` appended."""
        print("Running Transform1")
        transformed = x + "_transformed-1"
        return transformed
    
class Transform2(Task):
    """Example transform declaring input types ``"a"`` and ``"b"`` and output type ``"b"``."""

    input_types = ["a", "b"]
    output_types = ["b"]

    def run(self, x):
        """Print a progress line and return ``x`` with ``"_transformed-2"`` appended."""
        print("Running Transform2")
        transformed = x + "_transformed-2"
        return transformed
    
class Sink1(Task):
    """Example terminal task declaring input type ``"b"`` and output type ``"sink"``."""

    input_types = ["b"]
    output_types = ["sink"]

    def run(self, x):
        """Print a progress line and return ``x`` with ``"_sink-1"`` appended."""
        print("Running Sink1")
        final = x + "_sink-1"
        return final
    
class Sink2(Task):
    """Example terminal task declaring input type ``"b"`` and output type ``"sink"``."""

    input_types = ["b"]
    output_types = ["sink"]

    def run(self, x):
        """Print a progress line and return ``x`` with ``"_sink-2"`` appended."""
        print("Running Sink2")
        final = x + "_sink-2"
        return final


## Build Repository

In [2]:
from pypekit import Repository

# Register the five task classes defined above (classes, not instances)
# with a repository.
repository = Repository([
    Source,
    Transform1,
    Transform2,
    Sink1,
    Sink2
])

# Build the task tree from the registered tasks. The return value is kept in
# `root` (presumably the tree's root node -- confirm against the pypekit API).
root = repository.build_tree()

## Inspect Tree

In [3]:
# Render the previously built tree as text and print it; the recorded output
# below shows every branch from Root() down to the sink tasks.
tree_representation = repository.build_tree_string()
print(tree_representation)

└── Root()
    └── Source()
        ├── Transform1()
        │   ├── Transform2()
        │   │   ├── Sink1()
        │   │   └── Sink2()
        │   ├── Sink1()
        │   └── Sink2()
        └── Transform2()
            ├── Sink1()
            └── Sink2()



## Build Pipelines

In [4]:
# Derive the pipelines from the repository and print one per line. In the
# recorded output each pipeline corresponds to one Source-to-sink branch of
# the tree shown earlier.
pipelines = repository.build_pipelines()
for p in pipelines:
    print(p)

Pipeline(tasks=[Source(), Transform1(), Transform2(), Sink1()])
Pipeline(tasks=[Source(), Transform1(), Transform2(), Sink2()])
Pipeline(tasks=[Source(), Transform1(), Sink1()])
Pipeline(tasks=[Source(), Transform1(), Sink2()])
Pipeline(tasks=[Source(), Transform2(), Sink1()])
Pipeline(tasks=[Source(), Transform2(), Sink2()])


## Execute Pipelines with Caching

In [5]:
from pypekit import CachedExecutor

# Run all pipelines through a caching executor and keep the results.
# In the recorded output below, tasks shared with an earlier pipeline are not
# executed again (e.g. pipeline 2 only prints "Running Sink2").
executor = CachedExecutor(pipelines, verbose=True)
results = executor.run()

Running Source
Running Transform1
Running Transform2
Running Sink1
Pipeline 1/6 completed. Runtime: 0.00s.
Running Sink2
Pipeline 2/6 completed. Runtime: 0.00s.
Running Sink1
Pipeline 3/6 completed. Runtime: 0.00s.
Running Sink2
Pipeline 4/6 completed. Runtime: 0.00s.
Running Transform2
Running Sink1
Pipeline 5/6 completed. Runtime: 0.00s.
Running Sink2
Pipeline 6/6 completed. Runtime: 0.00s.


## Inspect Results

In [6]:
# `results` is a mapping; each value is indexed by the keys 'tasks' and
# 'output', which are printed side by side for every pipeline.
for r in results.values():
    print(f"Tasks: {r['tasks']}    Output: {r['output']}")

Tasks: ['Source()', 'Transform1()', 'Transform2()', 'Sink1()']    Output: source_transformed-1_transformed-2_sink-1
Tasks: ['Source()', 'Transform1()', 'Transform2()', 'Sink2()']    Output: source_transformed-1_transformed-2_sink-2
Tasks: ['Source()', 'Transform1()', 'Sink1()']    Output: source_transformed-1_sink-1
Tasks: ['Source()', 'Transform1()', 'Sink2()']    Output: source_transformed-1_sink-2
Tasks: ['Source()', 'Transform2()', 'Sink1()']    Output: source_transformed-2_sink-1
Tasks: ['Source()', 'Transform2()', 'Sink2()']    Output: source_transformed-2_sink-2


# Reusing Cache

In [7]:
# Create a second executor that starts from the first executor's cache. In the
# recorded output below no "Running ..." lines appear, i.e. no task is re-run.
new_executor = CachedExecutor(pipelines, cache=executor.cache, verbose=True)
# The trailing semicolon keeps the notebook from echoing the return value.
new_executor.run();

Pipeline 1/6 completed. Runtime: 0.00s.
Pipeline 2/6 completed. Runtime: 0.00s.
Pipeline 3/6 completed. Runtime: 0.00s.
Pipeline 4/6 completed. Runtime: 0.00s.
Pipeline 5/6 completed. Runtime: 0.00s.
Pipeline 6/6 completed. Runtime: 0.00s.


# Instances, Parameters and Pipelines

In [8]:
from pypekit import Pipeline

# New task that takes arguments
class Transform3(Task):
    """Example transform configurable through an optional ``test`` keyword.

    Declares input type ``"a"`` and output type ``"b"``.
    """

    input_types = ["a"]
    output_types = ["b"]

    def __init__(self, **kwargs):
        """Store the ``test`` keyword argument, defaulting to ``False``.

        Any other keyword arguments are accepted and ignored.
        """
        self.test = kwargs.get("test", False)

    def run(self, x):
        """Print a progress line and return ``x`` with a suffix appended.

        The suffix is ``"_transformed-3"``, followed by ``"-test"`` when
        ``self.test`` is truthy.
        """
        print("Running Transform3 with test =", self.test)
        base = x + "_transformed-3"
        if self.test:
            tail = "-test"
        else:
            tail = ""
        return base + tail

# A hand-built pipeline; it is registered below as a single task.
pipeline = Pipeline([
    Transform1(),
    Sink2()
])

# The repository accepts a mix of task classes, (class, kwargs) tuples,
# task instances and Pipeline instances.
repository = Repository([
    Source,
    Transform2, Transform2,         # Instances of Transform2 can occur twice on each branch of the tree
    (Transform3, {"test": True}),   # Transform3 will be instantiated with the argument test=True every time it is used
    Sink1(),                        # Every node with the task Sink1 will have the same instance
    pipeline                        # Pipeline instance as task
])

# Rebuild the tree for the new repository and print its text rendering.
repository.build_tree()
print(repository.build_tree_string())

└── Root()
    └── Source()
        ├── Transform2()
        │   ├── Transform2()
        │   │   └── Sink1()
        │   └── Sink1()
        ├── Transform2()
        │   ├── Transform2()
        │   │   └── Sink1()
        │   └── Sink1()
        ├── Transform3(test=True)
        │   ├── Transform2()
        │   │   ├── Transform2()
        │   │   │   └── Sink1()
        │   │   └── Sink1()
        │   ├── Transform2()
        │   │   ├── Transform2()
        │   │   │   └── Sink1()
        │   │   └── Sink1()
        │   └── Sink1()
        └── Pipeline(tasks=[Transform1(), Sink2()])

