In [1]:
import ray
import rayvens
import sys
import time
import os
import yaml
import json

import pandas as pd
from io import StringIO

In [2]:
import random
import string

def generate_id(N: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        N: Number of characters in the identifier.

    Returns:
        A string of length ``N`` drawn (with replacement) from ``a-z0-9``.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=N)
    return ''.join(picked)

# File System Directory as Source and Sink
___

This tutorial will demo how to read, process, and upload files from and to a ```rayvens``` stream, where the source and sink of the stream are directories in the local file system.
___

### Run Parameters

Let's first define some run parameters for ```rayvens```:

* ```run_mode```: for the rayvens run mode. We will demonstrate on the local run mode.
* ```after_idle_for```: idle time before stream is disconnected.
* ```rayvens_logs```: whether to print the full ```rayvens``` logs. We will opt not to print them.

In [3]:
from collections import namedtuple

# Container for the run parameters; fields are read as attributes later on.
Args = namedtuple('Args', 'run_mode after_idle_for rayvens_logs')

run_mode = 'local'      # rayvens run mode
after_idle_for = 5      # idle time before the stream is disconnected
rayvens_logs = False    # whether to print the full rayvens logs

args = Args(
    run_mode=run_mode,
    after_idle_for=after_idle_for,
    rayvens_logs=rayvens_logs,
)

### Initialization

We now initialize ```ray``` and ```rayvens```. 

In [4]:
import ray
import rayvens

# Only the 'operator' run mode passes address='auto' to ray.init();
# every other mode initialises ray without an address.
if args.run_mode != 'operator':
    ray.init(ignore_reinit_error=True)
else:
    ray.init(address='auto', ignore_reinit_error=True)

# `release` is the inverse of the logging flag: with rayvens_logs=False
# rayvens is initialised with release=True.
rayvens.init(mode=args.run_mode, release=not args.rayvens_logs)

2022-10-29 19:30:28,197	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


### Source and Sink Configurations

We choose a file system directory ```path```, which we will use both to read and process files as well as to upload files.

We also choose another file system directory ```path_dest``` into which files will be transferred once processed.

By default, ```path``` is a new directory created in the current working directory, and ```path_dest``` is a directory created under ```path```.

In [5]:
# Where files are moved once processed; will be created under `path`.
path_dest = './processed'
# Location used for both the stream source and sink; will be created in the
# current working directory.
path = './rayvens_directory_test'

# Create the directory only when nothing exists at `path` yet, so an
# existing directory or file is left untouched.
if not os.path.exists(path):
    os.mkdir(path)

In [6]:
from pathlib import Path

# Resolve `path` into the source/sink locations and their `file:` URIs.
stream_path = Path(path)

if stream_path.is_dir():
    # A directory serves as both the source and the sink.
    source_path = str(stream_path)
    sink_path = source_path
    source_uri = f'file:{source_path}?'
elif stream_path.is_file():
    # A single file is the source; its parent directory is the sink.
    source_path = str(stream_path)
    sink_path = str(stream_path.parent)
    # The trailing '&' separates the `filename` option from the `move` option
    # appended below; without it the two fuse into one malformed value
    # ("filename=<name>move=<dest>").
    source_uri = f'file:{sink_path}?filename={stream_path.name}&'
else:
    # Raised when nothing exists at `path` (or it is neither file nor directory).
    raise TypeError(f'Path {path} must either be a directory or a file.')

sink_uri = f'file:{sink_path}?'
# Processed files are moved to `path_dest`.
source_uri += f'move={path_dest}'

We define the configurations for the stream source and sink.

NOTE: we can either choose to keep the files in the source directory with ```keep_file=True``` or move them to a different directory by specifying one under ```move_after_read```. As expected, these two options are mutually exclusive.

In [7]:
# Source: read files from `source_path`; files are not kept in place but
# moved to `path_dest` after being read (the two options are mutually exclusive).
source_config = {
    'kind': 'file-source',
    'path': source_path,
    'keep_file': False,
    'move_after_read': path_dest,
}
# Sink: write files into `sink_path`.
sink_config = {'kind': 'file-sink', 'path': sink_path}

We can also emulate the directory sink of a ```rayvens``` stream with a generic sink. 

In [8]:
# sink_config = dict(kind='generic-sink', uri=sink_uri)

### rayvens Sink

We are ready to test the given file system directory under ```path``` as a ```rayvens``` stream sink. 

We will use the ```rayvens``` stream to upload a json file to the directory under ```path```.

In [9]:
# Payload to upload; the tuple is serialised as a JSON array by json.dumps.
json_content = dict(content=('foo', None, 1.0, 2))
# Random suffix so that repeated runs do not reuse the same file name.
json_name = 'file_' + generate_id() + '.json'
# Expected location of the uploaded file inside the sink directory.
json_path = os.path.join(sink_path, json_name)

In [10]:
# Create the stream used for uploading and attach the file sink configured above.
stream = rayvens.Stream('files-upload')

sink = stream.add_sink(sink_config)

# The event body is the JSON-serialised content; the CamelFileName header
# carries the name under which the file is expected to appear in the sink
# directory (json_path is checked for existence afterwards).
event = rayvens.OutputEvent(json.dumps(json_content),{"CamelFileName": json_name})

# Send the event into the stream.
stream << event

# Disconnect once the stream has been idle for args.after_idle_for.
stream.disconnect_all(after_idle_for=args.after_idle_for)

[2m[36m(pid=29196)[0m Exec command =>  /Users/oritd/.pyenv/versions/3.9.0/envs/rayvens/bin/python3.9 /Users/oritd/Workspace/playground/rayvens/rayvens/core/harness.py kamel local run --property quarkus.http.port=58284 -d camel:camel-quarkus-microprofile-health /Users/oritd/Workspace/playground/sandbox/files-upload-file-sink.yaml
[2m[36m(pid=29196)[0m [Kamel subprocess] Kamel `local run` command finished successfully.
[2m[36m(pid=29196)[0m Integration files-upload-file-sink is ready.


In [11]:
# The upload succeeded if the file now exists in the sink directory.
upload_succeeded = os.path.exists(json_path)
print('Upload of file {} successful: {}.'.format(json_name, upload_succeeded))

Upload of file file_t0wx20kc.json successful: True.


### rayvens Source

We will now test the given file system directory under ```path``` as a ```rayvens``` source.

NOTE: Moving will delete all files under ```path```, once processed, and transfer them to the directory under ```path_dest```.

In [12]:
# Create a separate stream for reading; this rebinds the name `stream`.
stream = rayvens.Stream('files-download')

# Attach the file source described by source_config.
source = stream.add_source(source_config)

def process_file(event):
    """Print the file name and contents carried by a JSON-encoded event.

    Args:
        event: JSON string with at least the keys ``filename`` and ``body``.

    Raises:
        KeyError: If either key is missing from the decoded event.
    """
    payload = json.loads(event)
    name, body = payload['filename'], payload['body']
    print(f"File name: {name}, Contents: {body}")

# Register process_file on the stream so that it receives the events
# produced by the source.
stream >> process_file

# Disconnect once the stream has been idle for args.after_idle_for.
stream.disconnect_all(after_idle_for=args.after_idle_for)

[2m[36m(pid=29197)[0m Exec command =>  /Users/oritd/.pyenv/versions/3.9.0/envs/rayvens/bin/python3.9 /Users/oritd/Workspace/playground/rayvens/rayvens/core/harness.py kamel local run /Users/oritd/Workspace/playground/rayvens/rayvens/core/FileQueueName.java -d mvn:com.googlecode.json-simple:json-simple:1.1.1 --property quarkus.http.port=62044 -d camel:camel-quarkus-microprofile-health /Users/oritd/Workspace/playground/sandbox/files-download-file-source.yaml
[2m[36m(pid=29197)[0m [Kamel subprocess] Kamel `local run` command finished successfully.
[2m[36m(pid=29197)[0m Integration files-download-file-source is ready.
[2m[36m(pid=29197)[0m File name: file_t0wx20kc.json, Contents: {"content": ["foo", null, 1.0, 2]}


In [13]:
# With keep_file=False the processed file should no longer be at json_path.
file_still_in_source = os.path.exists(json_path)
print('Files kept in path:', file_still_in_source)

Files kept in path: False


### Shutting down

Finally, we make sure to shut down everything.

In [14]:
# Disconnect anything still attached to the current `stream`
# (the name refers to the most recently created stream).
stream.disconnect_all()

# Shut down ray, which was started with ray.init() at the top of the notebook.
ray.shutdown()