In [None]:
import pandas
import lithops
import io
import dataplug
import boto3

from dataplug import CloudObject
from dataplug.formats.generic.csv import CSV, partition_chunk_size

# Resolve AWS credentials through boto3 and take a frozen snapshot so the
# key, secret and token handed to dataplug belong together.
session = boto3.Session(region_name="us-east-1")
resolved_credentials = session.get_credentials()
if resolved_credentials is None:
    # get_credentials() returns None when nothing is configured; fail with a
    # readable message instead of an AttributeError on the following line.
    raise RuntimeError(
        "No AWS credentials found; configure them (environment variables, "
        "~/.aws/credentials, or an IAM role) before running this notebook."
    )
creds = resolved_credentials.get_frozen_credentials()

# Credential mapping passed to CloudObject.from_s3 through `s3_config`.
s3_config = {
    "credentials": {
        "AccessKeyId": creds.access_key,
        "SecretAccessKey": creds.secret_key,
        "SessionToken": creds.token,
    }
}

In [None]:
# Read a personal identifier from stdin and derive the bucket names from it.
my_id = input()
print(f"Your ID is {my_id}")
read_bucket_name = "scipy-tutorial-data"  # bucket holding the input CSV
write_bucket_name = f"scipy-tutorial-{my_id}"  # bucket suffixed with the ID

In [None]:
# Wrap the remote CSV as a CloudObject; `metadata_bucket` is set to the
# bucket named after the user's ID.
co = CloudObject.from_s3(
    CSV,
    f"s3://{read_bucket_name}/yellow_tripdata_2015-01.csv",
    s3_config=s3_config,
    metadata_bucket=write_bucket_name,
)

# Preprocess the object before asking for partitions.
co.preprocess()

# Partition by chunk size: 25 * 1024 * 1024 bytes = 25 MiB per slice.
data_slices = co.partition(partition_chunk_size, chunk_size=25 * 1024 * 1024)

def count_total_passengers(data_slice):
    """Sum the ``passenger_count`` column of one CSV slice.

    Args:
        data_slice: Object whose ``get()`` returns the slice's CSV content as
            text, including a header row with a ``passenger_count`` column.

    Returns:
        The sum of the ``passenger_count`` values found in the slice.

    Raises:
        KeyError: If the parsed data has no ``passenger_count`` column.
    """
    # get() already yields text, so parse it directly instead of encoding it
    # back to UTF-8 bytes first, which would copy the whole slice once more.
    csv_text = data_slice.get()
    frame = pandas.read_csv(io.StringIO(csv_text))
    return frame["passenger_count"].sum()

def reduce_fn(total_passengers):
    """Combine the per-slice passenger counts into one grand total.

    Args:
        total_passengers: Iterable of partial sums, one per mapped slice.

    Returns:
        The sum of all partial sums (``0`` for an empty iterable).
    """
    grand_total = sum(total_passengers)
    return grand_total


# Step 3: Create a Lithops executor, map the counting function over every
# data slice and reduce the partial sums into a single value
executor = lithops.FunctionExecutor()
futures = executor.map_reduce(
    map_function=count_total_passengers,
    map_iterdata=data_slices,
    reduce_function=reduce_fn,
)
result = futures.get_result()

print(f"Total passengers: {result}")

# TODO: add a metrics store for time, cost and performance, then brute-force the best configuration and plot the results