In [18]:
import pandas as pd
from io import BytesIO
import boto3
import os

In [10]:
def add_root(file, root):
    """
    Prefix ``file`` with the directory ``root``.

    :param file: file name (or relative path) to be placed under ``root``
    :param root: directory to put in front of ``file``
    :return: the result of ``os.path.join(root, file)``
    """
    full_path = os.path.join(root, file)
    return full_path


def save_to_buffer_as_parquet(df):
    """
    Serialize ``df`` as parquet into an in-memory buffer.

    :param df: object exposing ``to_parquet(buffer, index=False)``
        (a pandas DataFrame in this notebook)
    :return: a ``BytesIO`` holding the parquet bytes, rewound to position 0
    """
    out_buffer = BytesIO()
    df.to_parquet(out_buffer, index=False)
    # Writing leaves the cursor at the end of the stream; rewind so that a
    # consumer calling .read() (or passing the buffer to a file-object uploader)
    # gets the whole payload instead of zero bytes. .getvalue() is unaffected.
    out_buffer.seek(0)
    return out_buffer



In [25]:
class S3BucketConnector:
    """
    Small wrapper around one S3 bucket for reading and writing parquet files.

    Credentials are not passed in directly: the constructor receives the *names*
    of the environment variables that hold them and looks the values up itself.
    """

    def __init__(self, aws_access_key, aws_secret, bucket_name, region_name="us-east-1"):
        """
        Create a boto3 session and bind this connector to ``bucket_name``.

        :param aws_access_key: name of the environment variable holding the access key id
        :param aws_secret: name of the environment variable holding the secret access key
        :param bucket_name: name of the S3 bucket to read from / write to
        :param region_name: AWS region used for the session
        :raises KeyError: if either environment variable is not set
        """
        # Resolve both credentials first so a missing variable is reported by
        # name (never by value) before any AWS object is created.
        try:
            access_key_id = os.environ[aws_access_key]
            secret_access_key = os.environ[aws_secret]
        except KeyError as err:
            raise KeyError(
                f"environment variable {err.args[0]!r} holding the AWS credentials is not set"
            ) from err

        self.region_name = region_name
        # Keyword arguments keep the call correct even if the positional order
        # of boto3.Session parameters is misremembered or changes.
        self.session = boto3.Session(
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key,
            region_name=self.region_name,
        )
        self._s3 = self.session.resource("s3")
        self._bucket = self._s3.Bucket(bucket_name)

    def read_parquet_to_dataframe(self, key: str) -> pd.DataFrame:
        """
        Read a parquet file stored in S3 and return it as a dataframe.

        :param key: object key inside the bucket
        :return: dataframe parsed from the object's bytes
        :raises Exception: if the object body is empty (zero bytes)
        """
        print(f"Reading file {key} from {self._bucket.name} in {self.region_name}")
        payload = self._bucket.Object(key=key).get()["Body"].read()
        # NOTE(review): for a key that is absent from the bucket, boto3 is expected
        # to raise its own error from get() above, so this branch is only reached
        # for an object whose body is empty - confirm against the boto3 docs.
        if not payload:
            raise Exception(f"{key} is empty; there is no parquet data to read")
        return pd.read_parquet(BytesIO(payload))

    def write_df_to_s3_as_parquet(self, df: pd.DataFrame, key: str) -> None:
        """
        Serialize ``df`` as parquet (without the index) and upload it to the bucket.

        :param df: dataframe to upload
        :param key: object key to write inside the bucket
        """
        out_buffer = BytesIO()
        df.to_parquet(out_buffer, index=False)
        print(f"Writing file {key} to {self._bucket.name} in {self.region_name}")
        # getvalue() returns the full contents regardless of the cursor position.
        self._bucket.put_object(Body=out_buffer.getvalue(), Key=key)

In [26]:
# Local folder holding the raw CSV exports. Built with os.path.join so the
# notebook is not tied to Windows path separators.
DATA_ROOT = os.path.join("..", "data", "raw")

bucket_name = "slafaurie-airflow"
# The two strings are the names of environment variables, not the credentials.
slafaurie_airflow_bucket = S3BucketConnector("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", bucket_name)

prefix = "olist/one-run/raw/"
# sorted() makes the upload order reproducible across runs and machines.
for filename in sorted(os.listdir(DATA_ROOT)):
    # splitext only strips the final extension, so "a.b.csv" keeps the stem "a.b".
    stem, extension = os.path.splitext(filename)
    # Skip anything that is not a CSV (hidden files, sub-folders, stray artifacts)
    # instead of failing in read_csv or uploading a key with an empty stem.
    if extension.lower() != ".csv":
        continue
    df = pd.read_csv(add_root(filename, DATA_ROOT))
    key = prefix + stem + ".parquet"
    slafaurie_airflow_bucket.write_df_to_s3_as_parquet(df, key)

Writing file olist/one-run/raw/olist_closed_deals_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_customers_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_geolocation_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_orders_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_order_items_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_order_payments_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_order_reviews_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_products_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/olist_sellers_dataset.parquet to slafaurie-airflow in us-east-1
complete
Writing file olist/one-run/raw/produ