In [1]:
# | default_exp s3_loader2

In [2]:
# | export
import boto3
import os
import datetime
from datetime import tzinfo
from dateutil.tz import tzutc
from torch_snippets import stem, fname

In [3]:
from torch_snippets.s3_loader2 import S3FileHandler

# Credentials are read from the environment so that no secret is ever stored in
# (or committed with) the notebook. Export AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY before starting the kernel; a missing variable raises
# KeyError here rather than failing later inside an S3 call.
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]

In [4]:
# | export
# | hide
class S3FileHandler:
    """
    Convenience wrapper around a boto3 S3 client for listing buckets/objects,
    downloading files or whole prefixes, and uploading files with optional metadata.
    """

    def __init__(self, aws_access_key, aws_secret_access_key):
        """
        Create the underlying boto3 S3 client.

        :param aws_access_key: str. AWS access key id.
        :param aws_secret_access_key: str. AWS secret access key.
        """
        # Forward the constructor argument itself (not a notebook-level global)
        # so the class also works when imported from the exported module.
        self.s3_client = boto3.client(
            "s3",
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_access_key,
        )

    def list_s3_buckets(self):
        """
        List the names of all S3 buckets returned by ``list_buckets``.

        :return: list of str, or None if the call failed (the error is printed).
        """
        try:
            response = self.s3_client.list_buckets()
            return [bucket["Name"] for bucket in response["Buckets"]]
        except Exception as e:
            # Best-effort helper: report the problem instead of raising.
            print(e)
            return None

    def list_s3_objects(self, bucket_name, key=""):
        """
        List all files in an S3 bucket or within a specific prefix.

        :param bucket_name: str. Name of the S3 bucket.
        :param key: str or None. Prefix to list files from. "" or None lists
            the whole bucket. Defaults to "".
        :return: dict mapping object key -> size in bytes, or None if the call
            failed (the error is printed).
        """
        try:
            paginator = self.s3_client.get_paginator("list_objects_v2")
            files = {}
            # `key or ""` lets callers pass None, as the parameter doc allows.
            for page in paginator.paginate(Bucket=bucket_name, Prefix=key or ""):
                # Pages without matching objects have no 'Contents' entry.
                for obj in page.get("Contents", []):
                    files[obj["Key"]] = obj["Size"]
            return files
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def download_s3_folder(self, bucket_name, local_dir, prefix="", verbose=0):
        """
        Download all files from an S3 bucket prefix to a local directory.

        Files are written below ``local_dir/<stem(prefix)>`` when a prefix is
        given, or below ``local_dir/<bucket_name>`` when the whole bucket is
        downloaded. The key structure below the prefix is preserved.

        :param bucket_name: str. Name of the S3 bucket.
        :param local_dir: str. Local directory to which files will be downloaded.
        :param prefix: str or None. Prefix path of the folder in the bucket.
            If "" or None, the whole bucket is downloaded.
        :param verbose: bool/int. If truthy, print one line per downloaded file.
        """
        prefix = prefix or ""
        # Only a non-empty prefix gets a trailing slash; turning "" into "/"
        # would make the whole-bucket case list (almost always) nothing.
        if prefix and not prefix.endswith("/"):
            prefix += "/"

        if prefix == "":
            local_dir = os.path.join(local_dir, bucket_name)
        else:
            # `stem` comes from torch_snippets; for "test/test/" it yields "test".
            local_dir = os.path.join(local_dir, stem(prefix))
        os.makedirs(local_dir, exist_ok=True)

        paginator = self.s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if key.endswith("/"):
                    # Keys ending in "/" are folder placeholders, not files.
                    continue

                # Mirror the key's path below the prefix inside local_dir.
                local_file_path = os.path.join(local_dir, key[len(prefix):])
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                self.s3_client.download_file(bucket_name, key, local_file_path)
                if verbose:
                    print(f"Downloaded {key} to {local_file_path}")

    def download_s3_file(self, bucket_name, key, local_dir, metadata=False, verbose=0):
        """
        Download a specific file from an S3 bucket and optionally return its metadata.

        :param bucket_name: str. Name of the S3 bucket.
        :param key: str. The key of the file in the S3 bucket.
        :param local_dir: str. Local directory to which the file will be downloaded.
        :param metadata: bool. If True, return the ``head_object`` response for the key.
        :param verbose: bool. If truthy, print the download destination.
        :return: dict or None. The ``head_object`` response if metadata is True, otherwise None.
        """
        # The file keeps only its base name; the key's folders are not recreated.
        local_file_path = os.path.join(local_dir, os.path.basename(key))

        # An empty local_dir means the current directory, which needs no creation
        # (os.makedirs("") would raise).
        if local_dir:
            os.makedirs(local_dir, exist_ok=True)

        self.s3_client.download_file(bucket_name, key, local_file_path)
        if verbose:
            print(f"Downloaded {key} to {local_file_path}")

        if metadata:
            return self.s3_client.head_object(Bucket=bucket_name, Key=key)
        return None

    def upload_file_to_s3(self, bucket_name, localfile_path, s3_key, metadata=None):
        """
        Upload a file to an S3 bucket with optional metadata.

        Failures are reported with a printed message rather than raised.

        :param bucket_name: str. Name of the S3 bucket.
        :param localfile_path: str. Local path to the file to be uploaded.
        :param s3_key: str. S3 key (path within the bucket) where the file will be stored with file name included.
        :param metadata: dict or None. Optional metadata for the file. Defaults to None.
        """
        try:
            # Metadata is only sent when the caller supplied some.
            extra_args = {}
            if metadata:
                extra_args["Metadata"] = metadata

            with open(localfile_path, 'rb') as file_data:
                self.s3_client.upload_fileobj(
                    Fileobj=file_data,
                    Bucket=bucket_name,
                    Key=s3_key,
                    ExtraArgs=extra_args,
                )
            if metadata:
                print(f"File uploaded successfully to {bucket_name}/{s3_key} with metadata {metadata}")
            else:
                print(f"File uploaded successfully to {bucket_name}/{s3_key}")
        except Exception as e:
            print(f"Failed to upload file: {e}")

    def inmemory_download_s3(self, bucket_name, key):
        """
        Read an S3 object fully into memory without writing it to disk.

        :param bucket_name: str. Name of the S3 bucket.
        :param key: str. The key of the file in the S3 bucket.
        :return: the raw content obtained from ``response['Body'].read()``.
        """
        response = self.s3_client.get_object(Bucket=bucket_name, Key=key)
        return response['Body'].read()

In [5]:
mys3 = S3FileHandler(aws_access_key_id, aws_secret_access_key)

### List all Buckets
Lists all the S3 buckets accessible with the given credentials.

In [6]:
mys3.list_s3_buckets()

['buckettest0011',
 'candidate-proctoring',
 'sagemaker-ap-south-1-011528263565',
 'sagemaker-studio-011528263565-u1h3juay9nd',
 'sentiment-classification-fastapi']

### List all file objects
List all files in an S3 bucket or within a specific prefix of the given bucket along with the file size.

:param bucket_name: str. Name of the S3 bucket.  
:param key: str. Specific prefix to list files from; defaults to "" (lists the whole bucket).

In [7]:
mys3.list_s3_objects(bucket_name="buckettest0011")

{'test/test/line_profiling_results.txt': 921,
 'test/test/outer_function_profile.txt': 2845}

### S3 Folder Download
Download all files from an S3 bucket prefix to a local directory.

:param bucket_name: str. Name of the S3 bucket.  
:param local_dir: str. Local directory to which files will be downloaded.  
:param prefix: str or None. Prefix path of the folder in the bucket. If None, the whole bucket is downloaded.  
:param verbose: bool. Display the download status  

In [8]:
mys3.download_s3_folder(bucket_name="buckettest0011", local_dir='.', prefix="test/test", verbose=1)

Downloaded test/test/line_profiling_results.txt to ./test/line_profiling_results.txt
Downloaded test/test/outer_function_profile.txt to ./test/outer_function_profile.txt


### S3 File Download
Download a specific file from an S3 bucket and optionally return its metadata.

:param bucket_name: str. Name of the S3 bucket.  
:param key: str. The key of the file in the S3 bucket.  
:param local_dir: str. Local directory to which the file will be downloaded.  
:param metadata: bool. If True, return the file's metadata; otherwise, return None.  
:param verbose: bool.  
:return: dict or None. Returns metadata of the file if metadata is True, otherwise None.  

In [9]:
mys3.download_s3_file(bucket_name="buckettest0011", key="test/test/outer_function_profile.txt", local_dir=".", metadata=True)

{'ResponseMetadata': {'RequestId': '4RT5YGB089R8ER6Y',
  'HostId': 'JJCRUZzdH+CUZ5enf4O4r1O2oqr7QFgbmff21q7d8NEgeDDTFTjYl2kH75m3vLp5FaTeA3syDNl8G73FW52w8g==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JJCRUZzdH+CUZ5enf4O4r1O2oqr7QFgbmff21q7d8NEgeDDTFTjYl2kH75m3vLp5FaTeA3syDNl8G73FW52w8g==',
   'x-amz-request-id': '4RT5YGB089R8ER6Y',
   'date': 'Tue, 15 Oct 2024 11:47:02 GMT',
   'last-modified': 'Tue, 15 Oct 2024 09:40:40 GMT',
   'etag': '"7c49753bd7d2109ce96bd2568ad8fbef"',
   'x-amz-server-side-encryption': 'AES256',
   'x-amz-meta-author': 'XXXXX',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'server': 'AmazonS3',
   'content-length': '2845'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2024, 10, 15, 9, 40, 40, tzinfo=tzutc()),
 'ContentLength': 2845,
 'ETag': '"7c49753bd7d2109ce96bd2568ad8fbef"',
 'ContentType': 'binary/octet-stream',
 'ServerSideEncryption': 'AES256',
 'Metadata': {'author': 'XXXXX'}}

### Uploading file from local to s3 with/without metadata
Upload a file to an S3 bucket with optional metadata.

:param bucket_name: str. Name of the S3 bucket.  
:param localfile_path: str. Local path to the file to be uploaded.  
:param s3_key: str. S3 key (path within the bucket) where the file will be stored with file name included.  
:param metadata: dict or None. Optional metadata for the file. Defaults to None.

In [10]:
mys3.upload_file_to_s3(bucket_name="buckettest0011",
                       localfile_path="/home/user/Documents/line_profiling_results.txt",
                       s3_key="test/test/line_profiling_results.txt")

File uploaded successfully to buckettest0011/test/test/line_profiling_results.txt


In [11]:
# Re-upload the same file, this time attaching user-defined metadata.
# The local path uses the same placeholder home directory as the previous
# example instead of a personal username.
metadata = {"author": "xxxxx"}
mys3.upload_file_to_s3(bucket_name="buckettest0011",
                       localfile_path="/home/user/Documents/line_profiling_results.txt",
                       s3_key="test/test/line_profiling_results.txt",
                       metadata=metadata)

File uploaded successfully to buckettest0011/test/test/line_profiling_results.txt with metadata {'author': 'xxxxx'}


Now let's check whether the metadata is present by downloading the uploaded file.

In [12]:
mys3.download_s3_file(bucket_name="buckettest0011", key="test/test/line_profiling_results.txt", local_dir=".", metadata=True, verbose=1)

Downloaded test/test/line_profiling_results.txt to ./line_profiling_results.txt


{'ResponseMetadata': {'RequestId': '4RTFDPRWMCY0V3KB',
  'HostId': '7xhJWRbpiSDCoBpCusjp6HisKzqnC2ofYgK51LHD9lw+NYtromEd0wAipoM3qC8eXfdBmHKnOxSV8jkwz0yi1w==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '7xhJWRbpiSDCoBpCusjp6HisKzqnC2ofYgK51LHD9lw+NYtromEd0wAipoM3qC8eXfdBmHKnOxSV8jkwz0yi1w==',
   'x-amz-request-id': '4RTFDPRWMCY0V3KB',
   'date': 'Tue, 15 Oct 2024 11:47:02 GMT',
   'last-modified': 'Tue, 15 Oct 2024 11:47:02 GMT',
   'etag': '"5a627cd11fe9a0ec5877b4a4f0f33a62"',
   'x-amz-server-side-encryption': 'AES256',
   'x-amz-meta-author': 'xxxxx',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'server': 'AmazonS3',
   'content-length': '921'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2024, 10, 15, 11, 47, 2, tzinfo=tzutc()),
 'ContentLength': 921,
 'ETag': '"5a627cd11fe9a0ec5877b4a4f0f33a62"',
 'ContentType': 'binary/octet-stream',
 'ServerSideEncryption': 'AES256',
 'Metadata': {'author': 'xxxxx'}}