In [12]:
# Upload CSV files to S3
import boto3
import sagemaker
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialize SageMaker session and get role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [3]:
# Use the specific bucket instead of the default one
bucket = "usd-team1-ads508"  # Specific bucket name
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [4]:
# Display the bucket we're working with
print(f"Using S3 bucket: {bucket}")

Using S3 bucket: usd-team1-ads508


In [5]:
# verify we can access the bucket
!aws s3 ls s3://{bucket}/

                           PRE ab_data/
                           PRE ld_data/


In [6]:
# Create a dedicated prefix for our data files
s3_prefix = ""
!aws s3api put-object --bucket {bucket} --key {s3_prefix}


usage: aws [options] <command> <subcommand> [<subcommand> ...] [parameters]
To see help text, you can run:

  aws help
  aws <command> help
  aws <command> <subcommand> help

aws: error: argument --key: expected one argument



In [7]:
# Set path to the local data directory
local_data_path = "../data/"  # notebook is in data_management folder

In [8]:
# Use os.walk to find all CSV files in the data directory and its subfolders
print("\nCSV files to be uploaded:")
csv_files = []

for root, dirs, files in os.walk(local_data_path):
    for file in files:
        if file.endswith(".csv"):
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, local_data_path)
            csv_files.append((file_path, relative_path))
            print(f"Found: {file_path}")

if not csv_files:
    print("No CSV files found in the data directory or its subfolders.")


CSV files to be uploaded:
Found: ../data/ld_data/adult23.csv
Found: ../data/ld_data/adult19.csv
Found: ../data/ld_data/adult22.csv
Found: ../data/ld_data/adult20.csv
Found: ../data/ld_data/adult21.csv
Found: ../data/ab_data/analytic_data2019.csv
Found: ../data/ab_data/analytic_data2020_0.csv
Found: ../data/ab_data/analytic_data2021.csv
Found: ../data/ab_data/analytic_data2022.csv
Found: ../data/ab_data/analytic_data2023_0.csv
Found: ../data/jc_data/employee_benefits_2019_2023.csv


In [9]:
# Upload all CSV files from the data directory and its subfolders to S3
print("\nUploading files to S3...")
for local_file_path, relative_path in csv_files:
    # Preserve the subfolder structure in S3
    s3_file_key = f"{s3_prefix}{relative_path}"
    
    # Upload without ACL since the bucket doesn't support it
    print(f"Uploading {local_file_path} to s3://{bucket}/{s3_file_key}")
    !aws s3 cp {local_file_path} s3://{bucket}/{s3_file_key}


Uploading files to S3...
Uploading ../data/ld_data/adult23.csv to s3://usd-team1-ads508/ld_data/adult23.csv
upload: ../data/ld_data/adult23.csv to s3://usd-team1-ads508/ld_data/adult23.csv
Uploading ../data/ld_data/adult19.csv to s3://usd-team1-ads508/ld_data/adult19.csv
upload: ../data/ld_data/adult19.csv to s3://usd-team1-ads508/ld_data/adult19.csv
Uploading ../data/ld_data/adult22.csv to s3://usd-team1-ads508/ld_data/adult22.csv
upload: ../data/ld_data/adult22.csv to s3://usd-team1-ads508/ld_data/adult22.csv
Uploading ../data/ld_data/adult20.csv to s3://usd-team1-ads508/ld_data/adult20.csv
upload: ../data/ld_data/adult20.csv to s3://usd-team1-ads508/ld_data/adult20.csv
Uploading ../data/ld_data/adult21.csv to s3://usd-team1-ads508/ld_data/adult21.csv
upload: ../data/ld_data/adult21.csv to s3://usd-team1-ads508/ld_data/adult21.csv
Uploading ../data/ab_data/analytic_data2019.csv to s3://usd-team1-ads508/ab_data/analytic_data2019.csv
upload: ../data/ab_data/analytic_data2019.csv to s3

In [10]:
# Check source path
!aws s3 ls s3://usd-team1-ads508/

                           PRE ab_data/
                           PRE jc_data/
                           PRE ld_data/


In [11]:
# Check public bucket files
!aws s3 ls s3://usd-team1-ads508/ --recursive

2025-03-20 22:43:00    8238746 ab_data/analytic_data2019.csv
2025-03-20 22:43:02   12165941 ab_data/analytic_data2020_0.csv
2025-03-20 22:43:03   12481406 ab_data/analytic_data2021.csv
2025-03-20 22:43:04   12763323 ab_data/analytic_data2022.csv
2025-03-20 22:43:05   12528929 ab_data/analytic_data2023_0.csv
2025-03-20 22:43:07   55574528 jc_data/employee_benefits_2019_2023.csv
2025-03-20 22:42:54   26731512 ld_data/adult19.csv
2025-03-20 22:42:57   30421672 ld_data/adult20.csv
2025-03-20 22:42:59   27977846 ld_data/adult21.csv
2025-03-20 22:42:56   28090511 ld_data/adult22.csv
2025-03-20 22:42:53   29397605 ld_data/adult23.csv
