In [12]:
# Upload CSV files to S3
import boto3
import sagemaker
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialize SageMaker session and get role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [3]:
# Use the specific bucket instead of the default one
bucket = "usd-team1-ads508"  # Specific bucket name
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [4]:
# Display the bucket we're working with
print(f"Using S3 bucket: {bucket}")

Using S3 bucket: usd-team1-ads508


In [5]:
# verify we can access the bucket
!aws s3 ls s3://{bucket}/

                           PRE ab_data/
                           PRE jc_data/
                           PRE ld_data/


In [6]:
# Create a dedicated prefix for our data files
s3_prefix = ""
!aws s3api put-object --bucket {bucket} --key {s3_prefix}


usage: aws [options] <command> <subcommand> [<subcommand> ...] [parameters]
To see help text, you can run:

  aws help
  aws <command> help
  aws <command> <subcommand> help

aws: error: argument --key: expected one argument



In [7]:
# Set path to the local data directory
local_data_path = "../data/"  # notebook is in data_management folder

In [8]:
# Use os.walk to find all CSV files in the data directory and its subfolders
print("\nCSV files to be uploaded:")
csv_files = []

for root, dirs, files in os.walk(local_data_path):
    for file in files:
        if file.endswith(".csv"):
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, local_data_path)
            csv_files.append((file_path, relative_path))
            print(f"Found: {file_path}")

if not csv_files:
    print("No CSV files found in the data directory or its subfolders.")


CSV files to be uploaded:
Found: ../data/industry_mapping.csv
Found: ../data/industry_year_avg.csv
Found: ../data/nhis_industry_codes.csv
Found: ../data/ab_data/analytic_data2019.csv
Found: ../data/ab_data/analytic_data2020_0.csv
Found: ../data/ab_data/analytic_data2021.csv
Found: ../data/ab_data/analytic_data2022.csv
Found: ../data/ab_data/analytic_data2023_0.csv
Found: ../data/jc_data/employee_benefits_2019.csv
Found: ../data/jc_data/employee_benefits_2020.csv
Found: ../data/jc_data/employee_benefits_2021.csv
Found: ../data/jc_data/employee_benefits_2022.csv
Found: ../data/jc_data/employee_benefits_2023.csv
Found: ../data/ld_data/adult19.csv
Found: ../data/ld_data/adult20.csv
Found: ../data/ld_data/adult21.csv
Found: ../data/ld_data/adult22.csv
Found: ../data/ld_data/adult23.csv


In [9]:
# Upload all CSV files from the data directory and its subfolders to S3
print("\nUploading files to S3...")
for local_file_path, relative_path in csv_files:
    # Preserve the subfolder structure in S3
    s3_file_key = f"{s3_prefix}{relative_path}"
    
    # Upload without ACL since the bucket doesn't support it
    print(f"Uploading {local_file_path} to s3://{bucket}/{s3_file_key}")
    !aws s3 cp {local_file_path} s3://{bucket}/{s3_file_key}


Uploading files to S3...
Uploading ../data/industry_mapping.csv to s3://usd-team1-ads508/industry_mapping.csv
upload: ../data/industry_mapping.csv to s3://usd-team1-ads508/industry_mapping.csv
Uploading ../data/industry_year_avg.csv to s3://usd-team1-ads508/industry_year_avg.csv
upload: ../data/industry_year_avg.csv to s3://usd-team1-ads508/industry_year_avg.csv
Uploading ../data/nhis_industry_codes.csv to s3://usd-team1-ads508/nhis_industry_codes.csv
upload: ../data/nhis_industry_codes.csv to s3://usd-team1-ads508/nhis_industry_codes.csv
Uploading ../data/ab_data/analytic_data2019.csv to s3://usd-team1-ads508/ab_data/analytic_data2019.csv
upload: ../data/ab_data/analytic_data2019.csv to s3://usd-team1-ads508/ab_data/analytic_data2019.csv
Uploading ../data/ab_data/analytic_data2020_0.csv to s3://usd-team1-ads508/ab_data/analytic_data2020_0.csv
upload: ../data/ab_data/analytic_data2020_0.csv to s3://usd-team1-ads508/ab_data/analytic_data2020_0.csv
Uploading ../data/ab_data/analytic_dat

In [10]:
# Check source path
!aws s3 ls s3://usd-team1-ads508/

                           PRE ab_data/
                           PRE jc_data/
                           PRE ld_data/
2025-03-29 01:03:58       5204 industry_mapping.csv
2025-03-29 01:03:59       7558 industry_year_avg.csv
2025-03-29 01:04:00       5043 nhis_industry_codes.csv


In [11]:
# Check public bucket files
!aws s3 ls s3://usd-team1-ads508/ --recursive

2025-03-29 01:04:01    8235550 ab_data/analytic_data2019.csv
2025-03-29 01:04:02   12162745 ab_data/analytic_data2020_0.csv
2025-03-29 01:04:04   12478210 ab_data/analytic_data2021.csv
2025-03-29 01:04:05   12760127 ab_data/analytic_data2022.csv
2025-03-29 01:04:07   12525733 ab_data/analytic_data2023_0.csv
2025-03-29 01:03:58       5204 industry_mapping.csv
2025-03-29 01:03:59       7558 industry_year_avg.csv
2025-03-29 01:04:08   22577017 jc_data/employee_benefits_2019.csv
2025-03-29 01:04:10   19092391 jc_data/employee_benefits_2020.csv
2025-03-29 01:04:11   21645598 jc_data/employee_benefits_2021.csv
2025-03-29 01:04:12   21489356 jc_data/employee_benefits_2022.csv
2025-03-29 01:04:14   22037845 jc_data/employee_benefits_2023.csv
2025-03-29 01:04:15   26731512 ld_data/adult19.csv
2025-03-29 01:04:17   30421672 ld_data/adult20.csv
2025-03-29 01:04:18   27977846 ld_data/adult21.csv
2025-03-29 01:04:21   28090511 ld_data/adult22.csv
2025-03-29 01:04:23   29397605 ld_data/adult23.csv
2