In [None]:
import boto3
import uuid
import os
import pandas as pd
from datetime import datetime
from boto3.s3.transfer import TransferConfig, S3Transfer

# One S3 client is shared by every upload in this notebook.
_s3_client = boto3.client("s3")

# Multipart tuning: the same 8 MiB value is used both as the size threshold
# that switches an upload to multipart and as the size of each part; up to
# 10 transfer threads are allowed.
_PART_BYTES = 8 * 1024 * 1024
transfer_config = TransferConfig(
    multipart_threshold=_PART_BYTES,
    multipart_chunksize=_PART_BYTES,
    max_concurrency=10,
)

# Transfer helper used by prepare_and_upload() for its uploads.
_transfer = S3Transfer(_s3_client, transfer_config)

def _s3_key(prefix, filename):
    """Join an S3 key prefix and a file name with exactly one slash between them."""
    prefix = prefix.strip("/")
    return f"{prefix}/{filename}" if prefix else filename


def prepare_and_upload(csv_path, bucket, in_prefix, script_local, script_s3_prefix):
    """Convert a CSV to Parquet and upload it, together with a script, to S3.

    Parameters
    ----------
    csv_path : str
        Local path of the CSV file to convert.
    bucket : str
        Name of the destination S3 bucket (used for both uploads).
    in_prefix : str
        Key prefix under which the Parquet file is stored. Leading and
        trailing slashes are ignored.
    script_local : str
        Local path of the script to upload (e.g. ``classify_batch.py``).
    script_s3_prefix : str
        Key prefix under which the script is stored. Leading and trailing
        slashes are ignored.

    Returns
    -------
    tuple[str, str]
        ``(in_s3, script_s3)``: the ``s3://`` URIs of the uploaded Parquet
        file and of the uploaded script.

    Notes
    -----
    Errors from pandas (reading the CSV, writing Parquet) and from the S3
    transfer are not caught here and propagate to the caller.
    """
    # Convert the CSV into a uniquely named local Parquet staging file.
    df = pd.read_csv(csv_path)
    base = os.path.splitext(os.path.basename(csv_path))[0]
    local_parquet = f"/tmp/{base}-{uuid.uuid4()}.parquet"
    try:
        df.to_parquet(local_parquet, index=False)
        print(f"[{datetime.now()}] Wrote local parquet: {local_parquet}")

        # Upload the Parquet file through the shared transfer helper.
        pq_key = _s3_key(in_prefix, os.path.basename(local_parquet))
        _transfer.upload_file(local_parquet, bucket, pq_key)
    finally:
        # The local Parquet copy is only a staging artifact; remove it whether
        # or not the upload succeeded so repeated runs do not fill /tmp.
        if os.path.exists(local_parquet):
            os.remove(local_parquet)
    in_s3 = f"s3://{bucket}/{pq_key}"
    print(f"[{datetime.now()}] Parallel-uploaded Parquet to {in_s3}")

    # Upload the script that the Spark job will run.
    script_key = _s3_key(script_s3_prefix, os.path.basename(script_local))
    _transfer.upload_file(script_local, bucket, script_key)
    script_s3 = f"s3://{bucket}/{script_key}"
    print(f"[{datetime.now()}] Uploaded Spark script to {script_s3}")

    return in_s3, script_s3

# Example run: convert data_test.csv, upload it under incoming/parquet and
# upload classify_batch.py under scripts, keeping both resulting S3 URIs.
# NOTE(review): this uploads to bucket "realunique", while the step config
# built later in this notebook points at "realralph" -- confirm which bucket
# is intended.
upload_kwargs = {
    "csv_path": "data_test.csv",
    "bucket": "realunique",
    "in_prefix": "incoming/parquet",
    "script_local": "classify_batch.py",
    "script_s3_prefix": "scripts",
}
in_s3_path, script_s3_path = prepare_and_upload(**upload_kwargs)

[2025-05-27 18:21:36.198249] Wrote local parquet: /tmp/data_test-348a6954-1cf8-45af-a8c2-5e7f33691619.parquet
[2025-05-27 18:21:36.402930] Uploaded data to s3://realunique/incoming/parquet/data_test-348a6954-1cf8-45af-a8c2-5e7f33691619.parquet
[2025-05-27 18:21:36.474431] Uploaded Spark script to s3://realunique/scripts/classify_batch.py


In [None]:
# Build a CLI-compatible step definition and save it as step_config.json,
# the file later passed to `aws emr add-steps --steps file://...`.
import json, os
from datetime import datetime, timezone

# Bucket that holds the Spark script, the input Parquet files and the output.
# NOTE(review): the upload cell above writes to bucket "realunique", not
# "realralph" -- confirm which bucket this step should read from.
STEP_BUCKET = "realralph"

# datetime.utcnow() is deprecated; an aware UTC "now" yields the same
# YYYY-MM-DD string without the DeprecationWarning.
today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

# A single spark-submit step run through command-runner.jar. Output is written
# under a per-day prefix so each day's results land in their own folder.
step = [{
  "Name": "BiasClassification",
  "ActionOnFailure": "CONTINUE",
  "Jar": "command-runner.jar",
  "Args": [
    "spark-submit",
    "--deploy-mode", "cluster",
    "--master", "yarn",
    f"s3://{STEP_BUCKET}/scripts/classify_batch.py",
    "--input-path", f"s3://{STEP_BUCKET}/incoming/parquet/",
    "--output-path", f"s3://{STEP_BUCKET}/classified/parquet/{today}"
  ]
}]

with open("step_config.json","w") as f:
    json.dump(step, f, indent=2)

print("Wrote step_config.json:")
print(json.dumps(step, indent=2))

Wrote step_config.json:
[
  {
    "Name": "BiasClassification",
    "ActionOnFailure": "CONTINUE",
    "Jar": "command-runner.jar",
    "Args": [
      "spark-submit",
      "--deploy-mode",
      "cluster",
      "--master",
      "yarn",
      "s3://realralph/scripts/classify_batch.py",
      "--input-path",
      "s3://realralph/incoming/parquet/",
      "--output-path",
      "s3://realralph/classified/parquet/2025-05-27"
    ]
  }
]


  today = datetime.utcnow().strftime("%Y-%m-%d")


In [None]:
# submit that JSON to the EMR cluster for execution 
!aws emr add-steps \
  --cluster-id j-1IROT086IQ69Y \
  --steps file://step_config.json

{
    "StepIds": [
        "s-04481581N89XDKK3ZRW2"
    ]
}
