In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import uuid
import os
from datetime import datetime

# Build the Spark session used to upload data to S3 through the S3A connector.
#
# Ivy evicts an aws-java-sdk-bundle 1.12.256 pin in favour of 1.12.262 when it
# resolves hadoop-aws 3.3.4, so pin the version that actually ends up loaded.
HADOOP_AWS_VERSION = "3.3.4"
AWS_SDK_BUNDLE_VERSION = "1.12.262"

spark = (
    SparkSession.builder
        .appName("UploadViaSpark")
        # bring in the S3A connector JARs
        .config(
            "spark.jars.packages",
            f"org.apache.hadoop:hadoop-aws:{HADOOP_AWS_VERSION},"
            f"com.amazonaws:aws-java-sdk-bundle:{AWS_SDK_BUNDLE_VERSION}"
        )
        # resolve AWS credentials through the SDK's default provider chain
        # (no keys are hardcoded in the notebook)
        .config(
            "spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
        )
        # tuning for parallel/multipart: up to 10 S3 connections, 8 MiB parts,
        # and multipart upload kicks in at 8 MiB
        .config("spark.hadoop.fs.s3a.connection.maximum", "10")
        .config("spark.hadoop.fs.s3a.multipart.size",     str(8 * 1024**2))
        .config("spark.hadoop.fs.s3a.multipart.threshold",str(8 * 1024**2))
        .getOrCreate()
)


25/05/29 13:34:45 WARN Utils: Your hostname, Real-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.150.150.155 instead (on interface en0)
25/05/29 13:34:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/apple/.ivy2/cache
The jars for the packages stored in: /Users/apple/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0bef3968-be5e-4c63-8972-34e1a2c1a828;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/anaconda3/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 126ms :: artifacts dl 5ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	:: evicted modules:
	com.amazonaws#aws-java-sdk-bundle;1.12.256 by [com.amazonaws#aws-java-sdk-bundle;1.12.262] in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   4   |   0   |   0   |   1   ||   3   |   0   |
	-----------------------------------------------------

In [None]:
def prepare_and_upload(csv_path, bucket, in_prefix, script_local, script_s3_prefix):
    """Read a CSV with Spark and write it to S3 as Parquet.

    Uses the module-level ``spark`` session.

    Args:
        csv_path: Path of the CSV to read; its first row is treated as the header.
        bucket: Name of the destination S3 bucket.
        in_prefix: Key prefix under the bucket. Leading/trailing slashes are
            ignored, and an empty prefix writes directly under the bucket.
        script_local: Currently unused; kept so existing callers keep working.
        script_s3_prefix: Currently unused; kept so existing callers keep working.

    Returns:
        The ``s3a://`` URI the Parquet output was written to.
    """
    df = spark.read.option("header", True).csv(csv_path)
    base = os.path.splitext(os.path.basename(csv_path))[0]

    # A fresh UUID per call keeps repeated uploads of the same CSV from colliding.
    target_name = f"{base}-{uuid.uuid4()}.parquet"
    # Normalise the prefix so the URI never contains an empty "//" segment
    # (which would happen with "incoming/", "/incoming" or "").
    prefix = (in_prefix or "").strip("/")
    parquet_s3_path = "/".join(
        part for part in (f"s3a://{bucket}", prefix, target_name) if part
    )

    df.write \
      .mode("overwrite") \
      .parquet(parquet_s3_path)
    print(f"[{datetime.now()}] Wrote Parquet to {parquet_s3_path}")

    return parquet_s3_path

# ── Example usage ──
# Collect the call arguments in one place so they are easy to tweak, then
# convert the sample CSV and keep the resulting S3 location for later cells.
upload_args = {
    "csv_path": "../data/data_test.csv",
    "bucket": "realunique",
    "in_prefix": "incoming/parquet",
    "script_local": "classify_batch.py",
    "script_s3_prefix": "scripts",
}
in_s3_path = prepare_and_upload(**upload_args)

print("Done.", in_s3_path)

                                                                                

[2025-05-29 13:37:55.273511] Wrote Parquet to s3a://realunique/incoming/parquet/data_test-e92fa965-64ff-47fd-8f42-0b0c9ba5eff8.parquet
Done. s3a://realunique/incoming/parquet/data_test-e92fa965-64ff-47fd-8f42-0b0c9ba5eff8.parquet


In [None]:
# Build a CLI-compatible step definition (step_config.json) for `aws emr add-steps`.
import json, os
from datetime import datetime, timezone

# Single source of truth for the bucket used by every S3 URI in the step.
# NOTE(review): the upload example in this notebook writes to bucket
# "realunique", while this step reads from "realralph" — confirm which is intended.
STEP_BUCKET = "realralph"

# datetime.utcnow() is deprecated (it emits a DeprecationWarning on Python 3.12);
# a timezone-aware "now" in UTC produces the same YYYY-MM-DD string.
today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

# One spark-submit step run through EMR's command-runner; the output location
# is partitioned by the UTC date computed above.
step = [{
  "Name": "BiasClassification",
  "ActionOnFailure": "CONTINUE",
  "Jar": "command-runner.jar",
  "Args": [
    "spark-submit",
    "--deploy-mode", "cluster",
    "--master", "yarn",
    f"s3://{STEP_BUCKET}/scripts/classify_batch.py",
    "--input-path", f"s3://{STEP_BUCKET}/incoming/parquet/",
    "--output-path", f"s3://{STEP_BUCKET}/classified/parquet/{today}"
  ]
}]

with open("step_config.json", "w", encoding="utf-8") as f:
    json.dump(step, f, indent=2)

print("Wrote step_config.json:")
print(json.dumps(step, indent=2))

Wrote step_config.json:
[
  {
    "Name": "BiasClassification",
    "ActionOnFailure": "CONTINUE",
    "Jar": "command-runner.jar",
    "Args": [
      "spark-submit",
      "--deploy-mode",
      "cluster",
      "--master",
      "yarn",
      "s3://realralph/scripts/classify_batch.py",
      "--input-path",
      "s3://realralph/incoming/parquet/",
      "--output-path",
      "s3://realralph/classified/parquet/2025-05-27"
    ]
  }
]


  today = datetime.utcnow().strftime("%Y-%m-%d")


In [None]:
# submit that JSON to the EMR cluster for execution 
!aws emr add-steps \
  --cluster-id j-1IROT086IQ69Y \
  --steps file://step_config.json

{
    "StepIds": [
        "s-04481581N89XDKK3ZRW2"
    ]
}
