####  Run this cell to set up and start your interactive session.


In [0]:
# Session-configuration magics for the interactive session this cell starts.
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5
%region us-east-2

# Using the pre-generated standard imports for now. TODO: prune to the ones this notebook actually needs.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3


from pyspark.sql import SparkSession, functions as F, Window, types as T
  
# Reuse the SparkContext that is already running, or create one if none exists.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# NOTE(review): the Job object is created here, but no job.init()/job.commit()
# call is visible in this part of the notebook -- confirm whether it is needed.
job = Job(glueContext)


In [0]:
#setup environment
# Optional job parameters and the defaults used when a parameter is not
# supplied (for example when running in an interactive session).
ARG_DEFAULTS = {
    'bucket_name': 'rivkasfirstawsbucket',
    'reports_prefix': 'reports/',
    'bls_file_path': 'bls_data/raw/pr.data.0.Current',
    'population_file_path': 'population_data/raw/population_data.json',
}

# Resolve every parameter on its own. getResolvedOptions fails as soon as one
# requested option is missing, so asking for all four in a single call would
# throw away the parameters that *were* supplied whenever any one is absent.
args = {}
for arg_name in ARG_DEFAULTS:
    try:
        args[arg_name] = getResolvedOptions(sys.argv, [arg_name])[arg_name]
    except Exception:
        # Parameter not supplied: the default is applied below.
        # (Exception, not a bare except, so Ctrl-C / SystemExit still propagate.)
        pass

BUCKET_NAME = args.get('bucket_name', ARG_DEFAULTS['bucket_name'])
REPORTS_PREFIX = args.get('reports_prefix', ARG_DEFAULTS['reports_prefix'])
BLS_FILE = args.get('bls_file_path', ARG_DEFAULTS['bls_file_path'])
POPULATION_FILE = args.get('population_file_path', ARG_DEFAULTS['population_file_path'])
print(BUCKET_NAME, REPORTS_PREFIX, BLS_FILE, POPULATION_FILE)


s3 = boto3.client('s3')
# getOrCreate() returns the already-active session when one exists; the app
# name only takes effect if a new session has to be created.
spark = SparkSession.builder.appName("BLSAnalysis").getOrCreate()

In [0]:
# Load the tab-separated BLS file (header row, inferred column types).
bls_df = spark.read.csv(
    f"s3://{BUCKET_NAME}/{BLS_FILE}",
    sep="\t",
    header=True,
    inferSchema=True,
)

# Column names may carry padding whitespace; strip it from every name.
bls_df = bls_df.toDF(*(name.strip() for name in bls_df.columns))

# Trim the values of every string-typed column, leaving other columns as-is.
string_columns = [
    field.name
    for field in bls_df.schema.fields
    if isinstance(field.dataType, T.StringType)
]
print(string_columns)
bls_df = bls_df.select(
    *(
        F.trim(F.col(name)).alias(name) if name in string_columns else F.col(name)
        for name in bls_df.columns
    )
)

bls_df.printSchema()
bls_df.show()

In [0]:
#aggregate bls data
#may need to clean
# Total of "value" for every (series_id, year) pair.
sum_per_year = bls_df.groupBy("series_id", "year").agg(F.sum("value").alias("yearly_sum"))

# Rank the years of each series by their total, largest first. "year" is a
# secondary sort key so that ties resolve the same way on every run
# (row_number() over tied rows is otherwise arbitrary).
best_year_window = Window.partitionBy("series_id").orderBy(F.desc("yearly_sum"), F.asc("year"))

# Keep only the top-ranked year per series.
best_year = sum_per_year.withColumn("rank", F.row_number().over(best_year_window)) \
                                   .filter(F.col("rank") == 1) \
                                   .drop("rank")

best_year.show()
#todo: rework to create tables. Need to learn AWS lakehouse.
# Write under the configured REPORTS_PREFIX instead of a hard-coded "reports/";
# surrounding slashes are normalised so "reports", "reports/" and "" all work.
reports_dir = f"s3://{BUCKET_NAME}/{REPORTS_PREFIX.strip('/')}".rstrip("/")
best_year.toPandas().to_csv(f"{reports_dir}/best_year.csv", index=False)

In [0]:
# Load the population JSON document (a single multi-line JSON file).
population_raw = spark.read.option("multiline", True).json(
    f"s3://{BUCKET_NAME}/{POPULATION_FILE}"
)

# Turn each element of the top-level "data" array into its own row, then
# promote the fields of that element to top-level columns.
population_df = (
    population_raw
    .select(F.explode("data").alias("data"))
    .select("data.*")
)

population_df.printSchema()
population_df.show()


In [0]:
#filter population data and aggregate:
#may need to clean
# Inclusive year range the statistics are computed over.
POPULATION_FIRST_YEAR = 2013
POPULATION_LAST_YEAR = 2018

# Filter into a NEW DataFrame rather than overwriting population_df: the join
# cell further down reads population_df and must see every available year,
# not only this 2013-2018 window. It also keeps this cell idempotent.
population_2013_2018 = population_df.filter(
    F.col("Year").between(POPULATION_FIRST_YEAR, POPULATION_LAST_YEAR)
)

# Mean and sample standard deviation of the population over the window.
population_stats = population_2013_2018.agg(
    F.mean("Population").alias("mean_population"),
    F.stddev("Population").alias("std_population")
)

population_stats.show()

#todo: rework to create tables. Need to learn AWS lakehouse.
# Write under the configured REPORTS_PREFIX instead of a hard-coded "reports/";
# surrounding slashes are normalised so "reports", "reports/" and "" all work.
reports_dir = f"s3://{BUCKET_NAME}/{REPORTS_PREFIX.strip('/')}".rstrip("/")
population_stats.toPandas().to_csv(f"{reports_dir}/population_stats.csv", index=False)

In [0]:
#join the datasets and filter:
# Series and period the report is restricted to.
target_series = "PRS30006032"
target_period = "Q01"

# Rows of the target series/period only.
bls_target = bls_df.filter(
    (F.col("series_id") == target_series) & (F.col("period") == target_period)
)

# Left join on year: every selected BLS row is kept, and Population is null
# for years that have no match in population_df.
report = bls_target \
                .join(population_df, bls_df.year == population_df.Year, how="left")\
                .select(bls_df["*"], "Population")



report.show()

#todo: rework to create tables. Need to learn AWS lakehouse.
# Write under the configured REPORTS_PREFIX instead of a hard-coded "reports/";
# surrounding slashes are normalised so "reports", "reports/" and "" all work.
reports_dir = f"s3://{BUCKET_NAME}/{REPORTS_PREFIX.strip('/')}".rstrip("/")
report.toPandas().to_csv(f"{reports_dir}/bls_with_population.csv", index=False)