####  Run this cell to set up and start your interactive session.


In [25]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5
%region us-east-2

#using pre-generated standard imports; will hopefully learn which are actually needed in AWS later
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3


from pyspark.sql import SparkSession, functions as F, Window, types as T
  
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext; used by the cells below to read data.
spark = glueContext.spark_session
# Glue Job handle bound to this context (not referenced again in the cells visible here).
job = Job(glueContext)


You are already connected to a glueetl session 24aadab4-fba2-4e5c-864b-d79a2a21e07d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 2880 minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session 24aadab4-fba2-4e5c-864b-d79a2a21e07d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 5.0


You are already connected to a glueetl session 24aadab4-fba2-4e5c-864b-d79a2a21e07d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: G.1X
Setting new worker type to: G.1X


You are already connected to a glueetl session 24aadab4-fba2-4e5c-864b-d79a2a21e07d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: 5
Setting new number of workers to: 5


You are already connected to a glueetl session 24aadab4-fba2-4e5c-864b-d79a2a21e07d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous region: us-east-2
Setting new region to: us-east-2
Region is set to: us-east-2



In [3]:
#setup environment
# Job parameters are optional. When the notebook is run interactively none of
# them are on sys.argv, so argument resolution fails and the defaults below are
# used instead. Only ordinary exceptions are caught (a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit), and the reason is reported rather than
# silently discarded.
try:
    args = getResolvedOptions(sys.argv, ['bucket_name', 'reports_prefix', 'bls_file_path', 'population_file_path'])
except Exception as exc:
    print(f"Job arguments not resolved ({exc!r}); falling back to defaults.")
    args = {}

# Bucket and object keys used by the rest of the notebook.
BUCKET_NAME = args.get('bucket_name', 'rivkasfirstawsbucket')
REPORTS_PREFIX = args.get('reports_prefix', 'reports/')
BLS_FILE = args.get('bls_file_path', 'bls_data/raw/pr.data.0.Current')
POPULATION_FILE = args.get('population_file_path', 'population_data/raw/population_data.json')
print(BUCKET_NAME, REPORTS_PREFIX, BLS_FILE, POPULATION_FILE)


s3 = boto3.client('s3')
# getOrCreate() hands back the session that is already running if there is one,
# so this rebinds `spark` rather than starting a second session.
spark = SparkSession.builder.appName("BLSAnalysis").getOrCreate()

rivkasfirstawsbucket reports/ bls_data/raw/pr.data.0.Current population_data/raw/population_data.json


In [27]:
# Read bls
# Tab-separated file with a header row; column types are inferred by Spark.
bls_df = (
    spark.read
    .option("header", True)
    .option("sep", "\t")
    .option("inferSchema", True)
    .csv(f"s3://{BUCKET_NAME}/{BLS_FILE}")
)

# Header names carry padding whitespace, so strip them before use.
bls_df = bls_df.toDF(*(name.strip() for name in bls_df.columns))

#trim string columns
string_columns = [
    field.name
    for field in bls_df.schema.fields
    if isinstance(field.dataType, T.StringType)
]
print(string_columns)
# One projection: string columns are trimmed in place, all others pass through
# unchanged, and the column order is kept.
bls_df = bls_df.select(*[
    F.trim(F.col(name)).alias(name) if name in string_columns else F.col(name)
    for name in bls_df.columns
])

bls_df.printSchema()
bls_df.show()

['series_id', 'period', 'footnote_codes']
root
 |-- series_id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- period: string (nullable = true)
 |-- value: double (nullable = true)
 |-- footnote_codes: string (nullable = true)

+-----------+----+------+-----+--------------+
|  series_id|year|period|value|footnote_codes|
+-----------+----+------+-----+--------------+
|PRS30006011|1995|   Q01|  2.6|          NULL|
|PRS30006011|1995|   Q02|  2.1|          NULL|
|PRS30006011|1995|   Q03|  0.9|          NULL|
|PRS30006011|1995|   Q04|  0.1|          NULL|
|PRS30006011|1995|   Q05|  1.4|          NULL|
|PRS30006011|1996|   Q01| -0.2|          NULL|
|PRS30006011|1996|   Q02| -0.3|          NULL|
|PRS30006011|1996|   Q03| -0.1|          NULL|
|PRS30006011|1996|   Q04|  0.2|          NULL|
|PRS30006011|1996|   Q05| -0.1|          NULL|
|PRS30006011|1997|   Q01|  0.3|          NULL|
|PRS30006011|1997|   Q02|  0.7|          NULL|
|PRS30006011|1997|   Q03|  1.0|          NULL|
|

In [19]:
#aggregate bls data
#may need to clean
# Total of `value` for every (series_id, year) pair.
sum_per_year = bls_df.groupBy("series_id", "year").agg(F.sum("value").alias("yearly_sum"))

# Rank the years of each series by descending yearly_sum. `year` is a secondary
# sort key so that a tie on yearly_sum always resolves to the same (earliest)
# year instead of an arbitrary one.
best_year_window = Window.partitionBy("series_id").orderBy(F.desc("yearly_sum"), F.asc("year"))

# Keep only the top-ranked year per series; sort so the report rows have a
# stable order.
best_year = (
    sum_per_year
    .withColumn("rank", F.row_number().over(best_year_window))
    .filter(F.col("rank") == 1)
    .drop("rank")
    .orderBy("series_id")
)

best_year.show()
#todo: rework to create tables. Need to learn AWS lakehouse.
# Write under the configured reports prefix (tolerating a missing or extra
# trailing slash) instead of a hard-coded "reports/" folder.
best_year.toPandas().to_csv(f"s3://{BUCKET_NAME}/{REPORTS_PREFIX.rstrip('/')}/best_year.csv", index=False)

+-----------------+----+------------------+
|        series_id|year|        yearly_sum|
+-----------------+----+------------------+
|PRS30006011      |2022|              20.5|
|PRS30006012      |2022|              17.1|
|PRS30006013      |1998|           705.895|
|PRS30006021      |2010|              17.7|
|PRS30006022      |2010|12.399999999999999|
|PRS30006023      |2014|503.21600000000007|
|PRS30006031      |2022|              20.5|
|PRS30006032      |2021|              17.1|
|PRS30006033      |1998|           702.672|
|PRS30006061      |2022|              37.0|
|PRS30006062      |2021|              31.6|
|PRS30006063      |2024|           646.748|
|PRS30006081      |2021|              24.4|
|PRS30006082      |2021|              24.4|
|PRS30006083      |2021|           110.742|
|PRS30006091      |2002|              43.3|
|PRS30006092      |2002| 44.39999999999999|
|PRS30006093      |2013| 514.1560000000001|
|PRS30006101      |2020|              33.5|
|PRS30006102      |2020|        

In [8]:
# Read the population JSON document (multi-line JSON), then turn each element
# of its "data" array into a row and promote that element's fields to columns.
population_df = (
    spark.read
    .option("multiline", True)
    .json(f"s3://{BUCKET_NAME}/{POPULATION_FILE}")
    .select(F.explode("data").alias("data"))
    .select("data.*")
)

population_df.printSchema()
population_df.show()


root
 |-- Nation: string (nullable = true)
 |-- Nation ID: string (nullable = true)
 |-- Population: double (nullable = true)
 |-- Year: long (nullable = true)

+-------------+---------+------------+----+
|       Nation|Nation ID|  Population|Year|
+-------------+---------+------------+----+
|United States|  01000US|3.16128839E8|2013|
|United States|  01000US|3.18857056E8|2014|
|United States|  01000US|3.21418821E8|2015|
|United States|  01000US|3.23127515E8|2016|
|United States|  01000US|3.25719178E8|2017|
|United States|  01000US|3.27167439E8|2018|
|United States|  01000US|3.28239523E8|2019|
|United States|  01000US|3.31893745E8|2021|
|United States|  01000US|3.33287562E8|2022|
|United States|  01000US|3.34914896E8|2023|
+-------------+---------+------------+----+

huh?


In [10]:
#filter population data and aggregate:
#may need to clean
# Keep the 2013-2018 (inclusive) subset under its own name. `population_df` is
# left untouched so that cells which use it afterwards still see every year.
population_2013_2018 = population_df.filter(F.col("Year").between(2013, 2018))

# Mean and standard deviation of Population over the filtered years.
# F.stddev is the sample standard deviation (alias of stddev_samp).
population_stats = population_2013_2018.agg(
    F.mean("Population").alias("mean_population"),
    F.stddev("Population").alias("std_population")
)

population_stats.show()

#todo: rework to create tables. Need to learn AWS lakehouse.
# Write under the configured reports prefix (tolerating a missing or extra
# trailing slash) instead of a hard-coded "reports/" folder.
population_stats.toPandas().to_csv(f"s3://{BUCKET_NAME}/{REPORTS_PREFIX.rstrip('/')}/population_stats.csv", index=False)

+---------------+-----------------+
|mean_population|   std_population|
+---------------+-----------------+
|   3.22069808E8|4158441.040908092|
+---------------+-----------------+


In [28]:
#join the datasets and filter:
target_series = "PRS30006032"
target_period = "Q01"

# BLS rows for the one series/period of interest.
bls_target = bls_df.filter((F.col("series_id") == target_series) & (F.col("period") == target_period))

# Project the population side down to the join key and the value, renaming the
# key so the joined frame does not hold both `year` and `Year` (ambiguous under
# Spark's default case-insensitive column resolution).
population_by_year = population_df.select(F.col("Year").alias("population_year"), "Population")

# Left join: every BLS row is kept, and Population is NULL for any year that has
# no row in population_df. Sorted by year so the report has a stable row order.
report = (
    bls_target
    .join(population_by_year, bls_target["year"] == population_by_year["population_year"], how="left")
    .select(bls_target["*"], "Population")
    .orderBy("year")
)

report.show()

#todo: rework to create tables. Need to learn AWS lakehouse.
# Write under the configured reports prefix (tolerating a missing or extra
# trailing slash) instead of a hard-coded "reports/" folder.
report.toPandas().to_csv(f"s3://{BUCKET_NAME}/{REPORTS_PREFIX.rstrip('/')}/bls_with_population.csv", index=False)

+-----------+----+------+-----+--------------+------------+
|  series_id|year|period|value|footnote_codes|  Population|
+-----------+----+------+-----+--------------+------------+
|PRS30006032|1995|   Q01|  0.0|          NULL|        NULL|
|PRS30006032|1996|   Q01| -4.2|          NULL|        NULL|
|PRS30006032|1997|   Q01|  2.8|          NULL|        NULL|
|PRS30006032|1998|   Q01|  0.9|          NULL|        NULL|
|PRS30006032|1999|   Q01| -4.1|          NULL|        NULL|
|PRS30006032|2000|   Q01|  0.5|          NULL|        NULL|
|PRS30006032|2001|   Q01| -6.3|          NULL|        NULL|
|PRS30006032|2002|   Q01| -6.6|          NULL|        NULL|
|PRS30006032|2003|   Q01| -5.7|          NULL|        NULL|
|PRS30006032|2004|   Q01|  2.0|          NULL|        NULL|
|PRS30006032|2005|   Q01| -0.5|          NULL|        NULL|
|PRS30006032|2006|   Q01|  1.8|          NULL|        NULL|
|PRS30006032|2007|   Q01| -0.8|          NULL|        NULL|
|PRS30006032|2008|   Q01| -3.5|         