#### Imports

In [69]:
import json
import os
from io import StringIO

import boto3
import pandas as pd
from botocore.exceptions import ClientError, NoCredentialsError
from dotenv import find_dotenv, load_dotenv

#### Load environment variables

In [70]:
# Resolve the .env path first, then load it. The load_dotenv(...) call stays
# the last expression so its return value is what the cell displays.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

#### Create S3 Client

In [71]:
S3_BUCKET = "data"  # Replace with your S3 bucket name

# Every connection setting is read from an environment variable. The region
# falls back to "us-east-1" when AWS_REGION is unset or empty.
s3_client_options = {
    "endpoint_url": os.getenv("MINIO_ENDPOINT"),
    "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
    "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    "region_name": os.getenv("AWS_REGION") or "us-east-1",
}
s3 = boto3.client("s3", **s3_client_options)

## Population Data

#### Fetch the Population data from S3

In [72]:
POPULATION_OBJ_KEY = "population/population_data.json"  # Replace with your S3 object key

# Download the population object from S3, decode it as UTF-8 and parse the
# JSON into `population_json`.
#
# On failure a short message is printed and the exception is re-raised.
# Re-raising (rather than calling exit(1)) matters in a notebook: under
# IPython/Jupyter `exit` is the shell's own exit helper, not `sys.exit`, so
# the argument is not an exit status and the original traceback is lost.
# A re-raise stops "Run All" at this cell and keeps the full error visible.
try:
    response = s3.get_object(Bucket=S3_BUCKET, Key=POPULATION_OBJ_KEY)
    body = response["Body"]
    try:
        content = body.read().decode("utf-8")
    finally:
        # Release the underlying HTTP connection once the payload is read.
        body.close()
    population_json = json.loads(content)
except NoCredentialsError:
    print("Error: AWS credentials not found.")
    raise
except ClientError as e:
    print(f"Error fetching file from S3: {e}")
    raise

#### Load the JSON data into a pandas DataFrame and compute the statistics

In [73]:
# Build the frame from the "data" records and convert Population to a
# numeric dtype in the same expression.
population_df = pd.DataFrame(population_json["data"]).assign(
    Population=lambda frame: pd.to_numeric(frame["Population"])
)

# Keep only the years 2013 through 2018; `between` includes both endpoints.
in_year_window = population_df["Year"].between(2013, 2018)
df_filtered = population_df[in_year_window]

# Mean and standard deviation of the population over the selected years,
# using Series.mean() / Series.std() with their default arguments.
population_in_window = df_filtered["Population"]
mean_population = population_in_window.mean()
std_population = population_in_window.std()

#### Display the calculated results

In [74]:
# Assemble the three report lines first, then emit them with a single print.
# Both figures are formatted with thousands separators and no decimals.
summary_lines = [
    "US Population Statistics (2013-2018):",
    f"Mean population: {mean_population:,.0f}",
    f"Standard deviation: {std_population:,.0f}",
]
print("\n".join(summary_lines))

US Population Statistics (2013-2018):
Mean population: 322,069,808
Standard deviation: 4,158,441


## Series Data

In [75]:
# Object key of the time-series file, named to match how the population
# cell declares POPULATION_OBJ_KEY.
SERIES_OBJ_KEY = "pub/time.series/pr/pr.data.0.Current"

# Download the series file from S3 and decode it as UTF-8 text into
# `content` for the parsing cell that follows.
#
# On failure a short message is printed and the exception is re-raised.
# Re-raising (rather than calling exit(1)) matters in a notebook: under
# IPython/Jupyter `exit` is the shell's own exit helper, not `sys.exit`, so
# the argument is not an exit status and the original traceback is lost.
# A re-raise stops "Run All" at this cell and keeps the full error visible.
try:
    response = s3.get_object(Bucket=S3_BUCKET, Key=SERIES_OBJ_KEY)
    body = response["Body"]
    try:
        content = body.read().decode("utf-8")
    finally:
        # Release the underlying HTTP connection once the payload is read.
        body.close()
except NoCredentialsError:
    print("Error: AWS credentials not found.")
    raise
except ClientError as e:
    print(f"Error fetching file from S3: {e}")
    raise

In [76]:
# Parse the tab-separated text held in `content`, then trim the whitespace
# surrounding each column name.
series_buffer = StringIO(content)
df = pd.read_csv(series_buffer, sep="\t")
df.columns = df.columns.map(str.strip)

In [77]:
# Normalise the two columns the aggregation relies on: trim whitespace from
# series_id, and force value to numeric (unparseable entries become NaN).
df["series_id"] = df["series_id"].str.strip()
df["value"] = pd.to_numeric(df["value"], errors="coerce")

# Total value for every (series_id, year) pair, returned as flat columns.
yearly_sum = (
    df.groupby(["series_id", "year"])["value"]
    .sum()
    .reset_index()
)

# For each series_id, find the row label of its largest yearly total and
# pull those rows out as the report.
top_row_labels = yearly_sum.groupby("series_id")["value"].idxmax()
best_year_report = yearly_sum.loc[top_row_labels].reset_index(drop=True)

In [78]:
# Inner join of the series rows with the population rows, matching the
# series "year" column against the population "Year" column. Rows without a
# matching year on the other side are dropped by the inner join.
joined_df = pd.merge(
    df,
    population_df,
    how="inner",
    left_on="year",
    right_on="Year",
)

In [79]:
# Select the rows for series PRS30006032 in period Q01, then display only
# the columns of interest as the cell's output.
is_target_series = joined_df["series_id"].eq("PRS30006032")
is_first_quarter = joined_df["period"].eq("Q01")
results = joined_df.loc[is_target_series & is_first_quarter]
results.loc[:, ["series_id", "year", "period", "value", "Population"]]

Unnamed: 0,series_id,year,period,value,Population
350,PRS30006032,2013,Q01,0.5,316128839.0
355,PRS30006032,2014,Q01,-0.1,318857056.0
360,PRS30006032,2015,Q01,-1.7,321418821.0
365,PRS30006032,2016,Q01,-1.4,323127515.0
370,PRS30006032,2017,Q01,0.9,325719178.0
375,PRS30006032,2018,Q01,0.5,327167439.0
380,PRS30006032,2019,Q01,-1.6,328239523.0
385,PRS30006032,2021,Q01,0.7,331893745.0
390,PRS30006032,2022,Q01,5.3,333287562.0
395,PRS30006032,2023,Q01,0.3,334914896.0
