Importing Libraries

In [1]:
# Dat Data Quest Task - Part 3

# Import libraries
import pandas as pd
import boto3
import json
import io  # For reading files from S3

Configure S3 Access

In [2]:
# Configure S3 Access
S3_BUCKET = "arc-cloud-dq"  # Replace with your actual bucket name
S3_REGION = "us-east-1"  # Replace with your bucket's region

s3_client = boto3.client("s3", region_name=S3_REGION)


Loading Data from S3 into Pandas DataFrames

In [3]:
def load_csv_from_s3(s3_client, bucket_name, file_key):
    """
    Loads a CSV file from S3 into a Pandas DataFrame.
    Specify the tab separated files
    """
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        csv_data = response['Body'].read().decode('utf-8')
        df = pd.read_csv(io.StringIO(csv_data), sep='\t')  # Specify tab separator
        return df
    except Exception as e:
        print(f"Error loading CSV from S3: {e}")
        return None

def load_json_from_s3(s3_client, bucket_name, file_key):
    """Loads a JSON file from S3 into a Pandas DataFrame."""
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        json_data = response['Body'].read().decode('utf-8')
        data = json.loads(json_data)
        df = pd.DataFrame(data['data'])  # Assuming the data is under the 'data' key
        return df
    except Exception as e:
        print(f"Error loading JSON from S3: {e}")
        return None

# Load the BLS data (Part 1)
bls_df = load_csv_from_s3(s3_client, S3_BUCKET, "pr.data.0.Current")

# Load the population data (Part 2)
population_df = load_json_from_s3(s3_client, S3_BUCKET, "datausa_population.json")

# Display the first few rows of each DataFrame to verify loading
print("BLS Data:")
if bls_df is not None:
    print(bls_df.head())
    print ("BLS loaded")
    #Print columns to load accurately
    print("Columns in BLS data:", bls_df.columns)
else:
    print("BLS data failed to load")

print("\nPopulation Data:")
if population_df is not None:
    print(population_df.head())
    print ("Population data loaded")
    print("Columns in Population data:", population_df.columns)
else:
    print ("Population data is none")


BLS Data:
   series_id          year period         value footnote_codes
0  PRS30006011        1995    Q01           2.6            NaN
1  PRS30006011        1995    Q02           2.1            NaN
2  PRS30006011        1995    Q03           0.9            NaN
3  PRS30006011        1995    Q04           0.1            NaN
4  PRS30006011        1995    Q05           1.4            NaN
BLS loaded
Columns in BLS data: Index(['series_id        ', 'year', 'period', '       value',
       'footnote_codes'],
      dtype='object')

Population Data:
  ID Nation         Nation  ID Year  Year  Population    Slug Nation
0   01000US  United States     2022  2022   331097593  united-states
1   01000US  United States     2021  2021   329725481  united-states
2   01000US  United States     2020  2020   326569308  united-states
3   01000US  United States     2019  2019   324697795  united-states
4   01000US  United States     2018  2018   322903030  united-states
Population data loaded
Columns in Popu

Population Data Analysis

In [4]:
if population_df is not None:
    # Filter population data for years 2013 to 2018 (inclusive)
    population_df = population_df[(population_df['Year'] >= '2013') & (population_df['Year'] <= '2018')]

    # Convert 'Population' column to numeric (if it's not already)
    population_df['Population'] = pd.to_numeric(population_df['Population'])

    # Calculate the mean and standard deviation of the population
    population_mean = population_df['Population'].mean()
    population_std = population_df['Population'].std()

    print(f"Mean Population (2013-2018): {population_mean:,.2f}")
    print(f"Standard Deviation of Population (2013-2018): {population_std:,.2f}")
else:
    print("population data was not loaded")


Mean Population (2013-2018): 317,437,383.00
Standard Deviation of Population (2013-2018): 4,257,089.54


Time-Series Data Analysis

In [8]:
if bls_df is not None:
    # Ensure correct column name before proceeding
    if 'series_id' not in bls_df.columns or 'value' not in bls_df.columns or 'year' not in bls_df.columns:
        print("One or more required columns ('series_id', 'value', 'year') not found in BLS data. Skipping Time-Series Analysis.")
    else:
        # Convert 'value' column to numeric and handle errors
        bls_df['value'] = pd.to_numeric(bls_df['value'], errors='coerce')

        # Group by series_id and year, and sum the values
        grouped_bls = bls_df.groupby(['series_id', 'year'])['value'].sum().reset_index()

        # Find the best year for each series_id
        best_years = grouped_bls.loc[grouped_bls.groupby('series_id')['value'].idxmax()]

        # Generate the report
        report_df = best_years[['series_id', 'year', 'value']]
        report_df.rename(columns={'value': 'summed_value'}, inplace=True)

        print("\nBest Year Report:")
        print(report_df)
else:
    print ("bls_data is none")


One or more required columns ('series_id', 'value', 'year') not found in BLS data. Skipping Time-Series Analysis.


Combined Data Analysis

In [11]:
if bls_df is not None:
    # Ensure correct column name before proceeding
    if 'series_id' not in bls_df.columns or 'value' not in bls_df.columns or 'year' not in bls_df.columns:
        print("One or more required columns ('series_id', 'value', 'year') not found in BLS data. Skipping Time-Series Analysis.")
    else:
        # Convert 'value' column to numeric and handle errors
        bls_df['value'] = pd.to_numeric(bls_df['value'], errors='coerce')

        # Group by series_id and year, and sum the values
        grouped_bls = bls_df.groupby(['series_id', 'year'])['value'].sum().reset_index()

        # Find the best year for each series_id
        best_years = grouped_bls.loc[grouped_bls.groupby('series_id')['value'].idxmax()]

        # Generate the report
        report_df = best_years[['series_id', 'year', 'value']]
        report_df.rename(columns={'value': 'summed_value'}, inplace=True)

        print("\nBest Year Report:")
        print(report_df)
else:
    print ("bls_data is none")


One or more required columns ('series_id', 'value', 'year') not found in BLS data. Skipping Time-Series Analysis.


Find the Best Year for Each series_id Rearc requires:

For each series_id, find the year with the highest sum of "value".