In [70]:
import json
import os

import boto3
import pandas as pd

In [23]:
# 0. Load the data
# Download the two raw input files from the S3 scratch bucket into ./tmp.
# The target directory is created first: download_file writes to the given
# local path and fails if its parent directory is missing, so on a fresh
# checkout this cell would otherwise error out before any data is loaded.
os.makedirs('./tmp', exist_ok=True)

s3 = boto3.client('s3')
s3.download_file('noventa-scratch-bucket', 'census_data.json', './tmp/census_data.json')
s3.download_file('noventa-scratch-bucket', 'productivity_cost/2024-06-06/pr.data.0.Current', './tmp/pr.data.0.Current')


In [25]:
# 0. (continued) Load the data into DataFrames
# Census data: parse the downloaded JSON document and build a DataFrame from
# the records stored under its top-level 'data' key.
with open('./tmp/census_data.json') as census_file:
    data = json.load(census_file)

data_to_convert = data['data']
census_df = pd.DataFrame(data_to_convert)

# PR data: load the downloaded tab-separated file into a DataFrame.
pr_df = pd.read_csv('./tmp/pr.data.0.Current', sep='\t')

Unnamed: 0,ID Nation,Nation,ID Year,Year,Population,Slug Nation
0,01000US,United States,2022,2022,331097593,united-states
1,01000US,United States,2021,2021,329725481,united-states
2,01000US,United States,2020,2020,326569308,united-states
3,01000US,United States,2019,2019,324697795,united-states
4,01000US,United States,2018,2018,322903030,united-states
5,01000US,United States,2017,2017,321004407,united-states
6,01000US,United States,2016,2016,318558162,united-states
7,01000US,United States,2015,2015,316515021,united-states
8,01000US,United States,2014,2014,314107084,united-states
9,01000US,United States,2013,2013,311536594,united-states


Unnamed: 0,series_id,year,period,value,footnote_codes
0,PRS30006011,1995,Q01,2.600,
1,PRS30006011,1995,Q02,2.100,
2,PRS30006011,1995,Q03,0.900,
3,PRS30006011,1995,Q04,0.100,
4,PRS30006011,1995,Q05,1.400,
...,...,...,...,...,...
35812,PRS88003203,2023,Q02,116.953,
35813,PRS88003203,2023,Q03,116.928,
35814,PRS88003203,2023,Q04,115.824,R
35815,PRS88003203,2023,Q05,116.472,R


In [40]:
# 1. What is the average population between 2013 and 2018?
# 'Year' is converted to datetime in place so it can be compared against the
# two Timestamp bounds below; both bounds are inclusive.
census_df['Year'] = pd.to_datetime(census_df['Year'], format='%Y')
start_year = pd.to_datetime('2013', format='%Y')
end_year = pd.to_datetime('2018', format='%Y')

# Boolean mask selecting the rows whose year falls inside [2013, 2018].
in_window = (census_df['Year'] >= start_year) & (census_df['Year'] <= end_year)
filtered_census_df = census_df[in_window]

# Mean and standard deviation of the population over the selected years
# (Series.std() uses pandas' default, i.e. the sample standard deviation).
window_population = filtered_census_df['Population']
population_mean = window_population.mean()
population_std = window_population.std()

print('Mean:', population_mean)
print('Standard Deviation:', population_std)

Mean: 317437383.0
Standard Deviation: 4257089.5415293295


In [66]:
# 2. For each series_id, find its "best year": the year whose values, summed
#    over all period rows of that year, give the largest total.
# NOTE(review): the sum covers every period present for a year (the data shows
# Q01..Q05) -- confirm that Q05 is meant to be part of the total.

# Normalise the column names by stripping surrounding whitespace.
pr_df.columns = pr_df.columns.str.strip()

# One row per (series_id, year) holding the summed value.
grouped = (
    pr_df
    .groupby(['series_id', 'year'])['value']
    .sum()
    .reset_index()
)

# For each series, idxmax gives the row label of its largest yearly total;
# .loc then pulls exactly those rows out of the yearly table.
best_row_labels = grouped.groupby('series_id')['value'].idxmax()
best_years = grouped.loc[best_row_labels]

print(best_years.to_string())

        series_id       year    value
27    PRS30006011 2022-01-01   20.500
57    PRS30006012 2022-01-01   17.100
63    PRS30006013 1998-01-01  704.125
105   PRS30006021 2010-01-01   17.600
135   PRS30006022 2010-01-01   12.500
169   PRS30006023 2014-01-01  503.171
207   PRS30006031 2022-01-01   20.400
236   PRS30006032 2021-01-01   17.100
243   PRS30006033 1998-01-01  700.712
297   PRS30006061 2022-01-01   38.900
327   PRS30006062 2022-01-01   31.700
358   PRS30006063 2023-01-01  631.806
386   PRS30006081 2021-01-01   23.400
413   PRS30006082 2021-01-01   23.400
440   PRS30006083 2021-01-01  112.459
448   PRS30006091 2002-01-01   43.300
478   PRS30006092 2002-01-01   44.400
517   PRS30006093 2011-01-01  520.088
556   PRS30006101 2020-01-01   33.500
586   PRS30006102 2020-01-01   36.000
619   PRS30006103 2023-01-01  622.072
646   PRS30006111 2020-01-01   31.500
664   PRS30006112 2008-01-01   42.400
709   PRS30006113 2023-01-01  633.119
737   PRS30006131 2021-01-01   17.700
764   PRS300

In [69]:
# 3. Report the value of the series with the series_id 'PRS30006032' and period 'Q01' for each year,
#    together with that year's population where the census data has one.

# Clean the data: strip surrounding whitespace from the column names and from
# every string (object-dtype) column, so the equality filters below compare
# against clean values.
pr_df.columns = pr_df.columns.str.strip()
census_df.columns = census_df.columns.str.strip()
pr_df = pr_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
census_df = census_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Put BOTH join keys on the same datetime dtype inside this cell.
# Previously only pr_df['year'] was converted here and the merge relied on an
# earlier cell having already converted census_df['Year']; without that, the
# merge fails because pandas refuses to join a datetime key to an object key.
# Converting a column that is already datetime leaves it unchanged, so this
# cell is safe to re-run.
pr_df['year'] = pd.to_datetime(pr_df['year'], format='%Y')
census_df['Year'] = pd.to_datetime(census_df['Year'], format='%Y')

# Keep only the requested series and period.
filtered_pr_df = pr_df[(pr_df['series_id'] == 'PRS30006032') & (pr_df['period'] == 'Q01')]

# Left join: every PR row is kept; years without a census row get NaN
# in 'Population'.
merged_df = pd.merge(filtered_pr_df, census_df, left_on='year', right_on="Year", how='left')
report_df = merged_df[['series_id', 'year', 'period', 'value', 'Population']]

print(report_df.to_string())


      series_id       year period  value   Population
0   PRS30006032 1995-01-01    Q01    0.0          NaN
1   PRS30006032 1996-01-01    Q01   -4.4          NaN
2   PRS30006032 1997-01-01    Q01    2.7          NaN
3   PRS30006032 1998-01-01    Q01    1.0          NaN
4   PRS30006032 1999-01-01    Q01   -4.1          NaN
5   PRS30006032 2000-01-01    Q01    0.5          NaN
6   PRS30006032 2001-01-01    Q01   -6.5          NaN
7   PRS30006032 2002-01-01    Q01   -6.7          NaN
8   PRS30006032 2003-01-01    Q01   -5.6          NaN
9   PRS30006032 2004-01-01    Q01    2.1          NaN
10  PRS30006032 2005-01-01    Q01   -0.6          NaN
11  PRS30006032 2006-01-01    Q01    1.8          NaN
12  PRS30006032 2007-01-01    Q01   -0.7          NaN
13  PRS30006032 2008-01-01    Q01   -3.4          NaN
14  PRS30006032 2009-01-01    Q01  -21.0          NaN
15  PRS30006032 2010-01-01    Q01    3.4          NaN
16  PRS30006032 2011-01-01    Q01    1.7          NaN
17  PRS30006032 2012-01-01  