In [10]:
import pandas as pd
import requests as r
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
from matplotlib.pyplot import xlabel


# Data Collection
## CPI Data Import

In [11]:
url = 'https://api.bls.gov/publicAPI/v2/timeseries/data/'

data = {
    "seriesid": ["CUSR0000SAF"],
    "startyear": "2005",
    "endyear": "2024",
    "catalog": True,
    "calculations": False,
    "annualaverage": False,
    "aspects": False,
    "registrationkey": "5974f3d1e0ac4efdad23d2e1b3e4b4e3"
}

html = r.post(url, json=data)
html = html.json()
data = html['Results']['series'][0]['data']
CPI_data = pd.DataFrame(data, columns=['year', 'periodName', 'value'])
month_to_num = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

CPI_data.periodName = CPI_data.periodName.apply(lambda x: month_to_num[x])
CPI_data.rename(columns={'year': 'year', 'periodName': 'month', 'value': 'CPI'}, inplace=True)
CPI_data['date'] = pd.to_datetime(CPI_data[['year', 'month']].assign(day=1))
CPI_data.sort_values(by='date', ascending=True, inplace=True)
CPI_data['CPI'] = pd.to_numeric(CPI_data['CPI'], errors='coerce')

In [12]:
# Calculate percentage rate of change of CPI
def rate_of_change(data):
    rate = np.zeros(len(data))
    for time_frame in range(1, len(data)):
        tmp = float(data.iloc[time_frame, 2])
        prev = float(data.iloc[time_frame - 1, 2])
        rate[time_frame] = (tmp - prev) * 100 / (tmp + prev)
    return rate

CPI_data['rate_of_change'] = rate_of_change(CPI_data)

In [13]:
fig = px.line(CPI_data, x='date', y='CPI', title='Timeseries of CPI Monthly', labels={'date': 'Year', 'rate_of_change': 'Rate of Change (%)'})

fig.write_image('/Users/eddie/cs163/src/CPI.png')
fig.show()

In [14]:
fig = px.line(CPI_data, x='date', y='rate_of_change', title='Rate of Change of CPI Monthly', labels={'date': 'Year', 'rate_of_change': 'Rate of Change (%)'})
fig.write_image('/Users/eddie/cs163/src/CPI ROC.png')
fig.show()

In [15]:
CPI_data.describe()

Unnamed: 0,month,CPI,date,rate_of_change
count,236.0,236.0,236,236.0
mean,6.432203,245.285195,2014-10-16 07:43:43.728813568,0.116584
min,1.0,189.1,2005-01-01 00:00:00,-0.187437
25%,3.0,219.07375,2009-11-23 12:00:00,0.041235
50%,6.0,244.803,2014-10-16 12:00:00,0.089714
75%,9.0,258.70325,2019-09-08 12:00:00,0.160337
max,12.0,327.849,2024-08-01 00:00:00,0.64804
std,3.445574,35.922101,,0.127634


## Food Price Collection

In [16]:
df_food = pd.read_csv('ap.data.3.Food', sep='\t', low_memory=False)
columns = []
for col in df_food.columns:
    columns.append(col.strip())
    
df_food = df_food.apply(lambda x: x.str.rstrip() if x.dtype == "object" else x)
df_food.columns = columns

In [17]:
df_food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145331 entries, 0 to 145330
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   series_id       145331 non-null  object 
 1   year            145331 non-null  int64  
 2   period          145331 non-null  object 
 3   value           145331 non-null  object 
 4   footnote_codes  0 non-null       float64
dtypes: float64(1), int64(1), object(3)
memory usage: 5.5+ MB


In [18]:
df_food.iloc[:, 0].nunique()

672

In [19]:
food_items_series = df_food['series_id'].unique().tolist()
url = 'https://api.bls.gov/publicAPI/v2/timeseries/data/'
series = []
step = 48
for page in range(1, int(len(food_items_series) / step)):
        data1 = {
            "seriesid": food_items_series[page * step : (page + 1) * step],
            "startyear": "2005",
            "endyear": "2024",
            "catalog": True,
            "calculations": False,
            "annualaverage": False,
            "aspects": False,
            "registrationkey": "5974f3d1e0ac4efdad23d2e1b3e4b4e3"
        }
        html = r.post(url, json=data1)
        html = html.json()
        series.append(html['Results']['series'])

In [20]:
item_list = []
for item in series[0]:
    item_name = item['catalog']['measure_data_type']
    for info in item['data']:
        year = info['year']
        month = info['periodName']
        value = info['value']
        entry = {'item': item_name, 'year': year, 'month': month, 'value': value}
        item_list.append(entry)
df_item = pd.DataFrame(item_list)
df_item.month = df_item.month.apply(lambda x: month_to_num[x])
df_item['date'] = pd.to_datetime(df_item[['year', 'month']].assign(day=1))
df_item['value'] = pd.to_numeric(df_item['value'], errors='coerce').fillna(0)
df_item.drop(columns=['year', 'month'], inplace=True)
df_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5010 entries, 0 to 5009
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   item    5010 non-null   object        
 1   value   5010 non-null   float64       
 2   date    5010 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 117.6+ KB


In [21]:
df_item.describe()

Unnamed: 0,value,date
count,5010.0,5010
mean,2.139684,2013-11-06 12:59:47.065868288
min,0.0,2005-01-01 00:00:00
25%,1.259,2009-03-01 00:00:00
50%,1.7135,2013-06-01 00:00:00
75%,2.70475,2018-04-01 00:00:00
max,6.357,2024-08-01 00:00:00
std,1.294897,


In [22]:
entry_availability = df_item.groupby('date')['item'].count()

fig = px.bar(entry_availability, title='Data Availability at different time periods ', labels={'date': 'Date', 'value': 'Availability'})
fig.write_image('/Users/eddie/cs163/src/Item Price Availability.png')
fig.show()


## Personal Consumption Expenditures 

In [23]:
from datetime import datetime
url = 'https://apps.bea.gov/api/data?&UserID=6E42AF4F-01C3-4ADB-9244-2DD195EFF66E&method=GetData&DataSetName=NIPA&TableName=T20805&Frequency=M&Year=2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024&ResultFormat=JSON'

html = r.get(url)
html = html.json()
food_pce = []
for table in html['BEAAPI']["Results"]["Data"]:
    if "Food" in table['LineDescription'] :
        food_pce.append(table)

df_pce = pd.DataFrame(food_pce, columns=['LineDescription', 'TimePeriod', 'DataValue'])
time_data = []
for time in df_pce['TimePeriod']:
    tmp = time.split('M')
    year = int(tmp[0])
    month = int(tmp[1])
    time_data.append(datetime(year, month, 1))

df_pce['TimePeriod'] = time_data
df_pce['DataValue'] = pd.to_numeric(df_pce['DataValue'].str.replace(',',''), errors='coerce')

In [24]:
df_pce.describe()

Unnamed: 0,TimePeriod,DataValue
count,470,470.0
mean,2014-10-01 02:27:03.829787136,905334.3
min,2005-01-01 00:00:00,497149.0
25%,2009-11-08 12:00:00,712393.8
50%,2014-10-01 00:00:00,855214.5
75%,2019-08-24 06:00:00,1040535.0
max,2024-07-01 00:00:00,1487246.0
std,,252604.3


In [25]:
df_pce.sort_values(by='TimePeriod', ascending=True, inplace=True)
fig = px.line(df_pce, x='TimePeriod', y='DataValue', color='LineDescription')
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))
fig.write_image('/Users/eddie/cs163/src/BEA PCE.png')
fig.show()