# Load Modules

In [None]:
# load modules
import os
import re
import itertools
import Haver
import calendar
import blpapi
# import urllib.request
import numpy as np
import pandas as pd
import seaborn as sns
import pyarrow as pa
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas.io.formats.excel
from tqdm.notebook import tqdm
# from openpyxl import load_workbook
from datetime import datetime
from dateutil import relativedelta
from sklearn.neighbors import KernelDensity
from scipy.stats import norminvgauss, wald
from xbbg import blp

# Clear the ExcelFormatter header style so DataFrame.to_excel writes header cells
# without pandas' built-in header formatting.
# NOTE(review): relies on pandas exposing ExcelFormatter.header_style at this path — confirm on upgrade.
pd.io.formats.excel.ExcelFormatter.header_style = None
# Keyword arguments that look intended for pd.ExcelWriter (append to an existing
# workbook with openpyxl, overlaying existing sheets).
# NOTE(review): writer_spec is not referenced anywhere in the visible part of this file.
writer_spec = {
    "engine":"openpyxl",
    "mode":"a",
    "if_sheet_exists":"overlay"
}

def using_clump(a):
    """Split ``a`` into its contiguous runs of valid values.

    Entries that ``np.ma.masked_invalid`` masks (NaN / inf) act as separators.
    Returns a list with one slice of ``a`` per unmasked run, in original order;
    the list is empty when every entry is invalid.
    """
    masked = np.ma.masked_invalid(a)
    valid_runs = np.ma.clump_unmasked(masked)
    pieces = []
    for run in valid_runs:
        pieces.append(a[run])
    return pieces

# Build a Bloomberg API session object; it is passed to the blp.bdh calls below via **kwargs.
# NOTE(review): the application identity key is hard-coded in source — consider loading it
# from an environment variable or a secrets store so it is not committed with the notebook.
sessionOptions = blpapi.SessionOptions()
sessionOptions.setApplicationIdentityKey("52e02b77-2aa7-4412-b718-804f115a0546")
session = blpapi.Session(sessionOptions)
# NOTE(review): session.start() is never called in the visible code — confirm that the
# consumer of 'sess' starts the session itself.
kwargs = {'sess':session}

# Historical

In [None]:
# Item master table: one row per CPI item (ItemCode, Name, Cat, Type, base-year flags, weights, ...).
itemList = pd.read_excel('./cpi_itemList.xlsx', sheet_name='itemList')
# {column name: {ItemCode: value}} lookup built from the same table.
itemList_dict = itemList.set_index('ItemCode').to_dict()
startMonth = datetime(1970,1,1)
# endMonth = datetime(2025,3,1)
today = datetime.today()

# Nationwide data, so the sample ends at the first day of the previous month.
# (endMonth keeps today's time-of-day; only its year and month are used below.)
endMonth = today - relativedelta.relativedelta(months=1)
endMonth = endMonth.replace(day=1)

# Number of months from startMonth to endMonth inclusive.
nhor = (endMonth.year - startMonth.year)*12 + endMonth.month - startMonth.month + 1
# All-NaN template (months x items) that the later cells copy and fill.
darray = np.full((nhor, len(itemList)), fill_value=np.nan)
temp_df = pd.DataFrame(data=darray, columns=itemList['ItemCode'])
# Index: first-of-month timestamps starting at startMonth.
temp_df.index = [startMonth + relativedelta.relativedelta(months=x) for x in range(nhor)]

In [None]:
# Load every sheet of each workbook into a dict keyed by sheet name (sheet_name=None).
# skiprows=[0,1,3,4,5] leaves the third spreadsheet row as the header row —
# presumably the ItemCode row, since columns are later looked up by ItemCode; confirm against the files.
all_dfs = pd.read_excel('./all_data.xlsx', engine="openpyxl", sheet_name=None, index_col=0, skiprows=[0,1,3,4,5])
# Month-specific item weights; the weight cell reads sheets '05mwght', '10mwght', '15mwght', '20mwght'.
itemMonthWeights = pd.read_excel('./monthWeights.xlsx', engine="openpyxl", sheet_name=None)
# NOTE(review): yoy_adjustments is re-read from the same file with header=[0,1,2,3] in the
# special-factor cell before it is used anywhere, so this read looks redundant.
yoy_adjustments = pd.read_excel('./cpi_special_factor.xlsx', engine="openpyxl", sheet_name=None, index_col=0, skiprows=[0,1,3,4,5])

In [None]:
# yoy
# Build the item-level YoY panel: each item takes its series from the newest
# base-year sheet it is flagged in (2020, then 2015, then 2010, otherwise 2005).
df_yoy = temp_df.copy()
for ix, code in enumerate(itemList['ItemCode']):
    if itemList['in2020'][ix] == 1:
        df_yoy[code] = all_dfs['20yoy'][code]
    elif itemList['in2015'][ix] == 1:
        df_yoy[code] = all_dfs['15yoy'][code]
    elif itemList['in2010'][ix] == 1:
        df_yoy[code] = all_dfs['10yoy'][code]
    else:
        df_yoy[code] = all_dfs['05yoy'][code]
# Replace the flat ItemCode header with a 4-level (ItemCode, Name, Cat, Type) header.
df_yoy.columns = pd.MultiIndex.from_arrays(itemList[['ItemCode','Name','Cat','Type']].T.values)

# interpolate seasonal survey items (yoy)
# Only columns that have more than one run of valid values (i.e. gaps between runs)
# and whose Cat level is not 1 are processed.
for col in df_yoy.columns:
    test_array = df_yoy[col].values
    list_by_nan = using_clump(test_array)
    if len(list_by_nan) > 1 and col[2] != 1:
        df_intplt = df_yoy[[col]].copy()
        # Mean of each run of valid values, in order of appearance.
        avg_list = [x.mean() for x in list_by_nan]
        df_intplt['NAflag'] = df_intplt[col].isna()*1
        df_intplt['L_NAflag'] = df_intplt['NAflag'].shift(1)
        # Despite the name, NA_end marks the FIRST row of a NaN run (NaN now, valid on the previous row).
        # The first row never qualifies because its lagged flag is NaN, so a leading NaN run is not counted.
        df_intplt['NA_end'] = [1 if x==1 and y==0 else 0 for x,y in zip(df_intplt['NAflag'],df_intplt['L_NAflag'])]
        # Running number of the NaN run on NaN rows, 0 on valid rows and on a leading NaN run.
        df_intplt['NA_cum'] = df_intplt['NA_end'].cumsum()*df_intplt['NAflag']

        # fillflag == 1 marks the dates on which a gap may be filled. The cutoff comes from the
        # first matching base-year flag, checked from oldest (in2005) to newest (in2020).
        df_intplt['fillflag'] = 0
        if itemList_dict['in2005'][col[0]] == 1:
            df_intplt['fillflag'] = [1 if y < datetime(2011,1,1) else x for x,y in zip(df_intplt['fillflag'],df_intplt.index)]
        elif itemList_dict['in2005r'][col[0]] == 1:
            df_intplt['fillflag'] = [1 if y < datetime(2011,1,1) else x for x,y in zip(df_intplt['fillflag'],df_intplt.index)]
        elif itemList_dict['in2010'][col[0]] == 1:
            df_intplt['fillflag'] = [1 if y < datetime(2016,1,1) else x for x,y in zip(df_intplt['fillflag'],df_intplt.index)]
        elif itemList_dict['in2015'][col[0]] == 1:
            df_intplt['fillflag'] = [1 if y < datetime(2021,1,1) else x for x,y in zip(df_intplt['fillflag'],df_intplt.index)]
        elif itemList_dict['in2020'][col[0]] == 1:
            # NOTE(review): hard-coded end of the 2020-base window; revisit when the next base year arrives.
            df_intplt['fillflag'] = [1 if y < datetime(2026,1,1) else x for x,y in zip(df_intplt['fillflag'],df_intplt.index)]

        # Valid rows keep their value; rows in the k-th counted NaN run get the mean of the k-th
        # valid run (the run just before the gap) when filling is allowed, otherwise stay NaN.
        df_intplt['newcol'] = [x if y==0 else avg_list[y-1] if z==1 else np.nan for x,y,z in zip(df_intplt[col],df_intplt['NA_cum'],df_intplt['fillflag'])]
        df_yoy[col] = df_intplt['newcol']

In [None]:
# index
# Build the item-level index panel: each item takes its series from the newest
# base-year sheet it is flagged in (2020, then 2015, then 2010, otherwise 2005).
df_index = temp_df.copy()
for ix, code in enumerate(itemList['ItemCode']):
    if itemList['in2020'][ix] == 1:
        df_index[code] = all_dfs['20index'][code]
    elif itemList['in2015'][ix] == 1:
        df_index[code] = all_dfs['15index'][code]
    elif itemList['in2010'][ix] == 1:
        df_index[code] = all_dfs['10index'][code]
    else:
        df_index[code] = all_dfs['05index'][code]
# Replace the flat ItemCode header with a 4-level (ItemCode, Name, Cat, Type) header.
df_index.columns = pd.MultiIndex.from_arrays(itemList[['ItemCode','Name','Cat','Type']].T.values)

# interpolate seasonal survey items (index)
# Same gap detection as the YoY cell, but without a fill window: every counted NaN run
# (including a trailing one) is filled.
for col in df_index.columns:
    test_array = df_index[col].values
    list_by_nan = using_clump(test_array)
    if len(list_by_nan) > 1 and col[2] != 1:
        df_intplt = df_index[[col]].copy()
        # Mean of each run of valid values, in order of appearance.
        avg_list = [x.mean() for x in list_by_nan]
        df_intplt['NAflag'] = df_intplt[col].isna()*1
        df_intplt['L_NAflag'] = df_intplt['NAflag'].shift(1)
        # Despite the name, NA_end marks the FIRST row of a NaN run; a NaN run starting on the
        # very first row is not counted because its lagged flag is NaN.
        df_intplt['NA_end'] = [1 if x==1 and y==0 else 0 for x,y in zip(df_intplt['NAflag'],df_intplt['L_NAflag'])]
        # Running number of the NaN run on NaN rows, 0 elsewhere.
        df_intplt['NA_cum'] = df_intplt['NA_end'].cumsum()*df_intplt['NAflag']
        # Rows in the k-th counted NaN run receive the mean of the k-th valid run.
        df_intplt['newcol'] = [x if y==0 else avg_list[y-1] for x,y in zip(df_intplt[col], df_intplt['NA_cum'])]
        df_index[col] = df_intplt['newcol']

In [None]:
# define a function to set weight
# {weight column: {ItemCode: fixed weight}} for each base period.
itemFixWeights = itemList[['ItemCode','weight20','weight15','weight10','weight05r','weight05']].set_index('ItemCode').to_dict()

# Weight regimes, oldest first: (last date covered, monthly-weight sheet, fixed-weight column).
# A cutoff of None marks the open-ended latest regime.
_YOY_WEIGHT_REGIMES = (
    (datetime(2008, 12, 31), '05mwght', 'weight05'),
    (datetime(2010, 12, 31), '05mwght', 'weight05r'),
    (datetime(2015, 12, 31), '10mwght', 'weight10'),
    (datetime(2020, 12, 31), '15mwght', 'weight15'),
    (None, '20mwght', 'weight20'),
)
# The index regimes switch one year earlier than the YoY regimes.
_INDEX_WEIGHT_REGIMES = (
    (datetime(2007, 12, 31), '05mwght', 'weight05'),
    (datetime(2009, 12, 31), '05mwght', 'weight05r'),
    (datetime(2014, 12, 31), '10mwght', 'weight10'),
    (datetime(2019, 12, 31), '15mwght', 'weight15'),
    (None, '20mwght', 'weight20'),
)


def _select_weight_regime(idx, regimes):
    """Return (monthly sheet name, fixed-weight column) of the first regime covering ``idx``."""
    for cutoff, month_sheet, fix_key in regimes:
        if cutoff is None or idx <= cutoff:
            return month_sheet, fix_key


def set_weight(i, idx, col, itemFixWeights=itemFixWeights, itemMonthWeights=itemMonthWeights, df_yoy=df_yoy, df_index=df_index, type='yoy'):
    """Return the aggregation weight of one item for one month.

    Parameters
    ----------
    i : int
        Row position of the month inside ``df_yoy`` / ``df_index``.
    idx : datetime-like
        Date of that row; selects the weight regime and the calendar month.
    col : tuple
        Column key ``(ItemCode, Name, Cat, Type)``; only the first two entries are used.
    itemFixWeights : dict
        ``{weight column: {ItemCode: weight}}`` of fixed weights.
    itemMonthWeights : dict
        ``{sheet name: DataFrame}`` of month-specific weights with one column per item Name;
        row label ``month - 1`` is read.
    df_yoy, df_index : DataFrame
        Item panels used only to check whether the observation is missing.
    type : str
        ``'yoy'`` uses the YoY panel and YoY regime dates; any other value uses the index
        panel and index regime dates. (The name shadows the builtin but is kept because it
        is part of the call signature.)

    Returns
    -------
    The weight, or ``None`` when the observation at row ``i`` is missing. A month-specific
    weight takes precedence over the fixed weight whenever the item has a column in the
    regime's monthly sheet.
    """
    itemCode, itemName = col[0], col[1]
    if type == 'yoy':
        series, regimes = df_yoy[col], _YOY_WEIGHT_REGIMES
    else:
        series, regimes = df_index[col], _INDEX_WEIGHT_REGIMES

    # Test only the requested cell; computing isna() over the whole column for every
    # single cell made the weight loops quadratic in the number of rows.
    if pd.isna(series.iloc[i]):
        return None

    month_sheet, fix_key = _select_weight_regime(idx, regimes)
    monthly = itemMonthWeights[month_sheet]
    if itemName in monthly.columns:
        return monthly[itemName][idx.month-1]
    return itemFixWeights[fix_key][itemCode]


# weight (yoy)
# Month x item panel of weights matching df_yoy; cells where the YoY value is missing get None/NaN.
df_weight_yoy = temp_df.copy()
df_weight_yoy.columns = pd.MultiIndex.from_arrays(itemList[['ItemCode','Name','Cat','Type']].T.values)
for col in tqdm(df_weight_yoy.columns):
    df_weight_yoy[col] = [set_weight(i, idx, col) for i, idx in enumerate(df_weight_yoy.index)]

# weight (index)
# Same construction for the index panel, using the index regime dates (type='index').
df_weight_index = temp_df.copy()
df_weight_index.columns = pd.MultiIndex.from_arrays(itemList[['ItemCode','Name','Cat','Type']].T.values)
for col in tqdm(df_weight_index.columns):
    df_weight_index[col] = [set_weight(i, idx, col, type='index') for i, idx in enumerate(df_weight_index.index)]

In [None]:
# yoy excluding special factors (VAT, energy, education, travel)
# Every sheet of the special-factor workbook is read with a 4-row header and added
# element-wise to the item-level YoY panel (missing adjustments count as 0).
yoy_adjustments = pd.read_excel('./cpi_special_factor.xlsx', engine="openpyxl", sheet_name=None, index_col=0, header=[0,1,2,3])
array_yoy_adj = df_yoy.values
for key in yoy_adjustments.keys():
    adj_array = yoy_adjustments[key].values
    adj_array = np.nan_to_num(adj_array)
    # Purely positional addition: assumes each sheet has the same row/column order and
    # shape as df_yoy — TODO confirm against the workbook.
    array_yoy_adj = array_yoy_adj + adj_array

df_yoy_adj = pd.DataFrame(
    data=array_yoy_adj,
    columns=df_yoy.columns,
    index=df_yoy.index
)

In [None]:
# save
# Write the five item-level panels to one workbook, one sheet per panel
# (sheet names are the dict keys and are read back by name in the next cell).
dfs = {
    'yoy':df_yoy,
    'index':df_index,
    'yoy_weight':df_weight_yoy,
    'index_weight':df_weight_index,
    'yoy_adj':df_yoy_adj
}
with pd.ExcelWriter('./cpi_all.xlsx') as writer:
    for df_name, df in dfs.items():
        df.to_excel(writer, sheet_name=df_name)

# CPI (special factors adjusted)

In [None]:
# Reload the saved panels. skiprows=[0,2,3,4] leaves the second spreadsheet row as the header —
# presumably the 'Name' level of the saved 4-level header, since the melted column is named
# 'Name' below; confirm against cpi_all.xlsx.
cpi_dfs = pd.read_excel('./cpi_all.xlsx', engine="openpyxl", sheet_name=None, index_col=0, skiprows=[0,2,3,4])
df_cpi_all = pd.DataFrame()
# Melt each sheet to long format (Date, Name, value) and collect the value columns side by side.
# NOTE: temp_df is rebound here and no longer refers to the empty template from the Historical cell.
for ix, key in enumerate(cpi_dfs.keys()):
    temp_df = cpi_dfs[key].reset_index().melt(id_vars='index')
    temp_df.columns = ['Date','Name',key]

    if ix == 0:
        df_cpi_all = temp_df.copy()
    else:
        # Row-aligned assignment: assumes every sheet melts to the same (Date, Name) row order.
        df_cpi_all[key] = temp_df[key]

# Item attributes and aggregate-membership flags attached to every (Date, Name) row.
meta_cols = [
    'ItemCode',
    'Name',
    'Cat',
    'Type',
    'Overall',
    'Core',
    'BoJCore',
    'WesternCore',
    'CPIExImpRents',
    'CPIExImpRents&FF',
    'GoodsExFFE',
    'GenSerExRents',
    'Ser',
    'Foods',
    'ImpRents',
    'Energy',
    'FreshFoods',
    'freq_cat',
]
# NOTE(review): the join key is 'Name'; duplicate names in itemList would duplicate rows — confirm uniqueness.
df_cpi_all = df_cpi_all.merge(itemList[meta_cols], on='Name', how='left')
# Two-way split derived from freq_cat: 'High' -> 'Flexible', anything else -> 'Sticky'.
df_cpi_all['Atlanta'] = ['Flexible' if x == 'High' else 'Sticky' for x in df_cpi_all['freq_cat']]
df_cpi_all.to_parquet('./df_cpi_all.parquet', compression='zstd')
df_cpi_all.to_csv('./df_cpi_all.csv', encoding='shift-jis')

In [None]:
# calculate yoy
# Every aggregate below is a weighted average: sum(yoy_adj * weight) / sum(weight) per Date.
df_cpi_all['yoy:weight'] = df_cpi_all['yoy_adj']*df_cpi_all['yoy_weight']
cpi_yoy_df = pd.DataFrame(index=cpi_dfs['yoy'].index)

# Overall/Core/BoJ core/CPI ex imputed rents and FF
# Each name is a 0/1 membership flag column; only items flagged 1 enter that aggregate.
cal_cols = [
    'Overall',
    'Core',
    'BoJCore',
    'WesternCore',
    'CPIExImpRents',
    'CPIExImpRents&FF',
    'GoodsExFFE',
    'GenSerExRents',
    'Ser',
    'Foods',
    'ImpRents',
    'Energy',
    'FreshFoods',
]

for col in cal_cols:
    temp_df = df_cpi_all.copy()[df_cpi_all[col] == 1]
    temp_agg_yoy = temp_df.groupby(['Date'])[['yoy_weight','yoy:weight']].sum()
    temp_agg_yoy[col] = temp_agg_yoy['yoy:weight']/temp_agg_yoy['yoy_weight']
    cpi_yoy_df[col] = temp_agg_yoy[col]

# Drop months in which none of the aggregates could be computed.
cpi_yoy_df.dropna(inplace=True, how='all')

# aggregate by category
# One output column per distinct value of each category column.
cat_cols = ['Type','Cat','freq_cat','Atlanta']
for cat_col in cat_cols:
    temp_df = df_cpi_all.groupby(['Date',cat_col])[['yoy_weight','yoy:weight']].sum()
    temp_df['yoy'] = temp_df['yoy:weight']/temp_df['yoy_weight']
    temp_df = pd.pivot_table(temp_df.reset_index(), values='yoy', index='Date', columns=cat_col)
    cpi_yoy_df = pd.concat([cpi_yoy_df, temp_df], axis=1)

# calculate (Western) core Flexible and Sticky
temp_df = df_cpi_all[df_cpi_all['WesternCore']==1]
temp_df = temp_df.groupby(['Date','Atlanta'])[['yoy_weight','yoy:weight']].sum()
temp_df['yoy'] = temp_df['yoy:weight']/temp_df['yoy_weight']
temp_df = pd.pivot_table(temp_df.reset_index(), values='yoy', index='Date', columns='Atlanta')
temp_df.columns = [f'WesternCore {x}' for x in temp_df.columns]
cpi_yoy_df = pd.concat([cpi_yoy_df, temp_df], axis=1)

# calculate BoJ core Flexible and Sticky
temp_df = df_cpi_all[df_cpi_all['BoJCore']==1]
temp_df = temp_df.groupby(['Date','Atlanta'])[['yoy_weight','yoy:weight']].sum()
temp_df['yoy'] = temp_df['yoy:weight']/temp_df['yoy_weight']
temp_df = pd.pivot_table(temp_df.reset_index(), values='yoy', index='Date', columns='Atlanta')
temp_df.columns = [f'BoJCore {x}' for x in temp_df.columns]
cpi_yoy_df = pd.concat([cpi_yoy_df, temp_df], axis=1)

# calculate flexible and sticky inflation by type
# Columns are flattened to '<Atlanta>:<Type>'.
temp_df = df_cpi_all.groupby(['Date','Atlanta','Type'])[['yoy_weight','yoy:weight']].sum()
temp_df['yoy'] = temp_df['yoy:weight']/temp_df['yoy_weight']
temp_df = pd.pivot_table(temp_df.reset_index(), values='yoy', index='Date', columns=['Atlanta','Type'])
temp_df.columns = [f'{x[0]}:{x[1]}' for x in temp_df.columns]
cpi_yoy_df = pd.concat([cpi_yoy_df, temp_df], axis=1)

# Output file name means "CPI YoY excluding special factors".
cpi_yoy_df.to_csv('./特殊要因除くCPI前年比.csv')


In [None]:
# calculate m/m
# first create item-level index with base year of 2008 (since there's no yoy adjustment in 2018)
# use 1980
# NOTE(review): the two comment lines above contradict each other and the code — the base month
# actually used is January 2008; "2018" is presumably a typo for 2008. Confirm the intent.
index_baseMonth = datetime(2008,1,1)
# NOTE(review): this module-level baseMonth_loc is not used below; calc_index_adj recomputes it.
baseMonth_loc = list(cpi_dfs['index'].index).index(index_baseMonth)

# aggregate
# Every aggregate below is a weighted average: sum(index * weight) / sum(weight) per Date.
df_cpi_all['index:weight'] = df_cpi_all['index']*df_cpi_all['index_weight']
cpi_index_df = pd.DataFrame(index=cpi_dfs['index'].index)

# Overall/Core/BoJ core/CPI ex imputed rents and FF
# Shorter list than in the YoY cell: Ser/Foods/ImpRents/Energy/FreshFoods are not built here.
cal_cols = [
    'Overall',
    'Core',
    'BoJCore',
    'WesternCore',
    'CPIExImpRents',
    'CPIExImpRents&FF',
    'GoodsExFFE',
    'GenSerExRents'
]

for col in cal_cols:
    temp_df = df_cpi_all.copy()[df_cpi_all[col] == 1]
    temp_agg_index = temp_df.groupby(['Date'])[['index_weight','index:weight']].sum()
    temp_agg_index[col] = temp_agg_index['index:weight']/temp_agg_index['index_weight']
    cpi_index_df[col] = temp_agg_index[col]


# aggregate by category
cat_cols = ['Type','Atlanta']
for cat_col in cat_cols:
    temp_df = df_cpi_all.groupby(['Date',cat_col])[['index_weight','index:weight']].sum()
    temp_df['index'] = temp_df['index:weight']/temp_df['index_weight']
    temp_df = pd.pivot_table(temp_df.reset_index(), values='index', index='Date', columns=cat_col)
    cpi_index_df = pd.concat([cpi_index_df, temp_df], axis=1)

# calculate (Western) core Flexible and Sticky
temp_df = df_cpi_all[df_cpi_all['WesternCore']==1]
temp_df = temp_df.groupby(['Date','Atlanta'])[['index_weight','index:weight']].sum()
temp_df['index'] = temp_df['index:weight']/temp_df['index_weight']
temp_df = pd.pivot_table(temp_df.reset_index(), values='index', index='Date', columns='Atlanta')
temp_df.columns = [f'WesternCore {x}' for x in temp_df.columns]
cpi_index_df = pd.concat([cpi_index_df, temp_df], axis=1)

# calculate BoJ core Flexible and Sticky
temp_df = df_cpi_all[df_cpi_all['BoJCore']==1]
temp_df = temp_df.groupby(['Date','Atlanta'])[['index_weight','index:weight']].sum()
temp_df['index'] = temp_df['index:weight']/temp_df['index_weight']
temp_df = pd.pivot_table(temp_df.reset_index(), values='index', index='Date', columns='Atlanta')
temp_df.columns = [f'BoJCore {x}' for x in temp_df.columns]
cpi_index_df = pd.concat([cpi_index_df, temp_df], axis=1)

# calculate flexible and sticky inflation by type
# Columns are flattened to '<Atlanta>:<Type>'.
temp_df = df_cpi_all.groupby(['Date','Atlanta','Type'])[['index_weight','index:weight']].sum()
temp_df['index'] = temp_df['index:weight']/temp_df['index_weight']
temp_df = pd.pivot_table(temp_df.reset_index(), values='index', index='Date', columns=['Atlanta','Type'])
temp_df.columns = [f'{x[0]}:{x[1]}' for x in temp_df.columns]
cpi_index_df = pd.concat([cpi_index_df, temp_df], axis=1)

def calc_index_adj(index_array, yoy_adj_array, index_baseMonth=index_baseMonth):
    """Rebuild an index level series by chaining adjusted YoY rates around a base year.

    The twelve months starting at ``index_baseMonth`` are copied from ``index_array``.
    Later months are chained forward and earlier months backward, twelve months at a
    time, using ``yoy_adj_array`` (in percent). The output length and the position of
    the base month are taken from the module-level ``cpi_index_df``.

    NOTE(review): the forward step reads ``yoy_adj_array[p - 12]`` and the backward step
    reads ``yoy_adj_array[p]``. That is a consistent YoY chain only if ``yoy_adj_array``
    starts 12 rows later than ``index_array`` — confirm the alignment of the caller's inputs.
    """
    n_periods = len(cpi_index_df)
    base_pos = list(cpi_index_df.index).index(index_baseMonth)
    rebuilt = np.zeros(n_periods)

    # Seed: the base year is taken as-is from the unadjusted index.
    for offset in range(12):
        rebuilt[base_pos + offset] = index_array[base_pos + offset]

    # Forward chain: each month grows from the same month one year earlier.
    pos = base_pos + 12
    while pos < n_periods:
        growth = 1 + yoy_adj_array[pos - 12] / 100
        rebuilt[pos] = rebuilt[pos - 12] * growth
        pos += 1

    # Backward chain: walk from the month before the base month down to the first row.
    for pos in range(base_pos - 1, -1, -1):
        growth = 1 + yoy_adj_array[pos] / 100
        rebuilt[pos] = rebuilt[pos + 12] / growth

    return rebuilt

# Add a special-factor-adjusted '<col>_adj' level series for every aggregate.
# NOTE(review): the two arrays are passed positionally; see calc_index_adj for the row
# alignment it expects between cpi_index_df and cpi_yoy_df.
for col in cpi_index_df.columns:
    cpi_index_df[f'{col}_adj'] = calc_index_adj(cpi_index_df[col].values, cpi_yoy_df[col].values)


# Rebase every column so that its calendar-2020 average equals 100.
index_base = np.nanmean(cpi_index_df[(cpi_index_df.index >= datetime(2020,1,1))&(cpi_index_df.index <= datetime(2020,12,31))], axis=0)
cpi_index_df = cpi_index_df/index_base*100
# Keep only the adjusted series and drop the '_adj' suffix from their names.
cpi_index_df = cpi_index_df[[x for x in cpi_index_df.columns if x.endswith('_adj')]]
cpi_index_df.columns = [x.replace('_adj', '') for x in cpi_index_df.columns]
# Output file name means "aggregate indices, NSA".
cpi_index_df.to_csv('./集計指数NSA.csv')

# # seasonal adjustment will be done on Eviews (the following code not working)
# x12path = r"C:\Users\x01494487\Desktop\Python\cpi_dl\x13as_ascii-v1-1-b61\x13as"
# cpi_index_sa_df = cpi_index_df.copy()
# for col in tqdm(cpi_index_sa_df.columns):
#     cpi_index_sa_df = sm.tsa.x13_arima_analysis(cpi_index_df[col], x12path=x12path, prefer_x13=True).seasadj

# load sa series
# cpi_index_sa_df = pd.read_excel('./集計指数SA.xlsx', sheet_name='data')

# Monthly Download

In [None]:
# Download (Bloomberg)
# tickerList: the 'cpi_item' column holds item names; every column after the first holds
# the tickers of one data set (the data-set name is the column name).
tickerList = pd.read_excel('./tickerList.xlsx', sheet_name='tickerList')
dfs = {}
dlItems = tickerList.columns[1:]
dt = datetime.today()
start_date = "2020-01-01"
# Request data up to the last calendar day of the current month, formatted YYYY-MM-DD.
end_date = dt.replace(day = calendar.monthrange(dt.year, dt.month)[1])
end_date = f'{end_date.year}-{str(end_date.month).zfill(2)}-{end_date.day}'
for dlItem in tqdm(dlItems):
    temp_df = blp.bdh(
        tickers=tickerList[dlItem],
        flds = ["PX_LAST"],
        start_date = start_date,
        end_date = end_date,
        **kwargs
        )

    # Septic tank cleaning fee ('浄化槽清掃代'): it could simply be dropped, but a zero-filled
    # placeholder column is added for data sets whose name contains 't_' so that every
    # data set has the same columns.
    # NOTE(review): the renaming is positional — it assumes the download returns one column
    # per ticker in tickerList order; confirm.
    if 't_' in dlItem:
        temp_df.columns = [x for x in tickerList['cpi_item'] if x!= '浄化槽清掃代']
        temp_df['浄化槽清掃代'] = 0
    else:
        temp_df.columns = tickerList['cpi_item']
    temp_df = temp_df[tickerList['cpi_item']] # re-order the columns to the tickerList order
    dfs[dlItem] = temp_df

# NOTE(review): df_n_3dgit is not defined anywhere in the visible part of this file; these
# two lines raise NameError unless it is created elsewhere.
dfs['n_index_3digit'] = df_n_3dgit
dfs['n_yoy_3digit'] = df_n_3dgit.pct_change(periods=12, fill_method=None)*100
# 12-period percentage change of the downloaded nationwide index, in percent.
dfs['n_yoy_manual'] = dfs['n_index'].pct_change(periods=12, fill_method=None)*100

In [None]:
# Download (Bloomberg: aggregates)
# Same procedure as the item-level download, with the aggregate ticker list; results are
# added to the existing dfs dict.
tickerList = pd.read_excel('./tickerList_agg.xlsx', sheet_name='tickerList')
dlItems = tickerList.columns[1:]
dt = datetime.today()
start_date = "2020-01-01"
# Request data up to the last calendar day of the current month, formatted YYYY-MM-DD.
end_date = dt.replace(day = calendar.monthrange(dt.year, dt.month)[1])
end_date = f'{end_date.year}-{str(end_date.month).zfill(2)}-{end_date.day}'
for dlItem in tqdm(dlItems):
    temp_df = blp.bdh(
        tickers=tickerList[dlItem].dropna(),
        flds = ["PX_LAST"],
        start_date = start_date,
        end_date = end_date,
        **kwargs
        )
    dfs[dlItem] = temp_df
    # The rename happens after the dict assignment but on the same object, so dfs sees it.
    # Names are taken from the LAST k entries of 'cpi_item' (k = number of non-missing tickers),
    # which assumes the missing tickers sit at the top of the column — TODO confirm.
    temp_df.columns = tickerList['cpi_item'][-len(tickerList[dlItem].dropna()):]

In [None]:
# save
# One sheet per downloaded data set; the sheet name is the data-set key in dfs.
with pd.ExcelWriter('./cpi_2020.xlsx') as writer:
    for df_name, df in dfs.items():
        df.to_excel(writer, sheet_name=df_name)

# High Labor Cost

In [None]:
# Nationwide
dfs = pd.read_excel('./cpi_2020.xlsx', engine="openpyxl", sheet_name=None, index_col=0)
weight_dict = pd.read_excel('./weights_flags.xlsx', engine="openpyxl", sheet_name=None)
# Output column label -> flag column in the 'Flags' sheet selecting the member items.
# (Keys are Japanese labels: all / eating out / housework / medical & welfare / education /
# communication & recreation / low labour-cost share.)
calc_dict ={
    '全体':'HLS',
    '外食':'HLS/EatOut',
    '家事':'HLS/House',
    '医療・福祉':'HLS/Med',
    '教育':'HLS/Edu',
    '通信・教養娯楽':'HLS/TelRec',
    '低人件費率':'LLS'
}

df_hls_n = pd.DataFrame()
for key, val in calc_dict.items():
    # Zero out the weights of items outside the group (missing flags count as 0).
    # Assumes the 'N_2020' sheet has one row per calendar month and one column per item, in
    # the same item order as the 'Flags' rows and the 'n_index' columns — TODO confirm.
    temp_weight = (weight_dict['N_2020'].values * weight_dict['Flags'][val].fillna(0).values)

    hl_weight = np.zeros(dfs['n_index'].shape)
    for ix in range(hl_weight.shape[0]):
        # Use the weight row of the observation's calendar month, normalised to sum to 1.
        hl_weight[ix] = temp_weight[dfs['n_index'].index[ix].month - 1]
        hl_weight[ix] = hl_weight[ix]/hl_weight[ix].sum()

    # NOTE(review): missing index values are replaced by 0 while the weights are not
    # re-normalised, so a missing member item lowers the aggregate.
    df_hls_n[key] = (np.nan_to_num(dfs['n_index'].values) * hl_weight).sum(axis=1)

# Tokyo
# Identical calculation with the Tokyo weights ('T_2020') and the Tokyo index ('t_index').
df_hls_t = pd.DataFrame()
for key, val in calc_dict.items():
    temp_weight = (weight_dict['T_2020'].values * weight_dict['Flags'][val].fillna(0).values)

    hl_weight = np.zeros(dfs['t_index'].shape)
    for ix in range(hl_weight.shape[0]):
        hl_weight[ix] = temp_weight[dfs['t_index'].index[ix].month - 1]
        hl_weight[ix] = hl_weight[ix]/hl_weight[ix].sum()

    df_hls_t[key] = (np.nan_to_num(dfs['t_index'].values) * hl_weight).sum(axis=1)


# save
# Written without the date index (index=False); rows follow the order of the source index sheets.
save_dir = r'\\intranet.barcapint.com\dfs-apac\Group\TKY\ops\economics\Hashimoto\06_Commentary\CPI\hls'
df_hls_n.to_csv(f'{save_dir}/hls_n.csv', index=False,encoding='shift-jis')
df_hls_t.to_csv(f'{save_dir}/hls_t.csv', index=False,encoding='shift-jis')

In [None]:
# hls flag
hls_flags = weight_dict['Flags']

# calculate kernel density (nationwide)
# Reshape the item-level YoY sheet to long format (Date, Item, YoY) and attach the flags.
df_wide = dfs['n_yoy'].reset_index()
df_long = df_wide.melt(id_vars=['index']).dropna()
df_long.columns = ['Date','Item','YoY']
df_long = df_long.merge(hls_flags, how='left', on='Item')
df_long['Year'] = [x.year for x in df_long['Date']]
df_long['YearQ'] = [f'{x.year}Q{(x.month-1)//3+1}' for x in df_long['Date']]
# Keep only items explicitly flagged 0 or 1 in 'HLS'.
df_long = df_long[df_long['HLS'].isin([0,1])]

# Yearly
# One density curve per (HLS flag, year) combination.
p = itertools.product(set(df_long['HLS']), sorted(set(df_long['Year'])))

df_results = pd.DataFrame()
# Evaluation grid for the densities: YoY from -4 upward in steps of 0.2.
# (np.arange with a float step: whether the end point x_max+0.2 is included depends on rounding.)
x_min = -4
x_max = +10
x = np.arange(x_min, x_max+0.2, 0.2)
df_results['x'] = x
x = x.reshape(x.shape[0], 1)

for v in p:
    df_temp = df_long[(df_long['HLS']==v[0])&(df_long['Year']==v[1])]
    yoy_array = np.array(df_temp['YoY'])
    # Gaussian kernel, fixed bandwidth of 0.5; observations are unweighted.
    # NOTE(review): a combination with no observations would give fit() an empty array — confirm it cannot occur.
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(yoy_array.reshape(yoy_array.shape[0], 1))
    log_density = kde.score_samples(x)
    # The column label is the (flag, year) tuple itself.
    df_results[v] = np.exp(log_density)

df_results.to_csv(f'{save_dir}/hls_n_kd_y.csv', index=False)

# Quarterly
# Same as above with one curve per (HLS flag, 'YYYYQn') combination.
p = itertools.product(set(df_long['HLS']), sorted(set(df_long['YearQ'])))

df_results = pd.DataFrame()
x_min = -4
x_max = +10
x = np.arange(x_min, x_max+0.2, 0.2)
df_results['x'] = x
x = x.reshape(x.shape[0], 1)

for v in p:
    df_temp = df_long[(df_long['HLS']==v[0])&(df_long['YearQ']==v[1])]
    yoy_array = np.array(df_temp['YoY'])
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(yoy_array.reshape(yoy_array.shape[0], 1))
    log_density = kde.score_samples(x)
    df_results[v] = np.exp(log_density)

df_results.to_csv(f'{save_dir}/hls_n_kd_q.csv', index=False)

In [None]:
# calculate kernel density (Tokyo)
# Same procedure as the nationwide cell, applied to the 't_yoy' sheet; it reuses hls_flags
# and save_dir defined in earlier cells.
df_wide = dfs['t_yoy'].reset_index()
df_long = df_wide.melt(id_vars=['index']).dropna()
df_long.columns = ['Date','Item','YoY']
df_long = df_long.merge(hls_flags, how='left', on='Item')
df_long['Year'] = [x.year for x in df_long['Date']]
df_long['YearQ'] = [f'{x.year}Q{(x.month-1)//3+1}' for x in df_long['Date']]
# Keep only items explicitly flagged 0 or 1 in 'HLS'.
df_long = df_long[df_long['HLS'].isin([0,1])]

# Yearly
# One density curve per (HLS flag, year) combination.
p = itertools.product(set(df_long['HLS']), sorted(set(df_long['Year'])))

df_results = pd.DataFrame()
# Evaluation grid for the densities: YoY from -4 upward in steps of 0.2.
x_min = -4
x_max = +10
x = np.arange(x_min, x_max+0.2, 0.2)
df_results['x'] = x
x = x.reshape(x.shape[0], 1)

for v in p:
    df_temp = df_long[(df_long['HLS']==v[0])&(df_long['Year']==v[1])]
    yoy_array = np.array(df_temp['YoY'])
    # Gaussian kernel, fixed bandwidth of 0.5; observations are unweighted.
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(yoy_array.reshape(yoy_array.shape[0], 1))
    log_density = kde.score_samples(x)
    # The column label is the (flag, year) tuple itself.
    df_results[v] = np.exp(log_density)

df_results.to_csv(f'{save_dir}/hls_t_kd_y.csv', index=False)

# Quarterly
# One density curve per (HLS flag, 'YYYYQn') combination.
p = itertools.product(set(df_long['HLS']), sorted(set(df_long['YearQ'])))

df_results = pd.DataFrame()
x_min = -4
x_max = +10
x = np.arange(x_min, x_max+0.2, 0.2)
df_results['x'] = x
x = x.reshape(x.shape[0], 1)

for v in p:
    df_temp = df_long[(df_long['HLS']==v[0])&(df_long['YearQ']==v[1])]
    yoy_array = np.array(df_temp['YoY'])
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(yoy_array.reshape(yoy_array.shape[0], 1))
    log_density = kde.score_samples(x)
    df_results[v] = np.exp(log_density)

df_results.to_csv(f'{save_dir}/hls_t_kd_q.csv', index=False)

# Underlying inflation

In [None]:
# load data
dfs = pd.read_excel('./cpi_2020.xlsx', engine="openpyxl", sheet_name=None, index_col=0)
weight_dict = pd.read_excel('./weights_flags.xlsx', engine="openpyxl", sheet_name=None)
Items = weight_dict['Flags']
# Names of the items flagged as Core.
CoreItems = Items[Items['Core']==1]['Item']

# Two YoY sources from January 2022 onward: the manually computed one ('n_yoy_manual')
# and the downloaded one ('n_yoy'), each also restricted to the Core items.
yoy = dfs['n_yoy_manual']
yoy = yoy[yoy.index >= '2022-01-01']
yoy_core = yoy[list(CoreItems)]
yoy_official = dfs['n_yoy']
yoy_official = yoy_official[yoy_official.index >= '2022-01-01']
yoy_official_core = yoy_official[list(CoreItems)]

# Weight arrays with the same shapes as the YoY arrays; the copied values are only
# placeholders that the loop overwrites row by row.
weight_array = yoy.copy().values
weight_core_array = yoy_official_core.copy().values
for row in range(len(weight_array)):
    # row % 12 picks the weight row by position, which assumes the first row is a January
    # and that no month is missing afterwards — TODO confirm.
    # NOTE(review): the row count comes from yoy while weight_core_array is shaped like
    # yoy_official_core; this assumes both have the same number of rows.
    weight_array[row,:] = weight_dict['N_2020'].values[row % 12]
    temp_weight_df = weight_dict['N_2020'][list(CoreItems)]
    weight_core_array[row,:] = temp_weight_df.values[row % 12]

yoy_array = yoy.values
yoy_core_array = yoy_core.values
yoy_official_array = yoy_official.values
yoy_official_core_array = yoy_official_core.values

In [None]:
def weighted_quantile(values, quantiles, sample_weight=None,
                      values_sorted=False, old_style=False):
    """ Very close to numpy.percentile, but supports weights.

    The quantiles are read off the cumulative-weight curve by linear
    interpolation (numpy.interp), so requests below the first point of the
    curve return the smallest value.

    NOTE: quantiles should be in [0, 1]!
    :param values: numpy.array with data
    :param quantiles: array-like with many quantiles needed
    :param sample_weight: array-like of the same length as `values`;
        defaults to equal weights. Integer weights are accepted.
    :param values_sorted: bool, if True, then will avoid sorting of
        initial array
    :param old_style: if True, will correct output to be consistent
        with numpy.percentile.
    :return: numpy.array with computed quantiles.
    :raises AssertionError: if any quantile lies outside [0, 1].
    """
    values = np.array(values)
    quantiles = np.array(quantiles)
    if sample_weight is None:
        sample_weight = np.ones(len(values))
    # Coerce to float: with integer weights np.cumsum returns an integer array and the
    # in-place divisions below would raise a casting error.
    sample_weight = np.array(sample_weight, dtype=float)
    assert np.all(quantiles >= 0) and np.all(quantiles <= 1), \
        'quantiles should be in [0, 1]'

    if not values_sorted:
        sorter = np.argsort(values)
        values = values[sorter]
        sample_weight = sample_weight[sorter]

    # Each value sits at the END of its weight span (no half-weight centring):
    # weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
    weighted_quantiles = np.cumsum(sample_weight)
    if old_style:
        # To be convenient with numpy.percentile
        weighted_quantiles -= weighted_quantiles[0]
        weighted_quantiles /= weighted_quantiles[-1]
    else:
        weighted_quantiles /= np.sum(sample_weight)
    return np.interp(quantiles, weighted_quantiles, values)

In [None]:
# Underlying-inflation measures, one value per month:
#   w_median - midpoint of the 47.5% and 52.5% weighted quantiles of item-level YoY
#   t_mean   - weighted mean of the items between the 10% and 90% weighted quantiles
#   mode     - argmax of a normal-inverse-Gaussian density fitted to the official YoY
w_median = []
t_mean = []
mode = []
for i in range(yoy_official_core_array.shape[0]):
    # Select items with ONE joint mask (valid YoY and non-zero weight) so values and
    # weights stay paired item by item. Filtering the two arrays independently only
    # lines up when the NaN pattern happens to coincide with the zero-weight pattern.
    # Alternative YoY sources used during development: yoy_array, yoy_official_core_array.
    row_yoy = yoy_core_array[i,:]
    row_weight = weight_core_array[i,:]
    usable = ~np.isnan(row_yoy) & (row_weight != 0)
    temp_yoy = row_yoy[usable]
    temp_weight = row_weight[usable]

    wqtile = weighted_quantile(temp_yoy, quantiles=[0.1, 0.475, 0.5, 0.525, 0.9], sample_weight=temp_weight)
    # Chosen median definition: midpoint of the 47.5% and 52.5% quantiles. The other
    # candidates (plain 50% quantile, unweighted / weighted mean of the items inside the
    # band) were computed but never used, and the weighted variant raised whenever no
    # item fell inside the band, so they are no longer evaluated.
    w_median.append((wqtile[1]+wqtile[3])/2)

    # 10%-90% trimmed mean, bounds inclusive.
    trimmed_array = (temp_yoy >= wqtile[0]) & (temp_yoy <= wqtile[4])
    temp_t_mean = np.average(temp_yoy[trimmed_array], weights=temp_weight[trimmed_array])
    t_mean.append(temp_t_mean)

    # Mode: fit a normal-inverse-Gaussian distribution to the (unweighted) official Core YoY.
    temp_yoy_official = yoy_official_core_array[i,:][~np.isnan(yoy_official_core_array[i,:])]
    a1, b1, loc1, scale1 = norminvgauss.fit(temp_yoy_official)
    if i > 0:
        # Second pass warm-started from the first pass on the same data. (The first row used
        # to repeat the identical cold-start fit, which could not change the result.)
        # NOTE(review): if the intent was to warm-start from the PREVIOUS month's fit,
        # the cold-start fit above has to go for i > 0 — confirm.
        a1, b1, loc1, scale1 = norminvgauss.fit(temp_yoy_official, loc=loc1, scale=scale1)
    # Locate the density maximum on a grid from -20 to about 51 in steps of 0.1.
    x = np.arange(-20, 50+1, 0.1)
    pdf_array = norminvgauss.pdf(x, a1, b1, loc1, scale1)
    mode.append(x[pdf_array.argmax()])


In [None]:
# Collect the three underlying-inflation measures, one row per month of the loop above.
result_df = pd.DataFrame({
    'w_median':w_median,
    # 'w_median_bbg':w_median_bbg,
    't_mean':t_mean,
    # 't_mean_bbg':t_mean_bbg
    'mode':mode
})

# save
# Written without a date column (index=False); rows are in loop order.
save_dir = r'\\intranet.barcapint.com\dfs-apac\Group\TKY\ops\economics\Hashimoto\06_Commentary\CPI\underlying'
result_df.to_csv(f'{save_dir}/results.csv', index=False)

# Decomposition

In [None]:
# load data
dfs = pd.read_excel('./cpi_2020.xlsx', engine="openpyxl", sheet_name=None, index_col=0)
# Contribution sheets to export (names starting with n_ and t_, YoY and MoM, item-level and aggregate).
sheets = ['n_yoycont','n_momcont','n_yoy_cont_agg','n_mom_cont_agg','t_yoycont','t_momcont','t_yoy_cont_agg','t_mom_cont_agg']

# choose last 15 months, transpose, then save as csv
save_dir = r'\\intranet.barcapint.com\dfs-apac\Group\TKY\ops\economics\Hashimoto\06_Commentary\CPI\contribution'

for sheet in tqdm(sheets):
    # tail(15) takes the last 15 rows, assumed to be the latest 15 months (index ascending).
    df_temp = dfs[sheet].tail(15)
    # Newest month first, then transposed so that the index values become the CSV columns.
    df_temp.sort_index(ascending=False).T.to_csv(f'{save_dir}/{sheet}.csv', encoding='shift-jis')