# Setup: Parameters, Imports

In [2]:
# Parameters
import os  # stdlib; needed here because the data folder below can be overridden via the environment

## Years
start_year = 2019
end_year = 2022
base_year = start_year  # year whose survey is used as the base period of the price index
years = range(start_year, end_year + 1)  # inclusive of end_year

## Grouping
# Ages used to locate the "young" and "old" boundary rows in the age-groups table
young_age_cutoff = 25
old_age_threshold = 65

## Indexing
price_variable = 'mehir' # 'mehir' or 'omdan'

## Output
top_n = 5
comparison_year = end_year
comparison_level = 'primary'

## Folder Names
# Root folder of the raw survey files. Set the CEX_DATA_FOLDER environment variable to point at
# another location; the historical local path stays the default. os.path.join(..., "") guarantees
# a trailing separator, which the loading code relies on when it concatenates sub-folder names.
cex_data_folder = os.path.join(
    os.environ.get("CEX_DATA_FOLDER", "/Users/roykisluk/Downloads/Consumer_Expenditure_Survey/"),
    "",
)
folder_names_pathname = 'Data_clean/CEX_folder_names.csv'
age_groups_pathname = 'Data_clean/age_groups.csv'
prodcode_dict_pathname = 'Data_clean/prodcode_dictionary_c3-c399.csv'
    
## Libraries
import pandas as pd
import pyreadstat  
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Import Data

In [28]:
# Load folder names (looked up below by 'Year' to get 'Folder_Name')
folder_names_df = pd.read_csv(folder_names_pathname)

# Load age groups
age_groups_df = pd.read_csv(age_groups_pathname)
# Pick the age group whose [min_age, max_age] range contains each cutoff age.
# NOTE(review): the group id is taken as "row position + 1", which assumes the CSV rows are
# ordered by group id starting at 1 under a default index - confirm against the file.
young_age_group_id = age_groups_df[(age_groups_df['min_age'] <= young_age_cutoff) & (age_groups_df['max_age'] >= young_age_cutoff)].index[0] + 1
old_age_group_id = age_groups_df[(age_groups_df['min_age'] <= old_age_threshold) & (age_groups_df['max_age'] >= old_age_threshold)].index[0] + 1

# Yearly tables, keyed by survey year
dfs_mb = {}      # household data   (<folder>datamb.sas7bdat)
dfs_prat = {}    # individual data  (<folder>dataprat.sas7bdat)
dfs_prod = {}    # expenses data    (<folder>dataprod.sas7bdat)
dfs_survey = {}  # survey data      (<folder>datayoman.sas7bdat)

# File-name suffix -> destination dict. 'yoman' is deliberately last so that, after the loop,
# `df` / `meta` hold the final year's survey table exactly as the previous per-table loops left them.
tables_by_suffix = (
    ('mb', dfs_mb),
    ('prat', dfs_prat),
    ('prod', dfs_prod),
    ('yoman', dfs_survey),
)
# Only these tables get their 'gil' column renamed to 'age_group'
age_group_tables = {'mb', 'prat'}

for year in years:
    folder_match = folder_names_df.loc[folder_names_df['Year'] == year, 'Folder_Name']
    if folder_match.empty:
        # Same exception type as the bare `.values[0]` lookup, but with an actionable message
        raise IndexError(f"No folder name listed for year {year} in {folder_names_pathname}")
    subfolder = folder_match.values[0]

    for suffix, store in tables_by_suffix:
        table_pathname = f"{cex_data_folder}{subfolder}/{subfolder}data{suffix}.sas7bdat"
        df, meta = pyreadstat.read_sas7bdat(table_pathname)
        df.columns = df.columns.str.lower()
        if suffix in age_group_tables and 'gil' in df.columns:
            df = df.rename(columns={'gil': 'age_group'})
        store[year] = df

# Grouping

## Data

In [30]:
# One frame per year, seeded with the distinct 'misparmb' values of that year's household table
Groups = {
    year: pd.DataFrame(dfs_mb[year]['misparmb'].unique(), columns=['misparmb'])
    for year in years
}

In [31]:
# Derive the grouping columns (nationality, observance, age, income, SES, family size)
# for every year's Groups frame.
for year in years:
    # Household-level and individual-level tables for this year
    dfs_mb_year = dfs_mb[year]
    dfs_prat_year = dfs_prat[year]

    # Code -> label lookups; codes not listed here (or missing) become 'Other' via fillna below
    nationality_map = {1: 'Jewish', 2: 'Arab'}
    observance_map = {1: 'Secular', 2: 'Conservative', 3: 'Religious', 4: 'Ultra-Orthodox', 5: 'Mixed'}

    # NOTE(review): assigning a Series into Groups[year] aligns on index labels, so these values
    # line up with the 'misparmb' rows only if dfs_mb has one row per misparmb under a default
    # 0..n-1 index - confirm.
    Groups[year]['Nationality'] = dfs_mb_year['nationality'].map(nationality_map).fillna('Other')
    Groups[year]['Observance'] = dfs_mb_year['ramatdatiyut'].map(observance_map).fillna('Other')

    # Age-group id -> label: ids up to the young group are 'Young', ids from the old group upward
    # are 'Old', everything in between is 'Middle' (the 'Young' test is evaluated first).
    age_group_map = {age_group_id: 'Young' if age_group_id <= young_age_group_id else 'Old' if age_group_id >= old_age_group_id else 'Middle' for age_group_id in dfs_prat_year['age_group'].unique()}
    # NOTE(review): `.values` makes this a positional assignment. It assumes the rows with
    # y_kalkali == 1 appear exactly once per household and in the same order as Groups[year];
    # a different count raises a length-mismatch error, a different order mislabels silently - confirm.
    Groups[year]['Age_Group'] = dfs_prat_year.loc[dfs_prat_year['y_kalkali'] == 1, 'age_group'].map(age_group_map).values

    # Missing deciles are coded as 0
    Groups[year]['Income_Decile'] = dfs_mb_year['decile'].fillna(0).astype(int)

    # Deciles 1-2 -> quintile 1, 3-4 -> 2, ..., 9-10 -> 5 (right-closed bins); unlike
    # Income_Decile, a missing decile stays missing here rather than becoming 0
    Groups[year]['Income_Quintile'] = pd.cut(dfs_mb_year['decile'], bins=[0, 2, 4, 6, 8, 10], labels=[1, 2, 3, 4, 5])

    # 'cluster' values 1..5 are kept as-is; anything else (including missing) is coded as 0
    Groups[year]['SES_Quintile'] = dfs_mb_year['cluster'].apply(lambda x: x if x in range(1, 6) else np.nan).fillna(0).astype(int)
    # Clusters 1-2 -> tertile 1, 3 -> tertile 2, 4-5 -> tertile 3; anything else is coded as 0
    Groups[year]['SES_Tertile'] = dfs_mb_year['cluster'].apply(lambda x: 1 if x in [1, 2] else 2 if x == 3 else 3 if x in [4, 5] else np.nan).fillna(0).astype(int)

    # presumably 'nefashotad18' counts household members up to age 18 - TODO confirm; missing -> 0
    Groups[year]['Children'] = dfs_mb_year['nefashotad18'].fillna(0).astype(int)
    # 0 -> 'no children', 1..3 -> '1 to 3', every other value -> '4 plus'
    Groups[year]['Family_Size'] = Groups[year]['Children'].apply(lambda x: 'no children' if x == 0 else '1 to 3' if x in [1, 2, 3] else '4 plus')


## Groups dataframes headers

In [33]:
# Per year: a heading, the first rows of the Groups frame, and the household-table row count
for year in years:
    heading_html = HTML(f"<h2>Groups for Year {year}</h2>")
    preview_html = HTML(Groups[year].head().to_html(index=False))
    display(heading_html, preview_html)
    print(f"Number of observations: {len(dfs_mb[year])}")


misparmb,Nationality,Observance,Age_Group,Income_Decile,Income_Quintile,SES_Quintile,SES_Tertile,Children,Family_Size
1002.0,Jewish,Religious,Old,5,3,3,2,0,no children
1003.0,Jewish,Conservative,Young,9,5,3,2,0,no children
1004.0,Jewish,Conservative,Middle,7,4,3,2,0,no children
1005.0,Jewish,Religious,Old,10,5,3,2,0,no children
1006.0,Jewish,Religious,Old,6,3,3,2,0,no children


Number of observations: 7827


misparmb,Nationality,Observance,Age_Group,Income_Decile,Income_Quintile,SES_Quintile,SES_Tertile,Children,Family_Size
14449.0,Jewish,Secular,Middle,6,3,4,3,0,no children
14450.0,Jewish,Conservative,Old,4,2,3,2,0,no children
14451.0,Arab,Religious,Middle,1,1,2,1,2,1 to 3
14452.0,Arab,Religious,Old,3,2,2,1,0,no children
14453.0,Arab,Religious,Old,9,5,2,1,0,no children


Number of observations: 5593


misparmb,Nationality,Observance,Age_Group,Income_Decile,Income_Quintile,SES_Quintile,SES_Tertile,Children,Family_Size
28106.0,Jewish,Secular,Old,10,5,4,3,0,no children
28107.0,Jewish,Secular,Old,10,5,4,3,0,no children
28108.0,Jewish,Ultra-Orthodox,Middle,2,1,3,2,7,4 plus
28109.0,Jewish,Religious,Old,1,1,3,2,0,no children
28110.0,Jewish,Conservative,Old,4,2,3,2,0,no children


Number of observations: 6057


misparmb,Nationality,Observance,Age_Group,Income_Decile,Income_Quintile,SES_Quintile,SES_Tertile,Children,Family_Size
57089.0,Jewish,Conservative,Old,3,2,3,2,0,no children
57090.0,Jewish,Secular,Old,7,4,3,2,0,no children
57091.0,Jewish,Secular,Middle,2,1,3,2,3,1 to 3
57092.0,Jewish,Secular,Middle,4,2,3,2,1,1 to 3
57093.0,Arab,Secular,Middle,10,5,3,2,0,no children


Number of observations: 5478


## Plot groups distribution

In [None]:
# Every column of the final year's Groups frame except 'misparmb' gets its own panel
columns_to_plot = [name for name in Groups[end_year].columns if name != 'misparmb']

# Grid shape: fixed number of columns, as many rows as needed (ceiling division)
ncols = 3
nrows = -(-len(columns_to_plot) // ncols)

# One 5x5-inch panel per grid slot
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 5))

# Work with the panels as a flat sequence
axes = axes.flatten()

# Bar chart of value counts (ordered by value) for each grouping column
for panel_idx, column in enumerate(columns_to_plot):
    ax = axes[panel_idx]
    counts = Groups[end_year][column].value_counts().sort_index()
    counts.plot(kind='bar', ax=ax, color='skyblue')
    ax.set(
        title=f'Distribution of {column} in {end_year}',
        xlabel=column,
        ylabel='Count',
    )

# Drop the grid slots that did not receive a plot
for ax in axes[len(columns_to_plot):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

# Indexing

## Laspeyres Index

$$
I_{i}=\frac{\sum_{j\in L}{\frac{P_{ij}}{P_{oj}}(P_{oj}Q_{oj})}}{\sum_{j\in L}P_{oj}Q_{oj}}\times 100
$$

$$\text{For our purposes:}$$

$$
I_{ij}=\frac{P_{ij}}{P_{oj}}
$$
$$
W_{oj}=\frac{P_{oj}Q_{oj}}{\sum_{j\in L}P_{oj}Q_{oj}}
$$
$$
I_{i}=\sum_{j\in L}W_{oj}I_{ij}\times 100
$$



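$$
\text{Where:}\\
I_{i}\text{  - Index for the current period}\\
I_{ij}\text{  - Price relative of good or service } j \text{ in the current period}\\
W_{oj}\text{  - Share of good or service } j \text{ in base-period expenditure (its weight)}\\
Q_{oj}\text{  - Quantity of the good or service in the base period}\\
P_{oj}\text{  - Price of the good or service in the base period}\\
P_{ij}\text{  - Price of the good or service in the current period}\\
L\text{  - The set of all goods and services in the index basket}\\
$$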

In [34]:
# Year to inspect in the following cells; tied to the configured end_year instead of a
# hard-coded literal so the lookup cannot fail when the year range is changed.
year = end_year
Groups[year]

Unnamed: 0,misparmb,Nationality,Observance,Age_Group,Income_Decile,Income_Quintile,SES_Quintile,SES_Tertile,Children,Family_Size
0,57089.0,Jewish,Conservative,Old,3,2,3,2,0,no children
1,57090.0,Jewish,Secular,Old,7,4,3,2,0,no children
2,57091.0,Jewish,Secular,Middle,2,1,3,2,3,1 to 3
3,57092.0,Jewish,Secular,Middle,4,2,3,2,1,1 to 3
4,57093.0,Arab,Secular,Middle,10,5,3,2,0,no children
...,...,...,...,...,...,...,...,...,...,...
5473,69827.0,Jewish,Conservative,Young,7,4,4,3,1,1 to 3
5474,80031.0,Jewish,Conservative,Middle,9,5,4,3,1,1 to 3
5475,80032.0,Jewish,Conservative,Middle,5,3,2,1,3,1 to 3
5476,80033.0,Jewish,Secular,Middle,9,5,4,3,2,1 to 3


In [37]:
# Inspect the survey table (loaded from the datayoman file) for the selected year
dfs_survey[year]

Unnamed: 0,s_seker,misparmb,yom_kniya,hodesh_kniya,shnat_kniya,yom_bashavua,prodcode,kamut,ariza,chanut,mehir,mutzar_yoman,omdan
0,2022.0,57089.0,31.0,1.0,2022.0,2.0,304170.0,1.0,7.0,100.0,18.0,1.0,80.4820
1,2022.0,57089.0,31.0,1.0,2022.0,2.0,304139.0,2.0,3.0,100.0,4.0,1.0,17.8849
2,2022.0,57089.0,31.0,1.0,2022.0,2.0,381012.0,1.0,22.0,600.0,44.0,1.0,196.7340
3,2022.0,57089.0,31.0,1.0,2022.0,2.0,304014.0,1.0,19.0,100.0,6.0,1.0,26.8273
4,2022.0,57089.0,31.0,1.0,2022.0,2.0,304303.0,18.0,2.0,100.0,18.0,1.0,80.4820
...,...,...,...,...,...,...,...,...,...,...,...,...,...
278473,2022.0,69827.0,8.0,3.0,2023.0,4.0,384040.0,1.0,22.0,800.0,16.0,1.0,67.2921
278474,2022.0,80031.0,29.0,8.0,2022.0,2.0,309997.0,1.0,22.0,300.0,700.0,1.0,3012.6360
278475,2022.0,80032.0,4.0,7.0,2022.0,2.0,309997.0,1.0,2.0,300.0,500.0,1.0,2145.8550
278476,2022.0,80033.0,26.0,12.0,2022.0,2.0,309997.0,1.0,22.0,300.0,500.0,1.0,2127.9733


In [36]:
# Inspect the expenses table (loaded from the dataprod file) for the selected year
dfs_prod[year]

Unnamed: 0,s_seker,misparmb,prodcode,schum
0,2022.0,57089.0,1.0,5357.0
1,2022.0,57089.0,3.0,5160.0
2,2022.0,57089.0,13.0,723.0
3,2022.0,57089.0,14.0,4634.0
4,2022.0,57089.0,30.0,1259.0
...,...,...,...,...
621599,2022.0,83443.0,397067.0,2144.0
621600,2022.0,83443.0,411041.0,1483.0
621601,2022.0,83443.0,423053.0,61517.0
621602,2022.0,83443.0,423087.0,2924.0


In [None]:
# Total of the 'omdan' column in the last year's survey table.
# This used to read the leftover loop variable `df` from the loading cell (which is that same
# table) and add the rows one by one; the table is now named explicitly and summed in one
# vectorised call. Missing values are skipped by the sum.
total_consumption = float(dfs_survey[end_year]['omdan'].sum())

In [None]:


# Functions

def total_consumption_value(df):
    """Return the sum of the 'omdan' column of ``df`` as a float.

    The sum is vectorised and therefore independent of the frame's index
    (the earlier row-by-row label lookup failed on any index other than
    0..n-1). Missing values are skipped, matching the per-product sums
    used when computing weights. An empty frame yields 0.0.
    """
    return float(df['omdan'].sum())

def keep_shared_prodcodes(df1, df2):
    """Restrict both frames to the product codes that occur in each of them.

    Returns a pair ``(df1_shared, df2_shared)``: the rows of ``df1`` and of
    ``df2`` whose 'prodcode' appears in both inputs, each with a fresh
    0..n-1 index. The inputs themselves are left untouched.
    """
    common_codes = set(df1['prodcode']) & set(df2['prodcode'])

    def _restrict(frame):
        # Keep only rows with a shared code and renumber them
        keep_mask = frame['prodcode'].isin(common_codes)
        return frame[keep_mask].reset_index(drop=True)

    return _restrict(df1), _restrict(df2)

def weighting(df):
    """Compute each product's share of total 'omdan'.

    Parameters
    ----------
    df : DataFrame with 'prodcode' and 'omdan' columns.

    Returns
    -------
    DataFrame with one row per distinct 'prodcode' (in order of first
    appearance) and a float 'weight' column holding that product's summed
    'omdan' divided by the summed 'omdan' of the whole frame.

    Notes
    -----
    Missing 'omdan' values are skipped in both the per-product sums and the
    total, so the weights add up to 1 whenever the total is non-zero
    (previously a single missing value turned every weight into NaN).
    The sums are computed with one groupby instead of rescanning the frame
    once per product.
    """
    weights = pd.DataFrame(df['prodcode'].unique(), columns=['prodcode'])
    omdan_by_product = df.groupby('prodcode', sort=False)['omdan'].sum()
    omdan_total = df['omdan'].sum()
    # Codes without a group sum (e.g. a missing prodcode) get a zero numerator
    numerators = weights['prodcode'].map(omdan_by_product).fillna(0.0).astype(float)
    weights['weight'] = numerators / omdan_total
    return weights

def average_price(df):
    """Compute the mean unit price of every product.

    Parameters
    ----------
    df : DataFrame with 'prodcode', 'mehir' and 'kamut' columns.

    Returns
    -------
    DataFrame with one row per distinct 'prodcode' (in order of first
    appearance) and a float 'price' column: the mean over that product's
    rows of ``mehir / kamut``.

    Notes
    -----
    Rows with ``kamut == 0`` produce infinite (or NaN) ratios that flow into
    the mean unchanged; callers are expected to screen such prices.
    The means are computed with one groupby instead of filtering the frame
    twice per product.
    """
    average_prices = pd.DataFrame(df['prodcode'].unique(), columns=['prodcode'])
    unit_prices = df['mehir'] / df['kamut']
    mean_by_product = unit_prices.groupby(df['prodcode'], sort=False).mean()
    average_prices['price'] = average_prices['prodcode'].map(mean_by_product).astype(float)
    return average_prices

def Laspeyres(df_base, df_current, factor=1.0):
    """Compute per-product price indexes and the weighted total index.

    Parameters
    ----------
    df_base : DataFrame of base-period records ('prodcode', 'mehir',
        'kamut', 'omdan'); supplies the product list, the weights and the
        base prices.
    df_current : DataFrame of current-period records; supplies the current
        prices.
    factor : float, default 1.0
        Price relative assigned to a product whose base or current average
        price is missing, zero or infinite. The default gives such products
        an index of 100, i.e. it assumes no price change.
        NOTE(review): the previous code referenced an undefined name
        `factor` here and raised NameError; 1.0 is a neutral choice -
        confirm the intended imputation.

    Returns
    -------
    (index_df, total_index)
        ``index_df`` has one row per base-period product with the columns
        'prodcode', 'index', 'weight', 'price_base', 'price_current';
        'index' is ``price_current / price_base * 100`` (or ``factor * 100``).
        ``total_index`` is the sum over products of ``weight * index``.
    """
    def _usable(prices):
        # A price can be used only when it is present, finite and non-zero
        return prices.notna() & ~prices.isin([np.inf, -np.inf]) & (prices != 0)

    index_df = pd.DataFrame(df_base['prodcode'].unique(), columns=['prodcode'])
    index_df['index'] = 0.0  # placeholder so the column order stays prodcode, index, weight, prices
    index_df = index_df.merge(weighting(df_base), on='prodcode', how='left')
    index_df = index_df.merge(
        average_price(df_base).rename(columns={'price': 'price_base'}),
        on='prodcode', how='left',
    )
    index_df = index_df.merge(
        average_price(df_current).rename(columns={'price': 'price_current'}),
        on='prodcode', how='left',
    )

    price_base = index_df['price_base']
    price_current = index_df['price_current']
    has_both_prices = _usable(price_base) & _usable(price_current)

    # Price relative where both prices are usable, the fallback factor elsewhere
    price_relatives = (price_current / price_base).where(has_both_prices, factor)
    index_df['index'] = price_relatives * 100

    # skipna=False: a missing weight makes the total missing rather than silently dropping the item
    total_index = float((index_df['weight'] * index_df['index']).sum(skipna=False))
    return index_df, total_index

def _aggregate_by_code_prefix(df, value_col, prefix_len):
    """Weighted-average ``value_col`` over groups sharing the first ``prefix_len`` characters of 'prodcode'.

    Returns a new frame with the columns 'prodcode' (the prefix, as a
    string), 'price_index' (weighted mean of ``value_col`` using 'weight')
    and 'weight' (summed weight of the group), sorted by prefix.
    'price_index' is NaN when the group's weight sum is not positive or when
    any row of the group has a missing value or weight. ``df`` is not modified.
    """
    prefixes = df['prodcode'].astype(str).str[:prefix_len]
    row_weights = df['weight']
    weighted_values = df[value_col] * row_weights

    total_weight = row_weights.groupby(prefixes).sum()
    weighted_sum = weighted_values.groupby(prefixes).sum()
    # groupby sums skip NaN, so track groups containing one to keep the NaN visible
    has_missing = weighted_values.isna().groupby(prefixes).any()

    price_index = (weighted_sum / total_weight).where((total_weight > 0) & ~has_missing)
    return pd.DataFrame({
        'prodcode': total_weight.index.to_numpy(),
        'price_index': price_index.to_numpy(dtype=float),
        'weight': total_weight.to_numpy(dtype=float),
    })


def merge_to_secondary(df):
    """Aggregate product-level indexes to secondary categories (first 3 characters of 'prodcode').

    ``df`` needs the columns 'prodcode', 'index' and 'weight'. Returns a new
    frame with 'prodcode' (3-character prefix), 'price_index' (weight-averaged
    'index') and 'weight' (summed weight). The input frame is left unchanged.
    """
    return _aggregate_by_code_prefix(df, value_col='index', prefix_len=3)


def merge_to_primary(df):
    """Aggregate secondary-category indexes to primary categories (first 2 characters of 'prodcode').

    ``df`` needs the columns 'prodcode', 'price_index' and 'weight' (the
    output of ``merge_to_secondary``). Returns a new frame with 'prodcode'
    (2-character prefix), 'price_index' (weight-averaged 'price_index') and
    'weight' (summed weight). The input frame is left unchanged.
    """
    return _aggregate_by_code_prefix(df, value_col='price_index', prefix_len=2)



# Filter observations for relevant group
# `group_mmb` is optional: when set, it is indexed by year and each entry provides a 'misparmb'
# column listing the records to keep. It is not defined anywhere above, so fall back to
# "no group filter" instead of failing with NameError on a fresh kernel.
group_mmb = globals().get('group_mmb')
if group_mmb is not None:
    for year in years:
        in_group = dfs_survey[year]['misparmb'].isin(group_mmb[year]['misparmb'])
        dfs_survey[year] = dfs_survey[year][in_group]

# Filter observations with prodcode that starts with 3
# NOTE: both filters overwrite dfs_survey in place, so re-running this cell with a different
# group narrows the already-filtered data; reload the survey tables before switching groups.
for year in years:
    starts_with_3 = dfs_survey[year]['prodcode'].astype(str).str.startswith('3')
    dfs_survey[year] = dfs_survey[year][starts_with_3].reset_index(drop=True)

# Calculate weights and price indexes (each year against base_year, on the products both share)
yearly_price_index = {}
df_price_index = {}
for year in years:
    df_base, df_current = keep_shared_prodcodes(dfs_survey[base_year], dfs_survey[year])
    df_price_index[year], yearly_price_index[year] = Laspeyres(df_base, df_current)

# Combine all years into a single dataframe
combined_df = pd.concat(df_price_index.values(), keys=df_price_index.keys(), names=['Year', 'Index']).reset_index(level='Index', drop=True).reset_index()
combined_df = combined_df[['Year', 'prodcode', 'index', 'weight']]

# Merge to secondary and primary categories
df_secondary = {}
df_primary = {}
for year in years:
    df_secondary[year] = merge_to_secondary(df_price_index[year])
    df_primary[year] = merge_to_primary(df_secondary[year])

# Combine secondary and primary categories into a single dataframe
combined_secondary_df = pd.concat(df_secondary.values(), keys=df_secondary.keys(), names=['Year', 'Index']).reset_index(level='Index', drop=True).reset_index()
combined_primary_df = pd.concat(df_primary.values(), keys=df_primary.keys(), names=['Year', 'Index']).reset_index(level='Index', drop=True).reset_index()

# Keep only the necessary columns (this also guarantees no stale 'description' column is present)
combined_secondary_df = combined_secondary_df[['Year', 'prodcode', 'price_index', 'weight']]
combined_primary_df = combined_primary_df[['Year', 'prodcode', 'price_index', 'weight']]

# Load prodcode dictionary
prodcode_dict_df = pd.read_csv(prodcode_dict_pathname)

# Convert prodcode to string in all three dataframes so the merge keys share one dtype
prodcode_dict_df['prodcode'] = prodcode_dict_df['prodcode'].astype(str)
combined_secondary_df['prodcode'] = combined_secondary_df['prodcode'].astype(str)
combined_primary_df['prodcode'] = combined_primary_df['prodcode'].astype(str)

# Merge descriptions into combined_secondary_df
combined_secondary_df = combined_secondary_df.merge(prodcode_dict_df, on='prodcode', how='left')

# Merge descriptions into combined_primary_df
combined_primary_df = combined_primary_df.merge(prodcode_dict_df, on='prodcode', how='left')


