# Setup

In [None]:
# Parameters
import os  # imported here (ahead of the Libraries block) because the data folder is resolved below

## Years
start_year = 2019
end_year = 2022
base_year = start_year  # the base period is the first year of the range
years = range(start_year, end_year + 1)  # +1 so that end_year itself is included

## Grouping
# Ages used to pick the boundary age bands: the loading cell looks up the band of
# age_groups.csv whose [min_age, max_age] range contains each of these ages.
young_age_cutoff = 25
old_age_threshold = 65

## Indexing
price_variable = 'mehir' # 'mehir' or 'omdan'

## Output
top_n = 5
comparison_year = end_year
comparison_level = 'primary'

## Folder Names
# The raw survey files live outside the repository and their location differs per
# machine: set the CEX_DATA_FOLDER environment variable to override the default.
# os.path.join(..., "") guarantees the trailing separator that the path f-strings
# of the data-loading cell rely on (they concatenate folder and sub-folder directly).
cex_data_folder = os.path.join(
    os.environ.get("CEX_DATA_FOLDER", "/Users/roykisluk/Downloads/Consumer_Expenditure_Survey/"),
    "",
)
folder_names_pathname = 'Data_clean/CEX_folder_names.csv'
age_groups_pathname = 'Data_clean/age_groups.csv'
prodcode_dict_pathname = 'Data_clean/prodcode_dictionary_c3-c399.csv'
    
## Libraries
import pandas as pd
import pyreadstat  
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Data

In [87]:
# Load folder names
folder_names_df = pd.read_csv(folder_names_pathname)

# Load age groups
age_groups_df = pd.read_csv(age_groups_pathname)


def _age_group_id_containing(age):
    """Return (row label + 1) of the first age band whose [min_age, max_age] contains ``age``.

    Raises IndexError when no band of ``age_groups_df`` contains the age.
    """
    contains_age = (age_groups_df['min_age'] <= age) & (age_groups_df['max_age'] >= age)
    matching_rows = age_groups_df.index[contains_age]
    if len(matching_rows) == 0:
        raise IndexError(f"No age band in {age_groups_pathname} contains age {age}")
    # NOTE(review): assumes the survey's age-group codes equal the CSV row position + 1
    # (rows ordered by age band, codes starting at 1) - confirm against age_groups.csv.
    return matching_rows[0] + 1


young_age_group_id = _age_group_id_containing(young_age_cutoff)
old_age_group_id = _age_group_id_containing(old_age_threshold)


def _folder_for_year(year):
    """Return the survey sub-folder name listed for ``year`` in ``folder_names_df``.

    Raises IndexError when the year is not listed.
    """
    listed = folder_names_df.loc[folder_names_df['Year'] == year, 'Folder_Name']
    if listed.empty:
        raise IndexError(f"Year {year} is not listed in {folder_names_pathname}")
    return listed.values[0]


def _load_yearly_tables(table_suffix, rename_age_group=False, has_prodcode=False):
    """Read one SAS table per year in ``years`` and return them as ``{year: DataFrame}``.

    Parameters
    ----------
    table_suffix : str
        File-name suffix that follows the sub-folder name, e.g. ``'datamb'`` for
        ``<folder>/<subfolder>/<subfolder>datamb.sas7bdat``.
    rename_age_group : bool
        When True, a ``gil`` column (if present) is renamed to ``age_group``.
    has_prodcode : bool
        When True, ``prodcode`` is converted to an integer and then to a string.

    In every table the column names are lower-cased and ``misparmb`` is cast to int.
    """
    tables = {}
    for year in years:
        subfolder = _folder_for_year(year)
        pathname = f"{cex_data_folder}{subfolder}/{subfolder}{table_suffix}.sas7bdat"
        df, _meta = pyreadstat.read_sas7bdat(pathname)
        df.columns = df.columns.str.lower()
        if rename_age_group and 'gil' in df.columns:
            df = df.rename(columns={'gil': 'age_group'})
        df['misparmb'] = df['misparmb'].astype(int)
        if has_prodcode:
            # int -> str drops any float formatting so that codes compare as plain digit strings
            df['prodcode'] = df['prodcode'].astype(int).astype(str)
        tables[year] = df
    return tables


# Load household data for each year
dfs_mb = _load_yearly_tables('datamb', rename_age_group=True)

# Load individual data for each year
dfs_prat = _load_yearly_tables('dataprat', rename_age_group=True)

# Load expenses data for each year
dfs_prod = _load_yearly_tables('dataprod', has_prodcode=True)

# Load survey data for each year
dfs_survey = _load_yearly_tables('datayoman', has_prodcode=True)

# Grouping

## Data

In [None]:
# One single-column frame per year holding the distinct misparmb values of that year's dfs_mb table
Groups = {}
for year in years:
    household_ids = dfs_mb[year]['misparmb'].unique()
    Groups[year] = pd.DataFrame({'misparmb': household_ids})

In [None]:
# Code-to-label lookups do not depend on the year, so they are built once up front
nationality_map = {1: 'Jewish', 2: 'Arab'}
observance_map = {1: 'Secular', 2: 'Conservative', 3: 'Religious', 4: 'Ultra-Orthodox', 5: 'Mixed'}


def _age_label(age_group_id):
    """Label an age-group code relative to the young / old boundary codes."""
    if age_group_id <= young_age_group_id:
        return 'Young'
    if age_group_id >= old_age_group_id:
        return 'Old'
    return 'Middle'


for year in years:
    dfs_mb_year = dfs_mb[year]
    dfs_prat_year = dfs_prat[year]

    # NOTE(review): the columns taken from dfs_mb_year below are assigned by row index,
    # which lines up with Groups[year] only if dfs_mb_year has one row per misparmb - confirm.
    Groups[year]['Nationality'] = dfs_mb_year['nationality'].map(nationality_map).fillna('Other')
    Groups[year]['Observance'] = dfs_mb_year['ramatdatiyut'].map(observance_map).fillna('Other')

    # Age group comes from the individuals table, so it is joined on misparmb rather
    # than attached positionally: a positional assignment is silently wrong if the two
    # tables are ordered differently and fails outright if the row counts differ.
    age_group_map = {age_group_id: _age_label(age_group_id) for age_group_id in dfs_prat_year['age_group'].unique()}
    # NOTE(review): y_kalkali == 1 presumably flags the household head - confirm.
    heads = dfs_prat_year.loc[dfs_prat_year['y_kalkali'] == 1]
    head_age_label = pd.Series(
        heads['age_group'].map(age_group_map).to_numpy(),
        index=heads['misparmb'].to_numpy(),
    )
    # A lookup Series needs a unique index; keep the first flagged person per household
    head_age_label = head_age_label[~head_age_label.index.duplicated(keep='first')]
    # Households without a flagged person end up with a missing Age_Group
    Groups[year]['Age_Group'] = Groups[year]['misparmb'].map(head_age_label)

    # Missing deciles are coded 0
    Groups[year]['Income_Decile'] = dfs_mb_year['decile'].fillna(0).astype(int)

    # Deciles 1-2 -> quintile 1, 3-4 -> 2, ... 9-10 -> 5 (bins are right-inclusive)
    Groups[year]['Income_Quintile'] = pd.cut(dfs_mb_year['decile'], bins=[0, 2, 4, 6, 8, 10], labels=[1, 2, 3, 4, 5])

    # Cluster values outside 1-5 (or missing) are coded 0
    Groups[year]['SES_Quintile'] = dfs_mb_year['cluster'].apply(lambda x: x if x in range(1, 6) else np.nan).fillna(0).astype(int)
    Groups[year]['SES_Tertile'] = dfs_mb_year['cluster'].apply(lambda x: 1 if x in [1, 2] else 2 if x == 3 else 3 if x in [4, 5] else np.nan).fillna(0).astype(int)

    Groups[year]['Children'] = dfs_mb_year['nefashotad18'].fillna(0).astype(int)
    Groups[year]['Family_Size'] = Groups[year]['Children'].apply(lambda x: 'no children' if x == 0 else '1 to 3' if x in [1, 2, 3] else '4 plus')


In [None]:
# NOTE(review): disabled code kept as a bare string literal. Nothing inside it runs;
# evaluating the cell only yields the string itself. The snippet would build a flat
# lookup keyed by (year, group, subgroup) holding the misparmb values of each subgroup.
# Consider deleting this cell or restoring it as real code.
'''
groups_mmb = {}
for year in years:
    for group in Groups[year].columns[1:]:
        for subgroup in Groups[year][group].unique():
            groups_mmb[(year, group, subgroup)] = Groups[year][Groups[year][group] == subgroup]['misparmb'].values
'''

## Groups Dataframes Headers

In [None]:
# Per year: a heading, the first rows of the grouping table, and the row count of dfs_mb
for year in years:
    heading = HTML(f"<h2>Groups for Year {year}</h2>")
    preview = HTML(Groups[year].head().to_html(index=False))
    display(heading, preview)
    print(f"Number of observations: {len(dfs_mb[year])}")


## Plot Groups Distribution

In [None]:
# Every column except the household identifier gets its own panel
columns_to_plot = [name for name in Groups[end_year] if name != 'misparmb']

# Panels are laid out three per row; the row count is rounded up
ncols = 3
nrows = -(-len(columns_to_plot) // ncols)

# Build the grid and view the axes as a flat sequence
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows))
axes = axes.ravel()

# One bar chart of category counts per grouping column
for panel_index, column in enumerate(columns_to_plot):
    ax = axes[panel_index]
    category_counts = Groups[end_year][column].value_counts().sort_index()
    category_counts.plot(kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'Distribution of {column} in {end_year}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')

# Drop the panels that were left without a column
for unused_ax in axes[len(columns_to_plot):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Indexing

## Laspeyres Index

$$
I_{i}=\frac{\sum_{j\in L}{\frac{P_{ij}}{P_{oj}}(P_{oj}Q_{oj})}}{\sum_{j\in L}P_{oj}Q_{oj}}\times 100
$$

$$\text{For our purposes:}$$

$$
I_{ij}=\frac{P_{ij}}{P_{oj}}
$$
$$
W_{oj}=\frac{P_{oj}Q_{oj}}{\sum_{j\in L}P_{oj}Q_{oj}}
$$
$$
I_{i}=\sum_{j\in L}W_{oj}I_{ij}\times 100
$$



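$$
\begin{aligned}
&\text{Where:}\\
&I_{i}\text{  - Index for the current period}\\
&I_{ij}\text{  - Price relative of good or service } j \text{: its current-period price divided by its base-period price}\\
&W_{oj}\text{  - Share of good or service } j \text{ in total base-period expenditure}\\
&Q_{oj}\text{  - Quantity of the good or service in the base period}\\
&P_{oj}\text{  - Price of the good or service in the base period}\\
&P_{ij}\text{  - Price of the good or service in the current period}\\
&L\text{  - The set of all goods and services in the index basket}
\end{aligned}
$$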

### Calculate Weights

In [88]:
def calculate_weights(product_level, year, group_mmb):
    """Expenditure share of each product code within one group of households.

    Reads the global ``dfs_prod[year]`` table and keeps the rows whose ``prodcode``
    starts with '3' (consumption expenses), whose ``misparmb`` is in ``group_mmb``
    and whose ``prodcode`` has exactly ``product_level`` characters.

    Returns a DataFrame with columns ``prodcode``, ``schum`` (summed per code) and
    ``weight`` (``schum`` divided by the total ``schum`` of the kept rows).
    """
    all_expenses = dfs_prod[year]
    codes = all_expenses['prodcode']

    # The three row filters, each evaluated on the full table
    is_consumption = codes.astype(str).str.startswith('3')
    in_group = all_expenses['misparmb'].isin(group_mmb)
    at_level = codes.str.len() == product_level

    # Total spend per product code for the selected rows
    selected = all_expenses.loc[is_consumption & in_group & at_level]
    totals = selected.groupby('prodcode')['schum'].sum().reset_index()

    # Normalise so that the weights add up to one
    grand_total = totals['schum'].sum()
    totals['weight'] = totals['schum'] / grand_total

    return totals

In [89]:
product_level = 6  # number of characters a prodcode must have to be kept
year = base_year   # weights are computed for the base year only

# Subgroup labels are not unique across grouping columns: 'Other' is used by both
# Nationality and Observance, and the integer codes of Income_Decile, SES_Quintile,
# SES_Tertile and Children overlap. In the flat `weights` dict a later grouping
# therefore overwrites an earlier one that shares a label. `weights_by_group` is the
# collision-free version (weights_by_group[group][subgroup]); the flat dict is kept
# with its original contents for code that already reads weights[subgroup].
weights = {}
weights_by_group = {}
for group in Groups[year].columns[1:]:  # [1:] skips the misparmb column
    weights_by_group[group] = {}
    for subgroup in Groups[year][group].unique():
        mmb = Groups[year].loc[Groups[year][group] == subgroup, 'misparmb']
        subgroup_weights = calculate_weights(product_level, year, mmb)
        weights_by_group[group][subgroup] = subgroup_weights
        weights[subgroup] = subgroup_weights

### Prices

In [90]:
def calculate_prices(product_level, year, group_mmb, price_column=None):
    """Unit price of every survey record belonging to one group of households.

    Reads the global ``dfs_survey[year]`` table and keeps the rows whose ``prodcode``
    starts with '3' (consumption expenses), whose ``misparmb`` is in ``group_mmb``
    and whose ``prodcode`` has exactly ``product_level`` characters.

    Parameters
    ----------
    product_level : int
        Required length of ``prodcode``.
    year : int
        Key into ``dfs_survey``.
    group_mmb : list-like
        ``misparmb`` values of the households in the group.
    price_column : str, optional
        Column holding the amount to divide by the quantity. Defaults to the
        notebook-level ``price_variable`` setting ('mehir' or 'omdan'), so that
        changing the setting actually changes the prices used.

    Returns
    -------
    DataFrame with columns ``prodcode`` and ``price``. A missing amount counts as 0
    and a missing quantity as 1; a quantity of 0 yields a missing price instead of
    an infinite one.
    """
    if price_column is None:
        price_column = price_variable

    survey_df = dfs_survey[year]

    # Prices dataframe for consumption expenses only
    prices_df = survey_df[survey_df['prodcode'].astype(str).str.startswith('3')]

    # Filter only IDs that match the group
    prices_df = prices_df[prices_df['misparmb'].isin(group_mmb)].reset_index(drop=True)

    # Keep only the product codes at the correct product level
    prices_df = prices_df[prices_df['prodcode'].str.len() == product_level]

    # Dividing by a zero quantity would give +/-inf, which would poison any later
    # average of the prices, so zero quantities are turned into missing values.
    quantity = prices_df['kamut'].fillna(1)
    quantity = quantity.mask(quantity == 0)
    unit_price = prices_df[price_column].fillna(0) / quantity

    # Build the result as a new frame rather than writing into a filtered slice
    return prices_df[['prodcode']].assign(price=unit_price)

In [None]:
# Subgroup labels are not unique across grouping columns ('Other' is used by both
# Nationality and Observance, and the integer codes of several groupings overlap),
# so in the flat prices[year][subgroup] layout a later grouping overwrites an
# earlier one that shares a label. prices_by_group[year][group][subgroup] is the
# collision-free version; the flat layout is kept with its original contents for
# code that already reads prices[year][subgroup].
prices = {}
prices_by_group = {}
for year in years:
    prices[year] = {}
    prices_by_group[year] = {}
    for group in Groups[year].columns[1:]:  # [1:] skips the misparmb column
        prices_by_group[year][group] = {}
        for subgroup in Groups[year][group].unique():
            mmb = Groups[year].loc[Groups[year][group] == subgroup, 'misparmb']
            subgroup_prices = calculate_prices(product_level, year, mmb)
            prices_by_group[year][group][subgroup] = subgroup_prices
            prices[year][subgroup] = subgroup_prices

{'Jewish':        prodcode      price
 0        304113  10.000000
 1        304170   5.000000
 2        335075  10.000000
 3        317065  40.000000
 4        317131  60.000000
 ...         ...        ...
 323877   309021   4.933333
 323878   302323  11.900000
 323879   303057   4.975000
 323880   305011   4.500000
 323881   335208   0.100000
 
 [323882 rows x 2 columns],
 'Arab':       prodcode      price
 0       304014   5.000000
 1       300079   1.000000
 2       304154   0.075000
 3       300418  25.000000
 4       303099  66.666667
 ...        ...        ...
 51851   305078  20.000000
 51852   300319  20.000000
 51853   300319  10.000000
 51854   300335  20.000000
 51855   300442  20.000000
 
 [51856 rows x 2 columns],
 'Other':      prodcode      price
 0      300236  13.900000
 1      304154  14.900000
 2      312090   7.904762
 3      312025   7.000000
 4      393025  18.900000
 ...       ...        ...
 1748   300046  10.900000
 1749   335018  85.000000
 1750   308049  25.0