In [None]:
#### Set the working directory to the local folder that holds the data files.
#### The data must sit on the C: drive because the R: drive cannot hold files this large.
#### The CSV files in this notebook are read and written with bare file names, i.e. relative to this folder.

%cd "C:\GfK"

In [None]:
### Import NumPy and Pandas

import pandas as pd
import numpy as np

In [None]:
### Start from the initial file containing all point-of-sale FMCG purchases
### from the 2nd half of 2019.

### Read in the FMCG_2019 file
fmcg_2019_post = pd.read_csv('finaldata_FMCG_2019.csv')

### Work on a copy so the raw frame stays untouched.
### (`copy(:)` is a syntax error: DataFrame.copy() takes no slice argument.)
df19 = fmcg_2019_post.copy()

In [None]:
### 'wert' holds the total value of each point-of-sale purchase in cents;
### convert it to euros by dividing by 100.
### NOTE: this overwrites 'wert' in place, so run this cell only once per load.
df19['wert'] = df19['wert'].div(100)

### Unit price of each purchased item = total expenditure on the item divided
### by the number of units bought ('anzahl' is German for quantity).
df19['item_price'] = df19['wert'].div(df19['anzahl'])

In [None]:
### The 2020 H2 price basket needs, for every household (hhkey) and every
### fast-moving good (artikelschluessel), the average unit price and the
### average quantity purchased in 2019 H2.

### Group once per (household, good) pair and reuse the grouping for both means.
by_household_and_good_2019 = df19.groupby(['hhkey', 'artikelschluessel'])

fmcg_hh_avg_2019_price = by_household_and_good_2019['item_price'].mean()
fmcg_hh_avg_2019_quantity = by_household_and_good_2019['anzahl'].mean()

In [None]:
### For the 2019 dataset we want exactly one record per (household, good) pair,
### carrying the household's average price and average quantity for that good.

### The two averages are Series indexed by (hhkey, artikelschluessel); they are
### not columns of the raw purchase frame. Put them side by side, named after
### the variables they come from, and turn the index levels back into ordinary
### columns. The result has one row per pair, ordered by household and good.
df19 = (
    pd.concat(
        [
            fmcg_hh_avg_2019_price.rename('fmcg_hh_avg_2019_price'),
            fmcg_hh_avg_2019_quantity.rename('fmcg_hh_avg_2019_quantity'),
        ],
        axis=1,
    )
    .reset_index()
)

### Keep only the household id, the good id and the average price and quantity
### purchased for that specific good (column selection needs [[...]], not (...)).
df19 = df19[['hhkey', 'artikelschluessel', 'fmcg_hh_avg_2019_price', 'fmcg_hh_avg_2019_quantity']]

In [None]:
### Now turn to the FMCG purchases from the 2nd half of 2020.

### Read in the FMCG_2020 file
fmcg_2020_post = pd.read_csv('finaldata_FMCG_2020.csv')

### Work on a copy so the raw frame stays untouched.
### (`copy(:)` is a syntax error: DataFrame.copy() takes no slice argument.)
df1920 = fmcg_2020_post.copy()

In [None]:
### Merge the 2020 FMCG point-of-sale data with the df19 dataframe on the
### household and the corresponding good, using an inner join so that only
### (household, good) pairs present in both years survive.
### The right-hand side must be the DataFrame object itself, not the string 'df19'.
df1920 = df1920.merge(df19, how='inner', on=['hhkey', 'artikelschluessel'], indicator=True)

### Condition: the record's (hhkey, artikelschluessel) pair exists in both
### dataframes. With how='inner' this is true for every row, so the filter is
### only a safety check.
inner_join = df1920['_merge'] == 'both'

### Drop all records not in both dataframes. Take an explicit copy so that the
### column assignments in later cells do not raise SettingWithCopyWarning.
df1920 = df1920.loc[inner_join].copy()

In [None]:
### 'wert' holds the total value of each point-of-sale purchase in cents;
### convert it to euros by dividing by 100.
### NOTE: this overwrites 'wert' in place, so run this cell only once per load.
df1920['wert'] = df1920['wert'].div(100)

### Unit price of each purchased item = total expenditure on the item divided
### by the number of units bought ('anzahl' is German for quantity).
df1920['item_price'] = df1920['wert'].div(df1920['anzahl'])

In [None]:
### The price basket needs the average unit price each household paid for each
### good in 2020. Only the price is averaged here: the Laspeyres basket holds
### quantities fixed, in our case at the average quantity purchased in 2019 H2.
fmcg_hh_avg_2020_price = (
    df1920
    .groupby(['hhkey', 'artikelschluessel'])['item_price']
    .mean()
)

In [None]:
### Keep one record per (household, good) pair: the first row of each pair in
### the frame's current row order (no sorting is applied here). All other rows
### are dropped and the row index is renumbered from 0.
first_record_per_pair = df1920.groupby(['hhkey', 'artikelschluessel']).head(1)
df1920 = first_record_per_pair.reset_index(drop=True)

In [None]:
### Per (household, good) pair, compute the two ingredients of the price basket:
###   fmcg_2020_part: change in the average price paid, 2020 vs 2019, weighted by
###                   the average 2019 quantity
###   fmcg_2019_part: average 2019 price times average 2019 quantity

### Both parts must cover the same basket of goods. The 2019 averages exist for
### every good bought in 2019, the 2020 average only for goods bought in 2020.
### Aligning the three Series in one frame and dropping incomplete rows keeps
### exactly the pairs that have all three values, so the 2019 part no longer
### includes goods that are missing from the 2020 part.
matched_goods = pd.DataFrame({
    'price_2020': fmcg_hh_avg_2020_price,
    'price_2019': fmcg_hh_avg_2019_price,
    'quantity_2019': fmcg_hh_avg_2019_quantity,
}).dropna()

### NOTE(review): a textbook Laspeyres index uses the price level (or the plain
### price difference), not a log10 difference. The original formula is kept
### unchanged here; confirm that the log10 change is the intended measure.
fmcg_2020_part = (
    np.log10(matched_goods['price_2020']) - np.log10(matched_goods['price_2019'])
) * matched_goods['quantity_2019']
fmcg_2019_part = matched_goods['price_2019'] * matched_goods['quantity_2019']

In [None]:
### Aggregate both parts across all goods of each household before dividing
### 2020 by 2019.

### fmcg_2020_part and fmcg_2019_part are Series indexed by
### (hhkey, artikelschluessel), not columns of df1920, so they are summed over
### the 'hhkey' index level. The 2019 total must use the 2019 part; summing the
### 2020 part twice would make every ratio equal to 1.
### min_count=1 leaves a household as NaN when it has no valid values, instead
### of reporting a misleading total of 0.
fmcg_hh_avg_2020_total = fmcg_2020_part.groupby(level='hhkey').sum(min_count=1)
fmcg_hh_avg_2019_total = fmcg_2019_part.groupby(level='hhkey').sum(min_count=1)

In [None]:
### Compute the price basket per household: the 2020 price change valued at the
### 2019 average quantities, divided by the 2019 average price times quantity.
fmcg1920_pricebasket = fmcg_hh_avg_2020_total / fmcg_hh_avg_2019_total

### We only need to keep one record per household ...
df1920 = df1920.groupby('hhkey').head(1).reset_index(drop=True)

### ... together with its price basket. The basket is a Series indexed by hhkey,
### so look it up by household id and store it as a column; otherwise the saved
### file would not contain the result.
df1920['fmcg1920_pricebasket'] = df1920['hhkey'].map(fmcg1920_pricebasket)

### Save the file. to_csv is a DataFrame method (there is no pd.to_csv), and the
### row index is suppressed with index=False (index_col belongs to read_csv).
df1920.to_csv('FMCG_1920.csv', index=False)