In [None]:
#### Set the working directory so that the relative file names used below resolve.
#### The data is kept on the local C: drive because the R: drive cannot hold files this large.
#### NOTE(review): this is a hard-coded absolute Windows path, so the notebook only runs
#### on a machine that has this folder.

%cd "C:\GfK"

In [None]:
### Import NumPy and Pandas

import pandas as pd
import numpy as np

In [None]:
### Load the initial file with all the point-of-sale FMCG purchases. The goal of this
### notebook is to aggregate all of these transactions (2nd half of 2020) per household.

### Read in the FMCG_2020 file
fmcg_2020_post = pd.read_csv('finaldata_FMCG_2020.csv')

### Work on a copy so the frame as read from disk stays untouched.
### (DataFrame.copy() takes no slice argument; the previous `copy(:)` was a syntax error.)
df = fmcg_2020_post.copy()

In [None]:
### 'wert' holds the total value of each point-of-sale purchase in cents;
### convert it to euros by dividing every value in the column by 100.
### NOTE: the column is overwritten from itself, so re-running this cell on its own
### divides by 100 a second time.

df['wert'] = df['wert'].div(100)

In [None]:
### Aggregate all point-of-sale FMCG transactions per household: sum the 'wert' column
### within each household identifier 'hhkey'. The result, fmcg_hh_agg_values, is a
### dataframe with exactly 1 record per household holding the aggregated FMCG expenditure.

### The aggregated column is named 'fmcg_2_euro' (instead of 'wert') so that it does not
### collide with the transaction-level 'wert' column in the merge below.
### groupby(...)['wert'].sum() returns a Series, so it is renamed via its name and then
### reset_index() turns 'hhkey' back into a regular column, giving a two-column dataframe.
fmcg_hh_agg_values = (
    df.groupby('hhkey')['wert']
      .sum()
      .rename('fmcg_2_euro')
      .reset_index()
)


In [None]:
### We go back to the dataframe with all the transactions, df.
### Per the analysis design, the background household information is the same on every
### transaction row of a household, so 1 record per household is enough. That single
### record is what gets merged with the per-household aggregate fmcg_hh_agg_values
### (2nd half of 2020), which also has 1 row per household.

### Sort the records so that all records of one household follow one another.
### sort_values returns a NEW dataframe, so the result must be assigned (previously it
### was discarded and the sort had no effect). A stable sort ('mergesort') keeps the
### original row order within each household, so "first record" is well-defined.
df = df.sort_values('hhkey', kind='mergesort')

### Keep only the first record of each household and drop the rest
df = df.groupby('hhkey').head(1).reset_index(drop=True)

In [None]:
### Merge the one-record-per-household dataframe with the per-household aggregate.
### The dataframe object itself is passed to merge (previously the string
### 'fmcg_hh_agg_values' was passed, which merge rejects).

fmcg_all_one_per_hh = df.merge(fmcg_hh_agg_values, how='inner', on='hhkey', indicator=True)

### Condition: keep only household records present in both dataframes.
### With how='inner' every row is already flagged 'both', so this acts as a sanity guard.
inner_join = fmcg_all_one_per_hh['_merge'] == 'both'

### Drop all records that do not satisfy the condition above
fmcg_all_one_per_hh = fmcg_all_one_per_hh.loc[inner_join]

### Save the file to be used in the next step.
### to_csv is a DataFrame method (there is no pd.to_csv), and the row index is
### suppressed with index=False (index_col is a read_csv argument, not a to_csv one).
### NOTE(review): this is the same file name that is read at the top of the notebook,
### so saving overwrites the transaction-level input - confirm this is intended.
fmcg_all_one_per_hh.to_csv('finaldata_FMCG_2020.csv', index=False)