In [None]:
#### Set the working directory so that the bare file names used in the cells below resolve.
#### The data is kept on the local C: drive because the R: drive cannot hold the big input file.
#### NOTE(review): this is a hard-coded, machine-specific Windows path -- adjust it before
#### running the notebook on another machine.

%cd "C:\GfK"

In [None]:
### Import Pandas to read in the csv files

import pandas as pd
import numpy as np

In [None]:
### Read the CSV file with all the FMCG expenditure records for the 2nd half of 2020.
### The file is too big to make a copy, so we do the operations right the first time.
df0 = pd.read_csv('Kaufinformationen_FMCG_2020.csv')

### The first data row actually holds the column names we want to use: promote it to
### the header, drop it from the data and renumber the remaining rows starting at 0.
df0.columns = df0.iloc[0]
df0 = df0.iloc[1:].reset_index(drop=True)
df0.columns.name = None

### Load the household background characteristics. DataFrame.merge needs a DataFrame
### (or a named Series) as its right-hand side, not a file name, so the household file
### has to be read into memory first.
### NOTE(review): the '.csv' extension and a plain CSV layout with an 'hhkey' column are
### assumed by analogy with the purchases file above -- confirm the real file name/format.
households = pd.read_csv('Haushaltsinformationen_FMCG_2020.csv')

### Merge each expenditure record with the background characteristics of its household.
### Many purchase rows map to one household row (many-to-one on 'hhkey'), so we use a
### left join on the household identifier; indicator=True adds a '_merge' column that
### records whether each row was found in both frames.
### NOTE(review): because the header row was read in as data, the columns of df0
### (including 'hhkey') may be stored as strings; the key must have the same dtype in
### both frames for rows to match -- verify before relying on the result.
df = df0.merge(households, how='left', on='hhkey', indicator=True)

### Condition selecting the expenditure records that were matched successfully,
### i.e. the rows where _merge == 'both'.
in_both_df = df['_merge'] == 'both'

### Keep only the successfully merged records.
df = df.loc[in_both_df]


In [None]:
### Prepare the time dimension of the dataframe. The 'datum' column ('datum' is German
### for date) stores dates as YYYYMMDD; we parse it once and derive the calendar fields
### needed for the later analysis from the parsed values.

### Parse 'datum' into pandas datetimes and store the result as the 'datevar' column
purchase_dates = pd.to_datetime(df['datum'], format="%Y%m%d")
df['datevar'] = purchase_dates

### Year, Month and Day are kept as zero-padded strings; Week is the ISO calendar week
df['Year'] = purchase_dates.dt.strftime('%Y')
df['Month'] = purchase_dates.dt.strftime('%m')
df['Week'] = purchase_dates.dt.isocalendar().week
df['Day'] = purchase_dates.dt.strftime('%d')

In [None]:
### Save the merged dataframe to disk so it can be used in the next step.
### to_csv is a DataFrame method (pandas has no module-level pd.to_csv), and the keyword
### that suppresses writing the row index is `index` (`index_col` belongs to read_csv).
df.to_csv('finaldata_FMCG_2020.csv', index=False)