In [6]:
import pandas as pd

from parental_care.config import BLD
from parental_care.config import SRC

In [13]:
wave_number = 1
module = "ep"
SRC / f"sharew{wave_number}/sharew{wave_number}_rel8-0-0_{module}.dta"

PosixPath('/home/sebastian/Projects/parental_care/src/parental_care/sharew1/sharew1_rel8-0-0_ep.dta')

In [29]:
def process_wave(wave_number, country_code, data_modules):
    wave_data = {}
    print(f"\nWave {wave_number}\n")

    for module in data_modules.keys():
        module_file = SRC / f"data/sharew{wave_number}/sharew{wave_number}_rel8-0-0_{module}.dta"

        # Read and filter data
        wave_module = pd.read_stata(module_file)
        wave_module = wave_module[wave_module['country'] == country_code]
        module_vars = ['mergeid'] + data_modules[module]

        # Rename variables if they exist
        lookup = {
            'sp009_1': 'sp009_1sp',
            'sp009_2': 'sp009_2sp',
            'sp009_3': 'sp009_3sp',
            'sp019d2': 'sp019d2sp',
            'sp019d3': 'sp019d3sp',
            'sp019d4': 'sp019d4sp',
            'sp019d5': 'sp019d5sp',
            'sp019d6': 'sp019d6sp',
            'sp019d7': 'sp019d7sp'
        }
        # wave_module.rename(columns=lookup, inplace=True)
        # wave_module.rename(columns=lambda x: x.replace("^re011", "sl_re011"), inplace=True, subset=wave_module.columns.str.startswith("re011"))
        # wave_module.rename(columns=lambda x: x.replace("^re016", "sl_re016"), inplace=True, subset=wave_module.columns.str.startswith("re016"))
        # wave_module.rename(columns=lambda x: x.replace("^re026", "sl_re026"), inplace=True, subset=wave_module.columns.str.startswith("re026"))

        # Create dictionaries for renaming
        column_replacements = {
            **lookup,
            **{col: col.replace("^re011", "sl_re011") for col in wave_module.columns if col.startswith("re011")},
            **{col: col.replace("^re016", "sl_re016") for col in wave_module.columns if col.startswith("re016")},
            **{col: col.replace("^re026", "sl_re026") for col in wave_module.columns if col.startswith("re026")}
        }

        # Rename columns using the dictionary
        wave_module.rename(columns=column_replacements, inplace=True)

        
        # Select columns
        wave_module = wave_module[module_vars]

        wave_data[module] = wave_module

        # Print columns which are not used in specific waves
        list1 = wave_module.columns
        list2 = module_vars

        if not list1.equals(list2):
            print(module)
            print(list(set(list2) - set(list1)))

    # Join all data modules
    merged_data = wave_data[list(data_modules.keys())[0]]
    for i in range(1, len(data_modules)):
        merged_data = merged_data.merge(wave_data[list(data_modules.keys())[i]], on="mergeid", how="outer")

    merged_data['wave'] = wave_number
    return merged_data

In [30]:
# Example usage for wave 1 and country code 12
data_modules = {
    'cv_r': ["int_year", "int_month", "gender", "mobirth", "yrbirth", "age_int", "hhsize"],
    'dn': ['dn002_', 'dn003_', 'dn010_', 'dn041_', 'dn009_', 'dn014_', 'dn015_', 'dn016_',
           'dn026_1', 'dn026_2', 'dn033_1', 'dn033_2', 'dn028_1', 'dn028_2', 'dn030_1', 'dn030_2',
           'dn127_1', 'dn127_2', 'dn032_1', 'dn032_2',
           'dn012d1', 'dn012d2', 'dn012d3', 'dn012d4', 'dn012d5',
           'dn012d6', 'dn012d7', 'dn012d8', 'dn012d9', 'dn012d10',
           'dn012d11', 'dn012d12', 'dn012d13', 'dn012d14', 'dn012d15',
           'dn012d16', 'dn012d17', 'dn012d18', 'dn012d19', 'dn012d20',
           'dn012d95', 'dn012dno', 'dn012dot'],
    'ep': ['ep005_', 'ep002_', 'ep328_', 'ep329_'],
    'sp': ['sp008_', 'sp018_', 'sp009_1', 'sp009_2', 'sp009_3', 'sp010d1_1', 'sp010d1_2', 'sp010d1_3',
           'sp011_1', 'sp011_2', 'sp011_3',
           'sp019d2', 'sp019d3', 'sp019d4', 'sp019d5', 'sp019d6', 'sp019d7'],
    'gv_isced': ['isced1997_r'],
    'ch': ['ch001_']
}

# Add wave number to ep module
wave_number = 1
ep_module = ['ep213_' + str(i) for i in range(1, 17)]
data_modules['ep'] += ep_module

In [31]:
data_modules

{'cv_r': ['int_year',
  'int_month',
  'gender',
  'mobirth',
  'yrbirth',
  'age_int',
  'hhsize'],
 'dn': ['dn002_',
  'dn003_',
  'dn010_',
  'dn041_',
  'dn009_',
  'dn014_',
  'dn015_',
  'dn016_',
  'dn026_1',
  'dn026_2',
  'dn033_1',
  'dn033_2',
  'dn028_1',
  'dn028_2',
  'dn030_1',
  'dn030_2',
  'dn127_1',
  'dn127_2',
  'dn032_1',
  'dn032_2',
  'dn012d1',
  'dn012d2',
  'dn012d3',
  'dn012d4',
  'dn012d5',
  'dn012d6',
  'dn012d7',
  'dn012d8',
  'dn012d9',
  'dn012d10',
  'dn012d11',
  'dn012d12',
  'dn012d13',
  'dn012d14',
  'dn012d15',
  'dn012d16',
  'dn012d17',
  'dn012d18',
  'dn012d19',
  'dn012d20',
  'dn012d95',
  'dn012dno',
  'dn012dot'],
 'ep': ['ep005_',
  'ep002_',
  'ep328_',
  'ep329_',
  'ep213_1',
  'ep213_2',
  'ep213_3',
  'ep213_4',
  'ep213_5',
  'ep213_6',
  'ep213_7',
  'ep213_8',
  'ep213_9',
  'ep213_10',
  'ep213_11',
  'ep213_12',
  'ep213_13',
  'ep213_14',
  'ep213_15',
  'ep213_16'],
 'sp': ['sp008_',
  'sp018_',
  'sp009_1',
  'sp009_2',
 

In [32]:
# Retrospective wave 3 variables
re_vars = [
    f"sl_re011_{i}" for i in range(1, 21)] + [
    f"sl_re016_{i}" for i in range(1, 21)] + [
    f"sl_re026_{i}" for i in range(1, 21)] + [
    f"sl_re018_{i}" for i in range(1, 8)] + [
    f"sl_re020_{i}" for i in range(1, 8)]

# Data modules for wave 3
data_modules_wave3 = {
    'cv_r': ["int_year", "int_month", "gender", "mobirth", "yrbirth", "age_int", "hhsize"],
    're': re_vars
}

# Separate modules for partly retrospective wave 7
data_modules_wave7 = data_modules.copy()
data_modules_wave7['re'] = re_vars

In [33]:
wave1_data = process_wave(wave_number=1, country_code=12, data_modules=data_modules)


Wave 1

cv_r
[]


KeyError: "['dn041_', 'dn127_1', 'dn127_2', 'dn012d15', 'dn012d16', 'dn012d17', 'dn012d18', 'dn012d19', 'dn012d20'] not in index"