In [174]:
import pandas as pd
import numpy as np
import re

from parental_care.config import BLD
from parental_care.config import SRC

In [2]:
def filter_nested_dict(original_dict, keys_to_remove):
    return {
        key: [value for value in values if value not in keys_to_remove.get(key, [])]
        if key in keys_to_remove
        else values
        for key, values in original_dict.items()
    }

In [3]:
def process_wave(wave_number, data_modules):
    wave_data = {}

    for module in data_modules.keys():
        print(f"module: {module}")
        module_file = (
            SRC / f"data/sharew{wave_number}/sharew{wave_number}_rel8-0-0_{module}.dta"
        )

        # Read and filter
        if module == "re" and wave_number == 7:
            wave_module = pd.read_stata(module_file, convert_categoricals=False)
            wave_module = wave_module[wave_module["country"] == 12]

            #lookup = {
            #    f"re{var[2:]}": f"sl{var[2:]}"
            #    for var in data_modules["re"]
            #    if var.startswith("sl")
            #}
            lookup = {
                f"{var[3:]}": f"{var}"
                for var in data_modules["re"]
                if var.startswith("sl")
            }
        else:
            wave_module = pd.read_stata(module_file, convert_categoricals=True)
            wave_module = wave_module[wave_module["country"] == "Germany"]

            lookup = {
                "sp009_1sp": "sp009_1",
                "sp009_2sp": "sp009_2",
                "sp009_3sp": "sp009_3",
                "sp019d2sp": "sp019d2",
                "sp019d3sp": "sp019d3",
                "sp019d4sp": "sp019d4",
                "sp019d5sp": "sp019d5",
                "sp019d6sp": "sp019d6",
                "sp019d7sp": "sp019d7",
            }

        # Rename columns using the dictionary
        wave_module.rename(columns=lookup, inplace=True)

        module_vars = ["mergeid"] + data_modules[module]

        # Select columns
        wave_module = wave_module[module_vars]

        wave_data[module] = wave_module

        print(wave_module.shape)

    add_wealth_data = "gv_imputations" in data_modules
    merged_data = wave_data["cv_r"]

    data_modules.pop("cv_r")
    data_modules.pop("gv_imputations", None)

    for module_key in data_modules.keys():
        merged_data = merged_data.merge(
            wave_data[module_key], on="mergeid", how="outer"
        )

    if add_wealth_data:
        merged_data = merged_data.merge(
            wave_data["gv_imputations"], on="mergeid", how="left"
        )

    merged_data["wave"] = wave_number

    return merged_data

In [4]:
all_variables = {
    "cv_r": [
        "int_year",
        "int_month",
        "gender",
        "mobirth",
        "yrbirth",
        "age_int",
        "hhsize",
    ],
    "dn": [
        "dn002_",
        "dn003_",
        "dn010_",
        "dn041_",
        "dn009_",
        "dn014_",
        "dn015_",
        "dn016_",
        "dn026_1",
        "dn026_2",
        "dn033_1",
        "dn033_2",
        "dn028_1",
        "dn028_2",
        "dn030_1",
        "dn030_2",
        "dn127_1",
        "dn127_2",
        "dn032_1",
        "dn032_2",
        "dn012d1",
        "dn012d2",
        "dn012d3",
        "dn012d4",
        "dn012d5",
        "dn012d6",
        "dn012d7",
        "dn012d8",
        "dn012d9",
        "dn012d10",
        "dn012d11",
        "dn012d12",
        "dn012d13",
        "dn012d14",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
        #"dn012d95",
        "dn012dno",
        "dn012dot",
    ],
    "ep": [
        "ep005_",
        "ep002_",
        "ep328_",
        "ep329_",
        "ep213_1",
        #"ep213_2",
        #"ep213_3",
        #"ep213_4",
        #"ep213_5",
        #"ep213_6",
        #"ep213_7",
        #"ep213_8",
        #"ep213_9",
        #"ep213_10",
        #"ep213_11",
        #"ep213_12",
        #"ep213_13",
        #"ep213_14",
        #"ep213_15",
        #"ep213_16",
    ],
    "sp": [
        "sp008_",
        "sp018_",
        "sp009_1",
        "sp009_2",
        "sp009_3",
        "sp010d1_1",
        "sp010d1_2",
        "sp010d1_3",
        "sp011_1",
        "sp011_2",
        "sp011_3",
        "sp019d2",
        "sp019d3",
        "sp019d4",
        "sp019d5",
        "sp019d6",
        "sp019d7",
    ],
    "gv_isced": ["isced1997_r"],
    #"gv_imputations": [
    #    "hnetw"
    #],  # household net worth = total gross financial assets + total real assets - total libailities
    "ch": ["ch001_"],
}

In [5]:
keys_to_remove_wave1 = {
    "dn": [
        "dn041_",
        "dn127_1",
        "dn127_2",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
    ],
    "ep": [
        "ep328_",
        "ep329_",
        "ep213_12",
        "ep213_13",
        "ep213_14",
        "ep213_15",
        "ep213_16",
    ],
}

keys_to_remove_wave2 = {
    "dn": [
        "dn127_1",
        "dn127_2",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
    ],
}


keys_to_remove_wave4 = {
    "dn": [
        "dn127_1",
        "dn127_2",
        "dn012d14",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
    ],
    "sp": [
        "sp010d1_1",
        "sp010d1_2",
        "sp010d1_3",
    ],
}


keys_to_remove_wave5 = {
    "dn": [
        "dn127_1",
        "dn127_2",
        "dn012d20",
        "dn012dno",
    ],
    "sp": [
        "sp010d1_1",
        "sp010d1_2",
        "sp010d1_3",
    ],
}

keys_to_remove_wave6 = {
    "dn": [
        "dn012dno",
    ],
    "ep": ["ep213_14", "ep213_15", "ep213_16"],
}

keys_to_remove_wave7 = {
    "dn": [
        "dn012dno",
    ],
    "ep": ["ep213_14", "ep213_15", "ep213_16"],
    "re": [
        "sl_re018_1",
        "sl_re018_2",
        "sl_re018_3",
        "sl_re018_4",
        "sl_re018_5",
        "sl_re018_6",
        "sl_re018_7",
        "sl_re020_1",
        "sl_re020_2",
        "sl_re020_3",
        "sl_re020_4",
        "sl_re020_5",
        "sl_re020_6",
        "sl_re020_7",
    ],
}

keys_to_remove_wave8 = {
    "dn": [
        "dn012dno",
    ],
    "ep": ["ep213_14", "ep213_15", "ep213_16"],
}

In [6]:
# Retrospective waves
re_vars = (
    [f"sl_re011_{i}" for i in range(1, 21)]
    + [f"sl_re016_{i}" for i in range(1, 21)]
    + [f"sl_re026_{i}" for i in range(1, 21)]
    + [f"sl_re018_{i}" for i in range(1, 8)]
    + [f"sl_re020_{i}" for i in range(1, 8)]
)

# Data modules for wave 3
variables_wave3 = {
    "cv_r": [
        "int_year",
        "int_month",
        "gender",
        "mobirth",
        "yrbirth",
        "age_int",
        "hhsize",
    ],
    "re": re_vars,
}

# Separate modules for partly retrospective wave 7
variables_wave7 = filter_nested_dict(all_variables | {"re": re_vars}, keys_to_remove_wave7) 

In [7]:
variables_wave1 = filter_nested_dict(all_variables, keys_to_remove_wave1)
variables_wave2 = filter_nested_dict(all_variables, keys_to_remove_wave2)
variables_wave4 = filter_nested_dict(all_variables, keys_to_remove_wave4)
variables_wave5 = filter_nested_dict(all_variables, keys_to_remove_wave5)
variables_wave6 = filter_nested_dict(all_variables, keys_to_remove_wave6)
variables_wave8 = filter_nested_dict(all_variables, keys_to_remove_wave8)

In [8]:
wave1 = process_wave(wave_number=1, data_modules=variables_wave1)
wave2 = process_wave(wave_number=2, data_modules=variables_wave2)
wave3 = process_wave(wave_number=3, data_modules=variables_wave3)
wave4 = process_wave(wave_number=4, data_modules=variables_wave4)
wave5 = process_wave(wave_number=5, data_modules=variables_wave5)
wave6 = process_wave(wave_number=6, data_modules=variables_wave6)
wave7 = process_wave(wave_number=7, data_modules=variables_wave7)
wave8 = process_wave(wave_number=8, data_modules=variables_wave8)

module: cv_r
(3920, 8)
module: dn
(2995, 34)
module: ep
(2995, 4)
module: sp
(2995, 18)
module: gv_isced
(2995, 2)
module: ch
(2995, 2)
module: cv_r
(3504, 8)
module: dn
(2628, 35)
module: ep
(2628, 6)
module: sp
(2628, 18)
module: gv_isced
(2628, 2)
module: ch
(2628, 2)
module: cv_r
(2501, 8)
module: re
(1918, 75)
module: cv_r
(2146, 8)
module: dn
(1619, 34)
module: ep
(1619, 6)
module: sp
(1619, 15)
module: gv_isced
(1619, 2)
module: ch
(1619, 2)
module: cv_r
(7674, 8)
module: dn
(5750, 39)
module: ep
(5750, 6)
module: sp
(5750, 15)
module: gv_isced
(5750, 2)
module: ch
(5750, 2)
module: cv_r
(5787, 8)
module: dn
(4411, 42)
module: ep
(4411, 6)
module: sp
(4411, 18)
module: gv_isced
(4411, 2)
module: ch
(4411, 2)
module: cv_r
(4928, 8)
module: dn
(3820, 42)
module: ep
(3820, 6)
module: sp
(3820, 18)
module: gv_isced
(3820, 2)
module: ch
(3820, 2)
module: re
(3820, 61)
module: cv_r
(4202, 8)
module: dn
(2878, 42)
module: ep
(2878, 6)
module: sp
(2878, 18)
module: gv_isced
(2878, 2)
mo

In [102]:
waves_list = [wave1, wave2, wave3, wave4, wave5, wave6, wave7, wave8]

# Drop all nan rows
for i, df in enumerate(waves_list):
    waves_list[i] = df.dropna(how='all', axis=0, inplace=False)
    #waves_list[i] = df.dropna(axis=1, how='all')

In [103]:
def merge_wave_datasets(wave_datasets):
    # Combine the data frames in wave_datasets into one data frame
    #     combined_data = pd.concat(wave_datasets)
    combined_data = pd.concat(wave_datasets, axis=0, ignore_index=True)

    # Filter out rows where the 'int_year' column is not equal to -9
    #combined_data = combined_data[combined_data["int_year"] != -9]
    combined_data = combined_data[combined_data["int_year"] != "Not applicable"]

    # Sort the data frame by 'mergeid' and 'int_year'
    combined_data = combined_data.sort_values(by=["mergeid", "int_year"])

    return combined_data

In [104]:
data = merge_wave_datasets(waves_list)

  combined_data = pd.concat(wave_datasets, axis=0, ignore_index=True)


In [105]:
data.shape

(26593, 149)

In [106]:
sum(pd.crosstab(data["int_year"], columns='Count')["Count"])

26590

In [107]:
len(list(data))

149

In [108]:
nan_dropped = [
    "mergeid",
    "int_year",
    "int_month",
    "gender",
    "mobirth",
    "yrbirth",
    "age_int",
    "hhsize",
    "dn002_",
    "dn003_",
    "dn010_",
    "dn009_",
    "dn014_",
    "dn015_",
    "dn016_",
    "dn026_1",
    "dn026_2",
    "dn033_1",
    "dn033_2",
    "dn028_1",
    "dn028_2",
    "dn030_1",
    "dn030_2",
    "dn032_1",
    "dn032_2",
    "dn012d1",
    "dn012d2",
    "dn012d3",
    "dn012d4",
    "dn012d5",
    "dn012d6",
    "dn012d7",
    "dn012d8",
    "dn012d9",
    "dn012d10",
    "dn012d11",
    "dn012d12",
    "dn012d13",
    "dn012d14",
    "dn012dno",
    "dn012dot",
    "ep005_",
    "ep002_",
    "ep213_1",
    "sp008_",
    "sp018_",
    "sp009_1",
    "sp009_2",
    "sp009_3",
    "sp010d1_1",
    "sp010d1_2",
    "sp010d1_3",
    "sp011_1",
    "sp011_2",
    "sp011_3",
    "sp019d2",
    "sp019d3",
    "sp019d4",
    "sp019d5",
    "sp019d6",
    "sp019d7",
    "isced1997_r",
    "ch001_",
    "wave",
    "dn041_",
    "ep328_",
    "ep329_",
    "sl_re011_1",
    "sl_re011_2",
    "sl_re011_3",
    "sl_re011_4",
    "sl_re011_5",
    "sl_re011_6",
    "sl_re011_7",
    "sl_re011_8",
    "sl_re011_9",
    "sl_re011_10",
    "sl_re011_11",
    "sl_re016_1",
    "sl_re016_2",
    "sl_re016_3",
    "sl_re016_4",
    "sl_re016_5",
    "sl_re016_6",
    "sl_re016_7",
    "sl_re016_8",
    "sl_re016_9",
    "sl_re016_10",
    "sl_re016_11",
    "sl_re026_1",
    "sl_re026_2",
    "sl_re026_3",
    "sl_re026_4",
    "sl_re026_5",
    "sl_re026_6",
    "sl_re026_7",
    "sl_re026_8",
    "sl_re026_9",
    "sl_re026_10",
    "sl_re026_11",
    "sl_re018_1",
    "sl_re018_2",
    "sl_re018_3",
    "sl_re018_4",
    "sl_re018_5",
    "sl_re018_6",
    "sl_re018_7",
    "sl_re020_1",
    "sl_re020_2",
    "sl_re020_3",
    "sl_re020_4",
    "sl_re020_5",
    "sl_re020_6",
    "sl_re020_7",
    "dn012d15",
    "dn012d16",
    "dn012d17",
    "dn012d18",
    "dn012d19",
    "dn127_1",
    "dn127_2",
    "dn012d20",
    "sl_re011_12",
    "sl_re011_13",
    "sl_re016_12",
    "sl_re016_13",
    "sl_re026_12",
    "sl_re026_13",
]

In [111]:
set_empty_cols = {
    "sl_re011_14",
    "sl_re011_15",
    "sl_re011_16",
    "sl_re011_17",
    "sl_re011_18",
    "sl_re011_19",
    "sl_re011_20",
    "sl_re016_14",
    "sl_re016_15",
    "sl_re016_16",
    "sl_re016_17",
    "sl_re016_18",
    "sl_re016_19",
    "sl_re016_20",
    "sl_re026_14",
    "sl_re026_15",
    "sl_re026_16",
    "sl_re026_17",
    "sl_re026_18",
    "sl_re026_19",
    "sl_re026_20",
}

In [99]:
r_list

['mergeid',
 'int_year',
 'int_month',
 'gender',
 'mobirth',
 'yrbirth',
 'age_int',
 'hhsize',
 'dn002_',
 'dn003_',
 'dn010_',
 'dn009_',
 'dn014_',
 'dn015_',
 'dn016_',
 'dn026_1',
 'dn026_2',
 'dn033_1',
 'dn033_2',
 'dn028_1',
 'dn028_2',
 'dn030_1',
 'dn030_2',
 'dn032_1',
 'dn032_2',
 'dn012d1',
 'dn012d2',
 'dn012d3',
 'dn012d4',
 'dn012d5',
 'dn012d6',
 'dn012d7',
 'dn012d8',
 'dn012d9',
 'dn012d10',
 'dn012d11',
 'dn012d12',
 'dn012d13',
 'dn012d14',
 'dn012d15',
 'dn012d16',
 'dn012d17',
 'dn012d18',
 'dn012d19',
 'dn012d20',
 'dn012dno',
 'dn012dot',
 'ep005_',
 'ep002_',
 'ep213_1',
 'sp008_',
 'sp018_',
 'sp009_1',
 'sp009_2',
 'sp009_3',
 'sp010d1_1',
 'sp010d1_2',
 'sp010d1_3',
 'sp011_1',
 'sp011_2',
 'sp011_3',
 'sp019d2',
 'sp019d3',
 'sp019d4',
 'sp019d5',
 'sp019d6',
 'sp019d7',
 'isced1997_r',
 'ch001_',
 'wave',
 'dn041_',
 'ep328_',
 'ep329_',
 'sl_re011_1',
 'sl_re011_2',
 'sl_re011_3',
 'sl_re011_4',
 'sl_re011_5',
 'sl_re011_6',
 'sl_re011_7',
 'sl_re011_8'

In [208]:
def table(df_col):
    return pd.crosstab(df_col, columns='Count')["Count"]

# 2) Data preparation

## Discrete state variables

###### states which are interpolated
- labor market experience (0, 30)
- years in retirement (0, 6)
- years in intensive care (0, 5)
- father age (70, 90)
- mother age (70, 90)
###### states which are not interpolated
- individuals’ type (1, 2)
- father died last period (0, 1)
- mother died last period (0, 1)
- father alive (0, 1)
- mother alive (0, 1)
- health of father (1, 2, 3)
- health of mother (1, 2, 3)
- existence of siblings (0, 1) --> **change to existence of sister?**
- parents live close by (0, 1)
- married (0, 1)
- education (low, high)

In [295]:
# only females
dat = data.copy()

# Filter for females
dat = dat[dat['gender'] == "Female"]
dat.shape

(14153, 149)

In [296]:
dat["dn012d1"].describe()

count             4726
unique               4
top       Not selected
freq              3256
Name: dn012d1, dtype: object

In [297]:
table(dat["dn012d1"])

dn012d1
Refusal            1
Don't know         2
Not selected    3256
Selected        1467
Name: Count, dtype: int64

In [298]:
# Create a mapping dictionary from month names to integers
month_mapping = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

# Convert 'int_month' and 'mobirth' columns to integers
dat['int_month'] = dat['int_month'].map(month_mapping)
dat['mobirth'] = dat['mobirth'].map(month_mapping)

In [299]:
dat["age"] = np.where(
    dat["int_month"] >= dat["mobirth"],
    dat["int_year"] - dat["yrbirth"],
    dat["int_year"] - dat["yrbirth"] - 1,
)

In [300]:
dat["age"]

1          80
2          51
3920     53.0
7424     56.0
9925     58.0
         ... 
34659      71
19742    61.0
25530    64.0
30458    66.0
34660      68
Name: age, Length: 14153, dtype: object

In [301]:
# Rename the 'dn041_' column to 'years_educ'
dat.rename(columns={'dn041_': 'years_educ'}, inplace=True)

In [302]:
table(dat["years_educ"])

years_educ
0.0                              6
1.0                              7
2.0                             27
3.0                             60
4.0                             11
5.0                             26
6.0                             26
7.0                             31
8.0                            388
9.0                            157
10.0                           305
11.0                           727
12.0                           679
13.0                           499
14.0                           230
15.0                           214
16.0                           189
17.0                           117
18.0                            90
19.0                            57
20.0                            42
21.0                            23
22.0                            14
23.0                            15
24.0                             1
25.0                             8
1953.0                           1
1970.0                           1
Don't kno

In [303]:
# Use errors='coerce' to convert non-numeric values to NaN
dat['years_educ'] = pd.to_numeric(dat['years_educ'], errors='coerce')

# Use boolean indexing to drop rows with NaN values in 'years_educ'
dat = dat.dropna(subset=['years_educ'])

In [304]:
dat = dat[(dat['years_educ'] <= 25) | dat['years_educ'].isna()]

In [305]:
dat.shape

(3949, 150)

In [306]:
# Replace negative values in 'years_educ' with NaN and create 'high_educ' column
# dat['years_educ'] = np.where(dat['years_educ'] < 0, np.nan, dat['years_educ'])
dat['high_educ'] = np.where(dat['years_educ'] >= 15, 1, 0)

In [307]:
dat.shape

(3949, 151)

In [308]:
table(dat["dn012d1"])

dn012d1
Refusal            1
Don't know         2
Not selected    2360
Selected         720
Name: Count, dtype: int64

In [309]:
# Create a list of further education columns
further_educ = [
    "dn012d1",
    "dn012d2",
    "dn012d3",
    "dn012d4",
    "dn012d5",
    "dn012d6",
    "dn012d7",
    "dn012d8",
    "dn012d9",
    "dn012d10",
    "dn012d11",
    "dn012d12",
    "dn012d13",
    "dn012d14",
    "dn012d15",
    "dn012d16",
    "dn012d17",
    "dn012d18",
    "dn012d19",
    "dn012d20",
    #'dn012d95' # currently in education --> not needed
]

In [310]:
for var in further_educ:
    print(table(dat[var]))

dn012d1
Refusal            1
Don't know         2
Not selected    2360
Selected         720
Name: Count, dtype: int64
dn012d2
Refusal            1
Don't know         2
Not selected    2925
Selected         155
Name: Count, dtype: int64
dn012d3
Refusal            1
Don't know         2
Not selected    2950
Selected         130
Name: Count, dtype: int64
dn012d4
Refusal            1
Don't know         2
Not selected    2851
Selected         229
Name: Count, dtype: int64
dn012d5
Refusal            1
Don't know         2
Not selected    3004
Selected          76
Name: Count, dtype: int64
dn012d6
Refusal            1
Don't know         2
Not selected    2514
Selected         566
Name: Count, dtype: int64
dn012d7
Refusal            1
Don't know         2
Not selected    2384
Selected         696
Name: Count, dtype: int64
dn012d8
Refusal            1
Don't know         2
Not selected    2964
Selected         116
Name: Count, dtype: int64
dn012d9
Don't know         2
Not selected    2979
Refusa

In [311]:
table(dat['dn012dno'])

dn012dno
Not selected    440
Selected        120
Name: Count, dtype: int64

In [312]:
table(dat['dn012dot'])

dn012dot
Refusal            1
Don't know         2
Not selected    3069
Selected          11
Name: Count, dtype: int64

In [313]:
((dat["dn012dot"] == "Refusal") | (dat["dn012dot"] == "Don't know")).sum()

3

In [314]:
table(dat["dn012d1"])

dn012d1
Refusal            1
Don't know         2
Not selected    2360
Selected         720
Name: Count, dtype: int64

In [317]:
# Process each 'further_educ' column
for educ in further_educ:
    dat[educ] = np.where(
        (dat[educ] != "Selected") & (dat[educ] != "Not selected"), np.nan, dat[educ]
    )
    #number = int(re.search(r"\d+", educ).group())
    dat[educ] = np.where(dat[educ] == "Selected", 1, dat[educ])
    dat[educ] = np.where(dat[educ] == "Not selected", 0, dat[educ])

In [318]:
dat["dn012d20"].isna().sum()

3842

In [321]:
for var in further_educ:
    print(dat[var].sum())

720
155
130
229
76
566
696
116
101
91
63
32
19
67
15
21
17
130
15
0


In [285]:
# Process 'dn012dno' and 'dn012dot' columns
# dat["dn012dno"] = np.where(dat["dn012dno"] == "Not selected", np.nan, dat["dn012dno"])
dat["dn012dot"] = np.where(
    (dat["dn012dot"] == "Refusal") | (dat["dn012dot"] == "Don't know"),
    np.nan,
    dat["dn012dot"],
)
dat["dn012dno"] = np.where(dat["dn012dno"] == "Not selected", 0, dat["dn012dno"])
dat["dn012dno"] = np.where(dat["dn012dno"] == "Selected", 1, dat["dn012dno"])

In [287]:
#dat[list(dat.filter(like='dn012'))] = dat.filter(like='dn012').apply(pd.to_numeric)

In [326]:
def find_max_suffix(row):
    max_suffix = 0
    for col in further_educ:
        if row[col] == 1:
            suffix = int(col.split('dn012d')[-1])
            max_suffix = max(max_suffix, suffix)
    return max_suffix if max_suffix > 0 else np.nan

dat['further_educ_max'] = dat.apply(find_max_suffix, axis=1)

In [327]:
table(dat["further_educ_max"])

further_educ_max
1.0     675
2.0     124
3.0     114
4.0     193
5.0      70
6.0     493
7.0     638
8.0     105
9.0      96
10.0     86
11.0     57
12.0     30
13.0     17
14.0     64
15.0     13
16.0     19
17.0     17
18.0    125
19.0     15
Name: Count, dtype: int64

In [328]:
dat.shape

(3949, 152)

In [331]:
dat["high_educ_012"] = (
    (
        (dat["wave"].isin([1, 2, 4]) & (dat["further_educ_max"] >= 3))
        | (dat["wave"].between(5, 7) & (dat["further_educ_max"] >= 10))
    )
    .astype(int)
    .where(dat["wave"] != 3, None)
)

In [332]:
table(dat["high_educ_012"])

high_educ_012
0    3398
1     551
Name: Count, dtype: int64

In [333]:
# Create a new column "high_educ_comb" based on conditions
dat['high_educ_comb'] = ((dat['high_educ'] == 1) | (dat['high_educ_012'] == 1)).astype(int)

In [334]:
# Count the occurrences of each value in the "high_educ_comb" column
dat['high_educ_comb'].value_counts()

high_educ_comb
0    3018
1     931
Name: count, dtype: int64

In [337]:
table(dat["ep005_"])

ep005_
Refusal                                                                 1
Don't know                                                              1
Retired                                                              1613
Employed or self-employed (including working for family business)    1412
Unemployed                                                            161
Permanently sick or disabled                                          110
Homemaker                                                             555
Other                                                                  41
Name: Count, dtype: int64

In [338]:
table(dat["ep329_"])

ep329_
1919.0         1
1948.0         1
1952.0         1
1955.0         1
1959.0         1
              ..
2016.0         1
2018.0         2
2019.0         2
Don't know    26
Refusal        9
Name: Count, Length: 63, dtype: int64

In [342]:
dat["ep005_"] = np.where(
    (dat["ep005_"] == "Refusal") | (dat["ep005_"] == "Don't know"),
    np.nan,
    dat["ep005_"],
)

In [343]:
table(dat["ep005_"])

ep005_
Employed or self-employed (including working for family business)    1412
Homemaker                                                             555
Other                                                                  41
Permanently sick or disabled                                          110
Retired                                                              1613
Unemployed                                                            161
Name: Count, dtype: int64

In [344]:
dat["retired"] = np.where(
    (dat["ep005_"] == "Retired") | (~dat["ep329_"].isna()),
    1,
    np.where(
        (dat["ep005_"].isna())
        & (dat["ep329_"].isna()),
        np.nan,
        0,
    ),
)

In [345]:
table(dat["retired"])

retired
0.0    2279
1.0    1613
Name: Count, dtype: int64

In [335]:
#dat['dn012_max'] = dat.filter(like='dn012').max(skipna=True)

In [346]:
table(dat['ep329_'])

ep329_
1919.0         1
1948.0         1
1952.0         1
1955.0         1
1959.0         1
              ..
2016.0         1
2018.0         2
2019.0         2
Don't know    26
Refusal        9
Name: Count, Length: 63, dtype: int64

In [347]:
table(dat['ep328_'])

ep328_
Refusal        7
Don't know    31
January       96
February      54
March         74
April         70
May           60
June          60
July          79
August        70
September     81
October       70
November      59
December      49
Name: Count, dtype: int64

In [352]:
table(dat['ep329_'])

ep329_
1919.0     1
1948.0     1
1952.0     1
1955.0     1
1959.0     1
          ..
2013.0    17
2014.0     2
2016.0     1
2018.0     2
2019.0     2
Name: Count, Length: 61, dtype: int64

In [353]:
dat["int_year"]

3920     2006.0
12073      2013
12075      2013
12077      2013
12078      2013
          ...  
19732      2013
19736      2013
7421     2007.0
7423     2006.0
19742      2013
Name: int_year, Length: 3949, dtype: object

In [348]:
dat["ep329_"] = np.where(
    (dat["ep329_"] == "Refusal") | (dat["ep329_"] == "Don't know"),
    np.nan,
    dat["ep329_"],
)
dat["ep328_"] = np.where(
    (dat["ep328_"] == "Refusal") | (dat["ep328_"] == "Don't know"),
    np.nan,
    dat["ep328_"],
)

In [349]:
dat["years_since_retirement"] = np.where(
    (dat["retired"] == 1) | (~dat["ep329_"].isna()),
    dat["int_year"] - dat["ep329_"],
    np.where(dat["retired"] == 0, 0, np.nan),
)

In [351]:
dat["years_since_retirement"]

3920     0.0
12073    0.0
12075    0.0
12077    0.0
12078    0.0
        ... 
19732    0.0
19736    0.0
7421     0.0
7423     0.0
19742    0.0
Name: years_since_retirement, Length: 3949, dtype: object

In [354]:
# Translate the R code to Python with categorical data
dat["married"] = np.where(
    dat["dn014_"].isin(
        [
            "Married and living together with spouse",
            "Married, living separated from spouse",
        ]
    ),
    1,
    np.where(
        dat["dn014_"].isin(
            ["Registered partnership", "Never married", "Divorced", "Widowed"]
        ),
        0,
        np.nan,
    ),
)

dat["in_partnership"] = np.where(
    dat["dn014_"].isin(
        ["Married and living together with spouse", "Registered partnership"]
    ),
    1,
    np.where(
        dat["dn014_"].isin(
            [
                "Married, living separated from spouse",
                "Never married",
                "Divorced",
                "Widowed",
            ]
        ),
        0,
        np.nan,
    ),
)

dat["has_partner"] = np.where(
    (dat["married"] == 1) | (dat["in_partnership"] == 1), 1, 0
)

# Note: The Python code now considers "dn014_" as a categorical variable and maps its values before performing the operations.

In [355]:
table(dat["has_partner"])

has_partner
0    1677
1    2272
Name: Count, dtype: int64

In [356]:
table(dat["married"])

married
0.0     866
1.0    2255
Name: Count, dtype: int64