In [719]:
import pandas as pd
import numpy as np
import re

from parental_care.config import BLD
from parental_care.config import SRC

In [720]:
pd.set_option('display.max_columns', None)

In [721]:
def filter_nested_dict(original_dict, keys_to_remove):
    return {
        key: [value for value in values if value not in keys_to_remove.get(key, [])]
        if key in keys_to_remove
        else values
        for key, values in original_dict.items()
    }

In [722]:
def process_wave(wave_number, data_modules):
    wave_data = {}

    for module in data_modules.keys():
        print(f"module: {module}")
        module_file = (
            SRC / f"data/sharew{wave_number}/sharew{wave_number}_rel8-0-0_{module}.dta"
        )

        # Read and filter
        if module == "re" and wave_number == 7:
            wave_module = pd.read_stata(module_file, convert_categoricals=False)
            wave_module = wave_module[wave_module["country"] == 12]

            #lookup = {
            #    f"re{var[2:]}": f"sl{var[2:]}"
            #    for var in data_modules["re"]
            #    if var.startswith("sl")
            #}
            lookup = {
                f"{var[3:]}": f"{var}"
                for var in data_modules["re"]
                if var.startswith("sl")
            }
        else:
            wave_module = pd.read_stata(module_file, convert_categoricals=False)
            #wave_module = wave_module[wave_module["country"] == "Germany"]
            wave_module = wave_module[wave_module["country"] == 12]

            lookup = {
                "sp009_1sp": "sp009_1",
                "sp009_2sp": "sp009_2",
                "sp009_3sp": "sp009_3",
                "sp019d2sp": "sp019d2",
                "sp019d3sp": "sp019d3",
                "sp019d4sp": "sp019d4",
                "sp019d5sp": "sp019d5",
                "sp019d6sp": "sp019d6",
                "sp019d7sp": "sp019d7",
            }

        # Rename columns using the dictionary
        wave_module.rename(columns=lookup, inplace=True)

        module_vars = ["mergeid"] + data_modules[module]

        # Select columns
        wave_module = wave_module[module_vars]

        wave_data[module] = wave_module

        print(wave_module.shape)

    add_wealth_data = "gv_imputations" in data_modules
    merged_data = wave_data["cv_r"]

    data_modules.pop("cv_r")
    data_modules.pop("gv_imputations", None)

    for module_key in data_modules.keys():
        merged_data = merged_data.merge(
            wave_data[module_key], on="mergeid", how="outer"
        )

    if add_wealth_data:
        merged_data = merged_data.merge(
            wave_data["gv_imputations"], on="mergeid", how="left"
        )

    merged_data["wave"] = wave_number

    return merged_data

In [723]:
all_variables = {
    "cv_r": [
        "int_year",
        "int_month",
        "gender",
        "mobirth",
        "yrbirth",
        "age_int",
        "hhsize",
    ],
    "dn": [
        "dn002_",
        "dn003_",
        "dn010_",
        "dn041_",
        "dn009_",
        "dn014_",
        "dn015_",
        "dn016_",
        "dn026_1",
        "dn026_2",
        "dn033_1",
        "dn033_2",
        "dn027_1",
        "dn027_2",
        "dn028_1",
        "dn028_2",
        "dn030_1",
        "dn030_2",
        "dn127_1",
        "dn127_2",
        "dn032_1",
        "dn032_2",
        "dn012d1",
        "dn012d2",
        "dn012d3",
        "dn012d4",
        "dn012d5",
        "dn012d6",
        "dn012d7",
        "dn012d8",
        "dn012d9",
        "dn012d10",
        "dn012d11",
        "dn012d12",
        "dn012d13",
        "dn012d14",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
        # "dn012d95",
        "dn012dno",
        "dn012dot",
    ],
    "ep": [
        "ep005_",
        "ep002_",
        "ep328_",
        "ep329_",
        "ep213_1",
        # "ep213_2",
        # "ep213_3",
        # "ep213_4",
        # "ep213_5",
        # "ep213_6",
        # "ep213_7",
        # "ep213_8",
        # "ep213_9",
        # "ep213_10",
        # "ep213_11",
        # "ep213_12",
        # "ep213_13",
        # "ep213_14",
        # "ep213_15",
        # "ep213_16",
    ],
    "sp": [
        "sp008_",
        "sp018_",
        "sp009_1",
        "sp009_2",
        "sp009_3",
        "sp010d1_1",
        "sp010d1_2",
        "sp010d1_3",
        "sp011_1",
        "sp011_2",
        "sp011_3",
        # "sp019d1",
        "sp019d2",
        "sp019d3",
        "sp019d4",
        "sp019d5",
        "sp019d6",
        "sp019d7",
        # "sp019d8",
        # "sp019d9",
        # "sp019d10",
        # "sp019d11",
        # "sp019d12",
        # "sp019d13",
        # "sp019d14",
        # "sp019d15",
        # "sp019d16",
        # "sp019d17",
        # "sp019d18",
        # "sp019d19",
        # "sp019d20",
    ],
    "gv_isced": ["isced1997_r"],
    # "gv_imputations": [
    #    "hnetw"
    # ],  # household net worth = total gross financial assets + total real assets - total libailities
    "ch": ["ch001_"],
}

In [724]:
keys_to_remove_wave1 = {
    "dn": [
        "dn041_",
        "dn127_1",
        "dn127_2",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
    ],
    "ep": [
        "ep328_",
        "ep329_",
        "ep213_12",
        "ep213_13",
        "ep213_14",
        "ep213_15",
        "ep213_16",
    ],
}

keys_to_remove_wave2 = {
    "dn": [
        "dn127_1",
        "dn127_2",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
    ],
}


keys_to_remove_wave4 = {
    "dn": [
        "dn127_1",
        "dn127_2",
        "dn012d14",
        "dn012d15",
        "dn012d16",
        "dn012d17",
        "dn012d18",
        "dn012d19",
        "dn012d20",
    ],
    "sp": [
        "sp010d1_1",
        "sp010d1_2",
        "sp010d1_3",
    ],
}


keys_to_remove_wave5 = {
    "dn": [
        "dn127_1",
        "dn127_2",
        "dn012d20",
        "dn012dno",
    ],
    "sp": [
        "sp010d1_1",
        "sp010d1_2",
        "sp010d1_3",
    ],
}

keys_to_remove_wave6 = {
    "dn": [
        "dn012dno",
    ],
    "ep": ["ep213_14", "ep213_15", "ep213_16"],
}

keys_to_remove_wave7 = {
    "dn": [
        "dn012dno",
    ],
    "ep": ["ep213_14", "ep213_15", "ep213_16"],
    "re": [
        "sl_re018_1",
        "sl_re018_2",
        "sl_re018_3",
        "sl_re018_4",
        "sl_re018_5",
        "sl_re018_6",
        "sl_re018_7",
        "sl_re020_1",
        "sl_re020_2",
        "sl_re020_3",
        "sl_re020_4",
        "sl_re020_5",
        "sl_re020_6",
        "sl_re020_7",
    ],
}

keys_to_remove_wave8 = {
    "dn": [
        "dn012dno",
    ],
    "ep": ["ep213_14", "ep213_15", "ep213_16"],
}

In [725]:
# Retrospective waves
re_vars = (
    [f"sl_re011_{i}" for i in range(1, 21)]
    + [f"sl_re016_{i}" for i in range(1, 21)]
    + [f"sl_re026_{i}" for i in range(1, 21)]
    + [f"sl_re018_{i}" for i in range(1, 8)]
    + [f"sl_re020_{i}" for i in range(1, 8)]
)

# Data modules for wave 3
variables_wave3 = {
    "cv_r": [
        "int_year",
        "int_month",
        "gender",
        "mobirth",
        "yrbirth",
        "age_int",
        "hhsize",
    ],
    "re": re_vars,
}

# Separate modules for partly retrospective wave 7
variables_wave7 = filter_nested_dict(all_variables | {"re": re_vars}, keys_to_remove_wave7) 

In [726]:
variables_wave1 = filter_nested_dict(all_variables, keys_to_remove_wave1)
variables_wave2 = filter_nested_dict(all_variables, keys_to_remove_wave2)
variables_wave4 = filter_nested_dict(all_variables, keys_to_remove_wave4)
variables_wave5 = filter_nested_dict(all_variables, keys_to_remove_wave5)
variables_wave6 = filter_nested_dict(all_variables, keys_to_remove_wave6)
variables_wave8 = filter_nested_dict(all_variables, keys_to_remove_wave8)

In [727]:
wave1 = process_wave(wave_number=1, data_modules=variables_wave1)
wave2 = process_wave(wave_number=2, data_modules=variables_wave2)
wave3 = process_wave(wave_number=3, data_modules=variables_wave3)
wave4 = process_wave(wave_number=4, data_modules=variables_wave4)
wave5 = process_wave(wave_number=5, data_modules=variables_wave5)
wave6 = process_wave(wave_number=6, data_modules=variables_wave6)
wave7 = process_wave(wave_number=7, data_modules=variables_wave7)
wave8 = process_wave(wave_number=8, data_modules=variables_wave8)

module: cv_r
(3920, 8)
module: dn
(2995, 36)
module: ep
(2995, 4)
module: sp
(2995, 18)
module: gv_isced
(2995, 2)
module: ch
(2995, 2)
module: cv_r
(3504, 8)
module: dn
(2628, 37)
module: ep
(2628, 6)
module: sp
(2628, 18)
module: gv_isced
(2628, 2)
module: ch
(2628, 2)
module: cv_r
(2501, 8)
module: re
(1918, 75)
module: cv_r
(2146, 8)
module: dn
(1619, 36)
module: ep
(1619, 6)
module: sp
(1619, 15)
module: gv_isced
(1619, 2)
module: ch
(1619, 2)
module: cv_r
(7674, 8)
module: dn
(5750, 41)
module: ep
(5750, 6)
module: sp
(5750, 15)
module: gv_isced
(5750, 2)
module: ch
(5750, 2)
module: cv_r
(5787, 8)
module: dn
(4411, 44)
module: ep
(4411, 6)
module: sp
(4411, 18)
module: gv_isced
(4411, 2)
module: ch
(4411, 2)
module: cv_r
(4928, 8)
module: dn
(3820, 44)
module: ep
(3820, 6)
module: sp
(3820, 18)
module: gv_isced
(3820, 2)
module: ch
(3820, 2)
module: re
(3820, 61)
module: cv_r
(4202, 8)
module: dn
(2878, 44)
module: ep
(2878, 6)
module: sp
(2878, 18)
module: gv_isced
(2878, 2)
mo

In [728]:
waves_list = [wave1, wave2, wave3, wave4, wave5, wave6, wave7, wave8]
#waves_list = [wave1, wave2, wave3, wave4, wave5, wave6, wave7]


# Drop all nan rows
for i, df in enumerate(waves_list):
    waves_list[i] = df.dropna(how='all', axis=0, inplace=False)
    #waves_list[i] = df.dropna(axis=1, how='all')

In [729]:
def merge_wave_datasets(wave_datasets):
    # Combine the data frames in wave_datasets into one data frame
    #     combined_data = pd.concat(wave_datasets)
    combined_data = pd.concat(wave_datasets, axis=0, ignore_index=True)

    # Filter out rows where the 'int_year' column is not equal to -9
    # combined_data = combined_data[combined_data["int_year"] != -9]
    # combined_data = combined_data[combined_data["int_year"] != "Not applicable"]
    combined_data = combined_data[combined_data["int_year"] != -9]

    # Sort the data frame by 'mergeid' and 'int_year'
    combined_data = combined_data.sort_values(by=["mergeid", "int_year"])

    return combined_data

In [730]:
data = merge_wave_datasets(waves_list)

In [731]:
data.shape

(26593, 151)

In [732]:
sum(pd.crosstab(data["int_year"], columns='Count')["Count"])

26590

In [733]:
len(list(data))

151

In [734]:
nan_dropped = [
    "mergeid",
    "int_year",
    "int_month",
    "gender",
    "mobirth",
    "yrbirth",
    "age_int",
    "hhsize",
    "dn002_",
    "dn003_",
    "dn010_",
    "dn009_",
    "dn014_",
    "dn015_",
    "dn016_",
    "dn026_1",
    "dn026_2",
    "dn033_1",
    "dn033_2",
    "dn028_1",
    "dn028_2",
    "dn030_1",
    "dn030_2",
    "dn032_1",
    "dn032_2",
    "dn012d1",
    "dn012d2",
    "dn012d3",
    "dn012d4",
    "dn012d5",
    "dn012d6",
    "dn012d7",
    "dn012d8",
    "dn012d9",
    "dn012d10",
    "dn012d11",
    "dn012d12",
    "dn012d13",
    "dn012d14",
    "dn012dno",
    "dn012dot",
    "ep005_",
    "ep002_",
    "ep213_1",
    "sp008_",
    "sp018_",
    "sp009_1",
    "sp009_2",
    "sp009_3",
    "sp010d1_1",
    "sp010d1_2",
    "sp010d1_3",
    "sp011_1",
    "sp011_2",
    "sp011_3",
    "sp019d2",
    "sp019d3",
    "sp019d4",
    "sp019d5",
    "sp019d6",
    "sp019d7",
    "isced1997_r",
    "ch001_",
    "wave",
    "dn041_",
    "ep328_",
    "ep329_",
    "sl_re011_1",
    "sl_re011_2",
    "sl_re011_3",
    "sl_re011_4",
    "sl_re011_5",
    "sl_re011_6",
    "sl_re011_7",
    "sl_re011_8",
    "sl_re011_9",
    "sl_re011_10",
    "sl_re011_11",
    "sl_re016_1",
    "sl_re016_2",
    "sl_re016_3",
    "sl_re016_4",
    "sl_re016_5",
    "sl_re016_6",
    "sl_re016_7",
    "sl_re016_8",
    "sl_re016_9",
    "sl_re016_10",
    "sl_re016_11",
    "sl_re026_1",
    "sl_re026_2",
    "sl_re026_3",
    "sl_re026_4",
    "sl_re026_5",
    "sl_re026_6",
    "sl_re026_7",
    "sl_re026_8",
    "sl_re026_9",
    "sl_re026_10",
    "sl_re026_11",
    "sl_re018_1",
    "sl_re018_2",
    "sl_re018_3",
    "sl_re018_4",
    "sl_re018_5",
    "sl_re018_6",
    "sl_re018_7",
    "sl_re020_1",
    "sl_re020_2",
    "sl_re020_3",
    "sl_re020_4",
    "sl_re020_5",
    "sl_re020_6",
    "sl_re020_7",
    "dn012d15",
    "dn012d16",
    "dn012d17",
    "dn012d18",
    "dn012d19",
    "dn127_1",
    "dn127_2",
    "dn012d20",
    "sl_re011_12",
    "sl_re011_13",
    "sl_re016_12",
    "sl_re016_13",
    "sl_re026_12",
    "sl_re026_13",
]

In [735]:
set_empty_cols = {
    "sl_re011_14",
    "sl_re011_15",
    "sl_re011_16",
    "sl_re011_17",
    "sl_re011_18",
    "sl_re011_19",
    "sl_re011_20",
    "sl_re016_14",
    "sl_re016_15",
    "sl_re016_16",
    "sl_re016_17",
    "sl_re016_18",
    "sl_re016_19",
    "sl_re016_20",
    "sl_re026_14",
    "sl_re026_15",
    "sl_re026_16",
    "sl_re026_17",
    "sl_re026_18",
    "sl_re026_19",
    "sl_re026_20",
}

In [736]:
def table(df_col):
    return pd.crosstab(df_col, columns='Count')["Count"]

# 2) Data preparation

## Discrete state variables

###### states which are interpolated
- labor market experience (0, 30)
- years in retirement (0, 6)
- years in intensive care (0, 5)
- father age (70, 90)
- mother age (70, 90)
###### states which are not interpolated
- individuals’ type (1, 2)
- father died last period (0, 1)
- mother died last period (0, 1)
- father alive (0, 1)
- mother alive (0, 1)
- health of father (1, 2, 3)
- health of mother (1, 2, 3)
- existence of siblings (0, 1) --> **change to existence of sister?**
- parents live close by (0, 1)
- married (0, 1)
- education (low, high)

In [737]:
# only females
dat = data.copy()

# Filter for females
#dat = dat[dat['gender'] == "Female"]
dat = dat[dat['gender'] == 2]

dat.shape

(14153, 151)

In [738]:
table(dat["dn028_1"] > 0)

dn028_1
False    12743
True      1410
Name: Count, dtype: int64

In [740]:
dat["dn012d1"].describe()

count    4726.000000
mean        0.309564
std         0.464646
min        -2.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: dn012d1, dtype: float64

In [741]:
table(dat["dn012d1"])

dn012d1
-2.0       1
-1.0       2
 0.0    3256
 1.0    1467
Name: Count, dtype: int64

In [742]:
# Age calculation
dat['age'] = dat.apply(lambda row: row['int_year'] - row['yrbirth'] if row['int_month'] >= row['mobirth'] else row['int_year'] - row['yrbirth'] - 1, axis=1)

# Keep only those aged 55 to 68
dat = dat[(dat['age'] >= 55) & (dat['age'] <= 68)]

In [743]:
dat.shape

(6823, 152)

In [744]:
table(dat["age"])

age
55.0    475
56.0    455
57.0    502
58.0    531
59.0    488
60.0    451
61.0    524
62.0    471
63.0    503
64.0    529
65.0    497
66.0    498
67.0    426
68.0    473
Name: Count, dtype: int64

In [745]:
dat["age"].median()

61.0

In [746]:
# Rename 'dn041_' to 'years_educ'
dat.rename(columns={'dn041_': 'years_educ'}, inplace=True)

# Filter rows where 'years_educ' is less than or equal to 25 or is NaN
dat = dat[(dat['years_educ'] <= 25) | dat['years_educ'].isna()]

# Replace negative 'years_educ' values with NaN
dat['years_educ'] = dat['years_educ'].apply(lambda x: np.nan if x < 0 else x)

# Create 'high_educ' column, setting NaN when 'years_educ' is NaN
dat['high_educ'] = np.where(dat['years_educ'].isna(), np.nan, (dat['years_educ'] >= 15).astype(int))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat.rename(columns={'dn041_': 'years_educ'}, inplace=True)


In [747]:
table(dat["high_educ"])

high_educ
0.0    1407
1.0     358
Name: Count, dtype: int64

In [748]:
table(dat["years_educ"])

years_educ
0.0       2
1.0       3
2.0      12
3.0      23
4.0       4
5.0      12
6.0       9
7.0       7
8.0     102
9.0      48
10.0    119
11.0    376
12.0    342
13.0    237
14.0    111
15.0    105
16.0     93
17.0     55
18.0     39
19.0     26
20.0     12
21.0     11
22.0      5
23.0      7
24.0      1
25.0      4
Name: Count, dtype: int64

In [749]:
dat['high_educ'].isna().sum()

5057

In [750]:
table(dat["dn012d1"])

dn012d1
-1.0       1
 0.0    1414
 1.0     637
Name: Count, dtype: int64

In [751]:
table(dat["dn012dot"])

dn012dot
-1.0       1
 0.0    2037
 1.0      14
Name: Count, dtype: int64

In [752]:
# Create a list of further education columns
further_educ = [
    "dn012d1",
    "dn012d2",
    "dn012d3",
    "dn012d4",
    "dn012d5",
    "dn012d6",
    "dn012d7",
    "dn012d8",
    "dn012d9",
    "dn012d10",
    "dn012d11",
    "dn012d12",
    "dn012d13",
    "dn012d14",
    "dn012d15",
    "dn012d16",
    "dn012d17",
    "dn012d18",
    "dn012d19",
    "dn012d20",
    #'dn012d95' # currently in education --> not needed
]

In [753]:
for var in further_educ:
    print(table(dat[var]))

dn012d1
-1.0       1
 0.0    1414
 1.0     637
Name: Count, dtype: int64
dn012d2
-1.0       1
 0.0    1883
 1.0     168
Name: Count, dtype: int64
dn012d3
-1.0       1
 0.0    1924
 1.0     127
Name: Count, dtype: int64
dn012d4
-1.0       1
 0.0    1919
 1.0     132
Name: Count, dtype: int64
dn012d5
-1.0       1
 0.0    1958
 1.0      93
Name: Count, dtype: int64
dn012d6
-1.0       1
 0.0    1814
 1.0     237
Name: Count, dtype: int64
dn012d7
-1.0       1
 0.0    1724
 1.0     327
Name: Count, dtype: int64
dn012d8
-1.0       1
 0.0    2002
 1.0      49
Name: Count, dtype: int64
dn012d9
-1.0       1
 0.0    2006
 1.0      45
Name: Count, dtype: int64
dn012d10
-1.0       1
 0.0    2014
 1.0      37
Name: Count, dtype: int64
dn012d11
-1.0       1
 0.0    2026
 1.0      25
Name: Count, dtype: int64
dn012d12
-1.0       1
 0.0    2042
 1.0       9
Name: Count, dtype: int64
dn012d13
-1.0       1
 0.0    2045
 1.0       6
Name: Count, dtype: int64
dn012d14
-1.0       1
 0.0    2022
 1.0      25

In [754]:
# Process each 'further_educ' column
for educ in further_educ:
    dat[educ] = np.where(dat[educ] < 0, np.nan, dat[educ])
    number = int(re.search(r"\d+", educ).group())
    dat[educ] = np.where(dat[educ] == number, 1, dat[educ])

In [755]:
dat["dn012dno"] = np.where(dat["dn012dno"] < 0, np.nan, dat["dn012dno"])
dat["dn012dot"] = np.where(dat["dn012dot"] < 0, np.nan, dat["dn012dot"])
dat["dn012dno"] = np.where(dat["dn012dno"] == 1, 0, dat["dn012dno"])

In [756]:
table(dat["dn012dno"])

dn012dno
0.0    985
Name: Count, dtype: int64

In [757]:
table(dat["dn012dot"])

dn012dot
0.0    2037
1.0      14
Name: Count, dtype: int64

In [758]:
dat["dn012dno"].dtype

dtype('float64')

In [759]:
# Calculate the max for columns starting with 'dn012' for each row
dat["dn012_max"] = dat.loc[:, dat.columns.str.startswith("dn012")].max(axis=1)

In [760]:
table(dat['dn012_max']).sum()

2052

In [761]:
def find_max_suffix(row):
    max_suffix = 0
    for col in further_educ:
        if row[col] == 1:
            suffix = int(col.split('dn012d')[-1])
            max_suffix = max(max_suffix, suffix)
            
    return max_suffix if max_suffix >= 0 else np.nan

dat['further_educ_max'] = dat.apply(find_max_suffix, axis=1)

In [762]:
table(dat['further_educ_max'])

further_educ_max
0     4968
1      584
2      155
3      119
4      119
5       89
6      202
7      302
8       47
9       45
10      35
11      24
12       9
13       5
14      25
15       6
16       9
17      11
18      64
19       4
Name: Count, dtype: int64

In [763]:
# Find columns that start with 'dn012'
dn012_columns = [col for col in dat.columns if col.startswith('dn012')]

# Add a new column 'dn012_max' with the maximum value across 'dn012' columns
dat['dn012_max'] = dat[dn012_columns].max(axis=1)

# If you want to drop the 'dn012' columns after creating 'dn012_max', you can do this:
# df = df.drop(columns=dn012_columns)

In [764]:
table(dat['dn012_max'])

dn012_max
0.0     188
1.0    1864
Name: Count, dtype: int64

In [765]:
# Find columns that start with 'dn012'
dn012_columns = [col for col in df.columns if col.startswith('dn012')]

# Create a new column 'dn012_max' with the maximum value across 'dn012' columns
dat['dn012_max'] = df[dn012_columns].max(axis=1)

# Replace NaN values with 0
dat['dn012_max'].fillna(0, inplace=True)

In [766]:
dat['dn012_max'].unique()

array([0., 1.])

In [767]:
dat['further_educ_max'].unique()

array([ 0,  8,  1, 14, 11, 18,  9,  6,  7,  3,  2,  5, 10,  4, 17, 12, 16,
       19, 13, 15])

In [768]:
table(dat['further_educ_max'])

further_educ_max
0     4968
1      584
2      155
3      119
4      119
5       89
6      202
7      302
8       47
9       45
10      35
11      24
12       9
13       5
14      25
15       6
16       9
17      11
18      64
19       4
Name: Count, dtype: int64

In [769]:
dat['further_educ_max'].describe()

count    6822.000000
mean        1.328789
std         3.099757
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max        19.000000
Name: further_educ_max, dtype: float64

dat["high_educ_012"] = (
    (
        (dat["wave"].isin([1, 2, 4]) & (dat["further_educ_max"] >= 3))
        | (dat["wave"].between(5, 7) & (dat["further_educ_max"] >= 10))
    )
    .astype(int)
    .where(dat["wave"] != 3, None)
)

In [770]:
dat["high_educ_012"] = (
    (
        (dat["wave"].isin([1, 2, 4]) & (dat["further_educ_max"] >= 3))
        | (dat["wave"].between(5, 7) & (dat["further_educ_max"] >= 10))
    )
    .astype(int)
)

dat.loc[dat["further_educ_max"].isna(), "high_educ_012"] = None

In [771]:
table(dat["high_educ_012"])

high_educ_012
0.0    6422
1.0     400
Name: Count, dtype: int64

In [772]:
# Create a new column "high_educ_comb" based on conditions
dat['high_educ_comb'] = ((dat['high_educ'] == 1) | (dat['high_educ_012'] == 1)).astype(int)

In [773]:
# Count the occurrences of each value in the "high_educ_comb" column
dat['high_educ_comb'].value_counts()

high_educ_comb
0    6237
1     585
Name: count, dtype: int64

In [774]:
dat = dat.rename(columns={'ch001_': 'nchild'})
dat['nchild'] = dat['nchild'].apply(lambda x: x if x >= 0 else pd.NA)

In [775]:
table(dat['nchild'])

nchild
0.0      394
1.0      856
2.0     1477
3.0      639
4.0      196
5.0       66
6.0       11
7.0        7
8.0        4
9.0        1
13.0       1
Name: Count, dtype: int64

In [776]:
dat["ep005_"] = np.where(dat["ep005_"] >= 0, dat["ep005_"], np.nan)


def calculate_retired(row):
    if row["ep005_"] == 1 or (not pd.isna(row["ep329_"])):
        return 1
    elif pd.isna(row["ep005_"]) and pd.isna(row["ep329_"]):
        return np.nan
    else:
        return 0


dat["retired"] = dat.apply(calculate_retired, axis=1)

In [777]:
table(dat["ep005_"])

ep005_
1.0     2443
2.0     2360
3.0      254
4.0      222
5.0      830
97.0      83
Name: Count, dtype: int64

In [778]:
table(dat["retired"])

retired
0.0    3749
1.0    2443
Name: Count, dtype: int64

In [779]:
dat['ep329_'] = np.where(dat['ep329_'] >= 0, dat['ep329_'], np.nan)
dat['ep328_'] = np.where(dat['ep328_'] >= 0, dat['ep328_'], np.nan)

def calculate_years_since_retirement(row):
    if row['retired'] == 1 or (not pd.isna(row['ep329_'])):
        return row['int_year'] - row['ep329_']
    elif row['retired'] == 0:
        return 0
    else:
        return np.nan

dat['years_since_retirement'] = dat.apply(calculate_years_since_retirement, axis=1)

In [780]:
table(dat['years_since_retirement'])

years_since_retirement
0.0     3902
1.0      282
2.0      282
3.0      236
4.0      192
5.0      119
6.0      113
7.0       87
8.0       84
9.0       50
10.0      27
11.0      17
12.0      16
13.0      12
14.0      14
15.0      12
16.0       6
17.0       7
18.0       6
20.0       4
21.0       2
22.0       2
23.0       2
24.0       3
26.0       2
34.0       1
45.0       1
46.0       2
47.0       1
Name: Count, dtype: int64

In [781]:
table(dat['years_since_retirement']).sum()

5484

In [782]:
dat["married"] = dat["dn014_"].apply(
    lambda x: 1 if x in [1, 3] else (0 if x in [2, 4, 5, 6] else pd.NA)
)

dat["in_partnership"] = dat["dn014_"].apply(
    lambda x: 1 if x in [1, 2] else (0 if x in [3, 4, 5, 6] else pd.NA)
)

In [783]:
conditions = [
    (dat["married"] == 1) | (dat["in_partnership"] == 1),
    (dat["married"].isna()) & (dat["in_partnership"].isna())
]

choices = [1, np.nan]

dat["has_partner"] = np.select(conditions, choices, default=0)

In [784]:
table(dat["married"])

married
0     528
1    1775
Name: Count, dtype: int64

In [785]:
table(dat["in_partnership"])

in_partnership
0     561
1    1742
Name: Count, dtype: int64

In [786]:
table(dat["has_partner"])

has_partner
0.0     523
1.0    1780
Name: Count, dtype: int64

In [787]:
# Update 'sp008_' to handle negative values
dat["sp008_"] = dat["sp008_"].apply(lambda x: x if x >= 0 else pd.NA)

# Update 'sp009_1', 'sp009_2', and 'sp009_3' to handle negative values
columns_to_update = ["sp009_1", "sp009_2", "sp009_3"]
for col in columns_to_update:
    dat[col] = dat[col].apply(lambda x: x if x >= 0 else pd.NA)

In [788]:
table(dat['sp008_'])

sp008_
1.0    1936
5.0    2828
Name: Count, dtype: int64

In [789]:
# Assuming 'dat' is a pandas DataFrame

# Create the 'ever_cared' column
dat["ever_cared"] = np.where(
    (dat["sp008_"] == 1) | (dat["sp018_"] == 1),
    1,
    np.where(
        ((dat["sp008_"] == 5) & (dat["sp018_"] == 5))
        | ((dat["sp008_"] == 5) & dat["sp018_"].isna())
        | (dat["sp008_"].isna() & (dat["sp018_"] == 5)),
        0,
        np.nan,
    ),
)

In [790]:
table(dat['ever_cared'])

ever_cared
0.0    3194
1.0    2157
Name: Count, dtype: int64

In [791]:
# Create the 'ever_cared' column
conditions_ever_cared = [
    (dat["sp008_"] == 1) | (dat["sp018_"] == 1),
    ((dat["sp008_"] == 5) & (dat["sp018_"] == 5))
    | ((dat["sp008_"] == 5) & dat["sp018_"].isna())
    | (dat["sp008_"].isna() & (dat["sp018_"] == 5)),
]

choices_ever_cared = [1, 0]

dat["ever_cared"] = np.select(conditions_ever_cared, choices_ever_cared, default=np.nan)

In [792]:
table(dat["sp008_"])

sp008_
1.0    1936
5.0    2828
Name: Count, dtype: int64

In [793]:
table(dat['ever_cared'])

ever_cared
0.0    3194
1.0    2157
Name: Count, dtype: int64

In [794]:
# Create the 'ever_cared_parents_outside' column
conditions_parents_outside = [
    (dat["sp008_"] == 1)
    & (
        (dat["sp009_1"].isin([2, 3]))
        | (dat["sp009_2"].isin([2, 3]))
        | (dat["sp009_3"].isin([2, 3]))
    ),
    dat["sp008_"].isna(),
]

choices_parents_outside = [1, np.nan]

dat["ever_cared_parents_outside"] = np.select(
    conditions_parents_outside, choices_parents_outside, default=0
)

In [795]:
table(dat['ever_cared_parents_outside'])

ever_cared_parents_outside
0.0    4099
1.0     665
Name: Count, dtype: int64

In [796]:
# Create the 'ever_cared_parents_within' column
conditions_parents_within = [
    (dat["sp018_"] == 1) & ((dat["sp019d2"] == 1) | (dat["sp019d3"] == 1)),
    dat["sp018_"].isna(),
]

choices_parents_within = [1, np.nan]

dat["ever_cared_parents_within"] = np.select(
    conditions_parents_within, choices_parents_within, default=0
)

# Create the 'ever_cared_parents' column
conditions_parents = [
    (dat["ever_cared_parents_outside"] == 1) | (dat["ever_cared_parents_within"] == 1),
    (dat["ever_cared_parents_within"].isna())
    & (dat["ever_cared_parents_outside"].isna()),
]

choices_parents = [1, np.nan]

dat["ever_cared_parents"] = np.select(conditions_parents, choices_parents, default=0)

In [797]:
table(dat["ever_cared_parents_within"])

ever_cared_parents_within
0.0    4441
1.0      72
Name: Count, dtype: int64

In [798]:
table(dat["ever_cared_parents_outside"])

ever_cared_parents_outside
0.0    4099
1.0     665
Name: Count, dtype: int64

In [799]:
table(dat["ever_cared_parents"])

ever_cared_parents
0.0    4623
1.0     728
Name: Count, dtype: int64

In [800]:
table(dat["ever_cared_parents_within"]).sum(), table(dat["ever_cared_parents_outside"]).sum(), table(dat["ever_cared_parents"]).sum()

(4513, 4764, 5351)

In [801]:
# Define conditions and choices for np.select
conditions = [
    (dat["sp018_"] == 1) & ((dat["sp019d2"] == 1) | (dat["sp019d3"] == 1)),
    (dat["sp008_"] == 1)
    & ((dat["sp009_1"] == 2) | (dat["sp009_2"] == 2) | (dat["sp009_3"] == 2)),
    (dat["sp008_"] == 1)
    & ((dat["sp009_1"] == 3) | (dat["sp009_2"] == 3) | (dat["sp009_3"] == 3)),
]

choices = [1, 1, 1]  # Assign 1 if the conditions are met

# Use np.select to create the 'care_in_year' column
dat["care_in_year"] = np.select(conditions, choices, default=0)
# dat.loc[
#    ((dat["sp008_"]).isna() & ((dat["sp019d2"] == 1) | (dat["sp019d3"] == 1))),
#    "care_in_year",
# ] = np.nan

In [802]:
table(dat["sp008_"])

sp008_
1.0    1936
5.0    2828
Name: Count, dtype: int64

In [803]:
table(dat["care_in_year"])

care_in_year
0    6094
1     728
Name: Count, dtype: int64

In [804]:
# not finished!
# add: experience in informal care
# add periods in which person gave informal care (no matter whether outside or within household)
# experience + 1 in next period! (do not count this years informal care as experience)

In [805]:
dat = dat.sort_values(by=['mergeid', 'int_year'], ascending=[True, True])
# On the sorted data set (which should be called dat), how to generate a new variable called 

In [806]:
dat.head(10)

Unnamed: 0,mergeid,int_year,int_month,gender,mobirth,yrbirth,age_int,hhsize,dn002_,dn003_,dn010_,dn009_,dn014_,dn015_,dn016_,dn026_1,dn026_2,dn033_1,dn033_2,dn027_1,dn027_2,dn028_1,dn028_2,dn030_1,dn030_2,dn032_1,dn032_2,dn012d1,dn012d2,dn012d3,dn012d4,dn012d5,dn012d6,dn012d7,dn012d8,dn012d9,dn012d10,dn012d11,dn012d12,dn012d13,dn012d14,dn012dno,dn012dot,ep005_,ep002_,ep213_1,sp008_,sp018_,sp009_1,sp009_2,sp009_3,sp010d1_1,sp010d1_2,sp010d1_3,sp011_1,sp011_2,sp011_3,sp019d2,sp019d3,sp019d4,sp019d5,sp019d6,sp019d7,isced1997_r,nchild,wave,years_educ,ep328_,ep329_,sl_re011_1,sl_re011_2,sl_re011_3,sl_re011_4,sl_re011_5,sl_re011_6,sl_re011_7,sl_re011_8,sl_re011_9,sl_re011_10,sl_re011_11,sl_re011_12,sl_re011_13,sl_re011_14,sl_re011_15,sl_re011_16,sl_re011_17,sl_re011_18,sl_re011_19,sl_re011_20,sl_re016_1,sl_re016_2,sl_re016_3,sl_re016_4,sl_re016_5,sl_re016_6,sl_re016_7,sl_re016_8,sl_re016_9,sl_re016_10,sl_re016_11,sl_re016_12,sl_re016_13,sl_re016_14,sl_re016_15,sl_re016_16,sl_re016_17,sl_re016_18,sl_re016_19,sl_re016_20,sl_re026_1,sl_re026_2,sl_re026_3,sl_re026_4,sl_re026_5,sl_re026_6,sl_re026_7,sl_re026_8,sl_re026_9,sl_re026_10,sl_re026_11,sl_re026_12,sl_re026_13,sl_re026_14,sl_re026_15,sl_re026_16,sl_re026_17,sl_re026_18,sl_re026_19,sl_re026_20,sl_re018_1,sl_re018_2,sl_re018_3,sl_re018_4,sl_re018_5,sl_re018_6,sl_re018_7,sl_re020_1,sl_re020_2,sl_re020_3,sl_re020_4,sl_re020_5,sl_re020_6,sl_re020_7,dn012d15,dn012d16,dn012d17,dn012d18,dn012d19,dn127_1,dn127_2,dn012d20,age,high_educ,dn012_max,further_educ_max,high_educ_012,high_educ_comb,retired,years_since_retirement,married,in_partnership,has_partner,ever_cared,ever_cared_parents_outside,ever_cared_parents_within,ever_cared_parents,care_in_year
7424,DE-000132-01,2009.0,3.0,2,1.0,1953.0,56.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,,,,1971.0,1973.0,1979.0,1993.0,1997.0,,,,,,,,,,,,,,,,1.0,1.0,1.0,2.0,1.0,,,,,,,,,,,,,,,,1973.0,1979.0,1987.0,1997.0,9997.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,56.0,,0.0,0,0.0,0,,,,,,,,,,0
9925,DE-000132-01,2011.0,6.0,2,1.0,1953.0,58.0,2,1.0,1953.0,,,,,,1.0,,3.0,,,,,,7.0,,5.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0
12071,DE-000132-01,2013.0,3.0,2,1.0,1953.0,60.0,2,1.0,1953.0,,,,,,1.0,1.0,3.0,5.0,,,,,7.0,7.0,4.0,4.0,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0
19745,DE-000132-01,2015.0,4.0,2,1.0,1953.0,62.0,3,1.0,1953.0,,,,,,1.0,5.0,3.0,,,86.0,,,7.0,,4.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013.0,,62.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0
25532,DE-000132-01,2017.0,7.0,2,1.0,1953.0,64.0,2,1.0,1953.0,,,,,,5.0,,,,97.0,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,2016.0,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,7,,4.0,2016.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016.0,,,64.0,,0.0,0,0.0,0,1.0,1.0,,,,0.0,0.0,0.0,0.0,0
25534,DE-000554-01,2017.0,4.0,2,10.0,1961.0,55.0,2,10.0,1961.0,,,,,,1.0,5.0,4.0,,,84.0,,,7.0,,6.0,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,3.0,,7,,,,1978.0,1982.0,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,1982.0,9997.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2012.0,,55.0,,0.0,0,0.0,0,0.0,0.0,,,,,,,,0
30460,DE-000554-01,2020.0,1.0,2,10.0,1961.0,58.0,2,10.0,1961.0,,,,,,1.0,,3.0,,,,,,8.0,,6.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,3.0,,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0
12075,DE-000802-01,2013.0,3.0,2,7.0,1951.0,61.0,1,7.0,1951.0,5.0,2.0,6.0,,,5.0,5.0,,,87.0,77.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,2.0,,,1.0,,1.0,,,,,,3.0,,,,,,,,,3.0,2.0,5,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,,,61.0,0.0,0.0,8,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,0
19750,DE-000802-01,2015.0,2.0,2,7.0,1951.0,63.0,1,7.0,1951.0,,,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,1.0,,31.0,,,0.0,,,3.0,,,,,,,,,3.0,2.0,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011.0,1999.0,,63.0,,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,0
25536,DE-000802-01,2017.0,3.0,2,7.0,1951.0,65.0,1,7.0,1951.0,,,,,,5.0,5.0,,,87.0,77.0,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,3.0,,7,,,,1970.0,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,1971.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011.0,1999.0,,65.0,,0.0,0,0.0,0,1.0,,,,,,,,,0


In [807]:
# Calculate cumulative sum for 'care_in_year' within each 'mergeid' group
dat["care_experience"] = (
    dat.groupby("mergeid")["care_in_year"]
    .cumsum()
    .where(dat["care_in_year"] >= 0, np.nan)
)

In [808]:
table(dat['care_experience'])

care_experience
0    5509
1     978
2     252
3      58
4      21
5       3
6       1
Name: Count, dtype: int64

In [809]:
dat['care_experience'].mean()

0.25813544415127526

In [810]:
dat[["mergeid", "int_year", "care_in_year", "care_experience"]]

Unnamed: 0,mergeid,int_year,care_in_year,care_experience
7424,DE-000132-01,2009.0,0,0
9925,DE-000132-01,2011.0,0,0
12071,DE-000132-01,2013.0,0,0
19745,DE-000132-01,2015.0,0,0
25532,DE-000132-01,2017.0,0,0
...,...,...,...,...
30457,DE-999404-02,2017.0,0,0
19742,DE-999990-01,2013.0,0,0
25530,DE-999990-01,2015.0,0,0
30458,DE-999990-01,2017.0,0,0


In [811]:
# outside the household
condition = (
    ((dat["sp009_1"] == 1) & (dat["sp010d1_1"] == 1))
    | ((dat["sp009_2"] == 1) & (dat["sp010d1_2"] == 1))
    | ((dat["sp009_3"] == 1) & (dat["sp010d1_3"] == 1))
)

dat[condition].shape

# need to drop personal (intensive care) INSIDE the houeshold to any other than parent
# need to add variables sp019d1
# rename in waves 4, 5 --> sp/sn above
# variables to add

(24, 168)

In [812]:
dat[(dat["sp019d1"] == 1)]

KeyError: 'sp019d1'

In [952]:

# Define conditions and choices for np.select
conditions_dn026 = [
    (dat['dn026_1'] == 1),
    (dat['dn026_1'] == 5)
]

choices_dn026 = [1, 0]

# Create 'mother_alive' based on 'dn026_1' using np.select
dat['mother_alive'] = np.select(conditions_dn026, choices_dn026, default=np.nan)

# Rename 'dn028_1' to 'age_mother'
dat = dat.rename(columns={'dn028_1': 'age_mother'})

In [953]:
dat["age_mother"].isna().sum()

6271

In [954]:
table(dat["age_mother"])

age_mother
67.0     1
71.0     1
72.0     1
73.0     2
74.0     2
75.0    12
76.0     9
77.0    18
78.0    24
79.0    28
80.0    30
81.0    43
82.0    41
83.0    39
84.0    46
85.0    40
86.0    43
87.0    39
88.0    27
89.0    24
90.0    25
91.0    14
92.0    11
93.0    14
94.0     6
95.0     5
96.0     2
97.0     1
98.0     3
Name: Count, dtype: int64

In [955]:
dat["age_mother"].mean()

84.15063520871144

In [956]:
table(dat["mother_alive"])

mother_alive
0.0    2732
1.0    1810
Name: Count, dtype: int64

In [957]:
# Handle negative values in 'dn033_1' and convert to 0 for Excellent, 1 for Very good, and 2 for the rest
conditions_dn033 = [
    (dat["dn033_1"] == 1) | (dat["dn033_1"] == 2),
    (dat["dn033_1"] == 3) | (dat["dn033_1"] == 4),
    (dat["dn033_1"] == 5),
]

choices_dn033 = [0, 1, 2]

# Create 'health_mother' based on 'dn033_1' using np.select
dat["health_mother"] = np.select(conditions_dn033, choices_dn033, default=np.nan)

# Rename 'health_mother_3' to 'health_mother'
dat = dat.rename(columns={"health_mother_3": "health_mother"})

# Re-map values to 0=good, 1=medium, 2=bad
#dat["health_mother"] = dat["health_mother"].replace({0: 0, 1: 1, 2: 3})

In [958]:
table(dat['dn033_1'] > 0) 

dn033_1
False    4989
True     1833
Name: Count, dtype: int64

In [959]:
table(dat["health_mother"])

health_mother
0.0     179
1.0    1208
2.0     446
Name: Count, dtype: int64

In [960]:
# Handle negative values in 'dn026_2' and create 'father_alive'
conditions_dn026_2 = [
    (dat['dn026_2'] == 1),
    (dat['dn026_2'] == 5)
]

choices_dn026_2 = [1, 0]

dat['father_alive'] = np.select(conditions_dn026_2, choices_dn026_2, default=np.nan)

# Rename 'dn028_2' to 'age_father'
dat = dat.rename(columns={'dn028_2': 'age_father'})

# Handle negative values in 'dn033_2' and create 'health_father_3'
conditions_dn033_2 = [
    (dat['dn033_2'] == 1) | (dat['dn033_2'] == 2),
    (dat['dn033_2'] == 3) | (dat['dn033_2'] == 4),
    (dat['dn033_2'] == 5)
]

choices_dn033_2 = [0, 1, 2]

dat['health_father_3'] = np.select(conditions_dn033_2, choices_dn033_2, default=np.nan)


In [961]:
table(dat['dn033_2'])

dn033_2
-2.0      2
-1.0     12
 1.0     15
 2.0     47
 3.0    171
 4.0    222
 5.0    159
Name: Count, dtype: int64

In [962]:
table(dat['dn033_2'] > 0)

dn033_2
False    6208
True      614
Name: Count, dtype: int64

In [963]:
table(dat['health_father_3'])

health_father_3
0.0     62
1.0    393
2.0    159
Name: Count, dtype: int64

In [964]:
table(dat["father_alive"])

father_alive
0.0    3379
1.0     612
Name: Count, dtype: int64

In [965]:
# Handle negative values in 'dn030_1' and 'dn030_2', and create 'dist_father' and 'dist_mother'
dat['dist_father'] = dat['dn030_2'].apply(lambda x: x if x >= 0 else np.nan)
dat['dist_mother'] = dat['dn030_1'].apply(lambda x: x if x >= 0 else np.nan)

# Create 'parents_live_close' based on distance criteria using np.select
conditions_distance = [
    (dat['dist_father'] <= 4) | (dat['dist_mother'] <= 4)
]

choices_distance = [1]

dat['parents_live_close'] = np.select(conditions_distance, choices_distance, default=0)


In [966]:
table(dat['parents_live_close'])

parents_live_close
0    6016
1     806
Name: Count, dtype: int64

In [967]:
dat['ep002_'] = dat['ep002_'].apply(lambda x: x if x >= 0 else np.nan)

In [968]:
# Create 'worked_last_period' based on conditions
dat['worked_last_period'] = np.where((dat['ep005_'] == 2) | (dat['ep002_'] == 1), 1, 0)

In [969]:
table(dat['worked_last_period'])

worked_last_period
0    3700
1    3122
Name: Count, dtype: int64


Variable Name
ep005_
Variable Label
Current job situation
Dataset
sharew5_rel7-1-0_ep
Variable Type
directly measured

Representation Type
Categories


-2	Refusal
-1	Don't know
1	Retired
2	Employed or self-employed (including working for family business)
3	Unemployed
4	Permanently sick or disabled
5	Homemaker
97	Other

In [970]:
# EP141_ChangeInJob
# EP125_ContWork
# EP006_EverWorked

In [971]:
# Sort the DataFrame by 'mergeid' and 'int_year'
dat = dat.sort_values(by=["mergeid", "int_year"])

# Shift the 'ep005_' and 'ep002_' variables by one period
dat["lagged_ep005_"] = dat.groupby("mergeid")["ep005_"].shift(1)
dat["lagged_ep002_"] = dat.groupby("mergeid")["ep002_"].shift(1)

# Create 'worked_last_period' based on the lagged values
dat["worked_last_period"] = np.where(
    (dat["lagged_ep005_"] == 2) | (dat["ep002_"] == 1), 1, 0
)
#dat["worked_last_period"] = np.where((dat["lagged_ep005_"] == 2), 1, 0)

# Drop the lagged columns if not needed
dat = dat.drop(["lagged_ep005_", "lagged_ep002_"], axis=1)

In [972]:
table(dat["worked_last_period"])

worked_last_period
0    4737
1    2085
Name: Count, dtype: int64

# Fix initial condition!!!

# death of parent since last period


In [973]:
# low share of parent alive in Fischer
# age == nan --> parent dead?
# or can the come "back alive" if nan means simply just missing

In [974]:
# dn127_1 (mother) dn127_2 (father)
# only since wave 6

# could use age / health of mothers to check this
# if data about age / health of mother in period before and now not, assume
# that mother died
# same for father

In [975]:
# Group the data by 'age' and count missing values in 'age_mother'
missing_age_mother = dat[dat['age_mother'].isna()].groupby('age')['age'].count()

# Display the result
print(missing_age_mother)

age
55.0    387
56.0    382
57.0    441
58.0    473
59.0    435
60.0    405
61.0    488
62.0    440
63.0    465
64.0    506
65.0    479
66.0    483
67.0    419
68.0    468
Name: age, dtype: int64


In [976]:
mask = (dat['age_mother'].isna()) & (dat['mother_alive'] == 1)
dat[mask]

Unnamed: 0,mergeid,int_year,int_month,gender,mobirth,yrbirth,age_int,hhsize,dn002_,dn003_,dn010_,dn009_,dn014_,dn015_,dn016_,dn026_1,dn026_2,dn033_1,dn033_2,dn027_1,dn027_2,age_mother,age_father,dn030_1,dn030_2,dn032_1,dn032_2,dn012d1,dn012d2,dn012d3,dn012d4,dn012d5,dn012d6,dn012d7,dn012d8,dn012d9,dn012d10,dn012d11,dn012d12,dn012d13,dn012d14,dn012dno,dn012dot,ep005_,ep002_,ep213_1,sp008_,sp018_,sp009_1,sp009_2,sp009_3,sp010d1_1,sp010d1_2,sp010d1_3,sp011_1,sp011_2,sp011_3,sp019d2,sp019d3,sp019d4,sp019d5,sp019d6,sp019d7,isced1997_r,nchild,wave,years_educ,ep328_,ep329_,sl_re011_1,sl_re011_2,sl_re011_3,sl_re011_4,sl_re011_5,sl_re011_6,sl_re011_7,sl_re011_8,sl_re011_9,sl_re011_10,sl_re011_11,sl_re011_12,sl_re011_13,sl_re011_14,sl_re011_15,sl_re011_16,sl_re011_17,sl_re011_18,sl_re011_19,sl_re011_20,sl_re016_1,sl_re016_2,sl_re016_3,sl_re016_4,sl_re016_5,sl_re016_6,sl_re016_7,sl_re016_8,sl_re016_9,sl_re016_10,sl_re016_11,sl_re016_12,sl_re016_13,sl_re016_14,sl_re016_15,sl_re016_16,sl_re016_17,sl_re016_18,sl_re016_19,sl_re016_20,sl_re026_1,sl_re026_2,sl_re026_3,sl_re026_4,sl_re026_5,sl_re026_6,sl_re026_7,sl_re026_8,sl_re026_9,sl_re026_10,sl_re026_11,sl_re026_12,sl_re026_13,sl_re026_14,sl_re026_15,sl_re026_16,sl_re026_17,sl_re026_18,sl_re026_19,sl_re026_20,sl_re018_1,sl_re018_2,sl_re018_3,sl_re018_4,sl_re018_5,sl_re018_6,sl_re018_7,sl_re020_1,sl_re020_2,sl_re020_3,sl_re020_4,sl_re020_5,sl_re020_6,sl_re020_7,dn012d15,dn012d16,dn012d17,dn012d18,dn012d19,dn127_1,dn127_2,dn012d20,age,high_educ,dn012_max,further_educ_max,high_educ_012,high_educ_comb,retired,years_since_retirement,married,in_partnership,has_partner,ever_cared,ever_cared_parents_outside,ever_cared_parents_within,ever_cared_parents,care_in_year,care_experience,mother_alive,health_mother,father_alive,health_father_3,dist_father,dist_mother,parents_live_close,worked_last_period,freq_visits_mother,freq_visits_father,mother_alive_2,lagged_age_mother,mother_dead,lagged_mother_alive,mother_dead_since_last,age_mother_first,birth_year_mother,int_year_mother_first,age_mother_full,age_year_mother_new,age_mother_death,death_transition,year_mother_death
9925,DE-000132-01,2011.0,6.0,2,1.0,1953.0,58.0,2,1.0,1953.0,,,,,,1.0,,3.0,,,,,,7.0,,5.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,,,,7.0,0,0,5.0,,1.0,,,,,,,2009.0,,,97.0,False,2017.0
12071,DE-000132-01,2013.0,3.0,2,1.0,1953.0,60.0,2,1.0,1953.0,,,,,,1.0,1.0,3.0,5.0,,,,,7.0,7.0,4.0,4.0,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,1.0,2.0,7.0,7.0,0,1,4.0,4.0,1.0,,,1.0,0.0,,,2009.0,,,97.0,False,2017.0
19745,DE-000132-01,2015.0,4.0,2,1.0,1953.0,62.0,3,1.0,1953.0,,,,,,1.0,5.0,3.0,,,86.0,,,7.0,,4.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013.0,,62.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,0.0,,,7.0,0,1,4.0,,1.0,,,1.0,0.0,,,2009.0,,,97.0,False,2017.0
25534,DE-000554-01,2017.0,4.0,2,10.0,1961.0,55.0,2,10.0,1961.0,,,,,,1.0,5.0,4.0,,,84.0,,,7.0,,6.0,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,3.0,,7,,,,1978.0,1982.0,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,1982.0,9997.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2012.0,,55.0,,0.0,0,0.0,0,0.0,0.0,,,,,,,,0,0,1.0,1.0,0.0,,,7.0,0,0,6.0,,1.0,,,,,,,2017.0,,,,False,
30460,DE-000554-01,2020.0,1.0,2,10.0,1961.0,58.0,2,10.0,1961.0,,,,,,1.0,,3.0,,,,,,8.0,,6.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,3.0,,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,,,,8.0,0,1,6.0,,1.0,,,1.0,0.0,,,2017.0,,,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34643,DE-994888-01,2019.0,11.0,2,5.0,1951.0,68.0,2,5.0,1951.0,,,,,,1.0,,3.0,,,,,,2.0,,1.0,,,,,,,,,,,,,,,,,,1.0,5.0,2016.0,1.0,5.0,2.0,29.0,,1.0,0.0,,1.0,4.0,,,,,,,,3.0,,8,,5.0,2016.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,68.0,,0.0,0,0.0,0,1.0,3.0,,,,1.0,1.0,0.0,1.0,1,2,1.0,1.0,,,,2.0,1,0,1.0,,1.0,,,1.0,0.0,,,2011.0,,,,False,
12061,DE-994952-01,2011.0,6.0,2,5.0,1953.0,58.0,2,5.0,1953.0,,,,,,1.0,,4.0,,,,,,2.0,,1.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,1.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,1.0,0.0,0.0,0.0,0,0,1.0,1.0,,,,2.0,1,0,1.0,,1.0,,,,,,,2009.0,,,94.0,False,2015.0
19717,DE-994952-01,2013.0,2.0,2,5.0,1953.0,59.0,2,5.0,1953.0,,,,,,1.0,5.0,4.0,,,77.0,,,2.0,,1.0,,,,,,,,,,,,,,,,,,2.0,,,1.0,5.0,2.0,,,,,,1.0,,,,,,,,,3.0,1.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59.0,,0.0,0,0.0,0,0.0,0.0,,,,1.0,1.0,0.0,1.0,1,1,1.0,1.0,0.0,,,2.0,1,1,1.0,,1.0,,,1.0,0.0,,,2009.0,,,94.0,False,2015.0
7416,DE-998320-03,2007.0,4.0,2,1.0,1948.0,59.0,2,1.0,1948.0,,,,,,1.0,5.0,5.0,,,38.0,,,4.0,,2.0,,,,,,,,,,,,,,,,,,2.0,,,1.0,5.0,33.0,,,0.0,,,1.0,,,,,,,,,,,2,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59.0,0.0,0.0,0,0.0,0,0.0,0.0,,,,1.0,0.0,0.0,0.0,0,0,1.0,2.0,0.0,,,4.0,1,0,2.0,,1.0,,,,,,,2007.0,,,,False,


In [977]:
table(dat["age_mother"])

age_mother
67.0     1
71.0     1
72.0     1
73.0     2
74.0     2
75.0    12
76.0     9
77.0    18
78.0    24
79.0    28
80.0    30
81.0    43
82.0    41
83.0    39
84.0    46
85.0    40
86.0    43
87.0    39
88.0    27
89.0    24
90.0    25
91.0    14
92.0    11
93.0    14
94.0     6
95.0     5
96.0     2
97.0     1
98.0     3
Name: Count, dtype: int64

In [978]:
dat["freq_visits_mother"] = dat["dn032_1"]
dat["freq_visits_father"] = dat["dn032_2"]

In [979]:
table(dat["freq_visits_father"]).sum()

604

In [980]:
# Create 'mother_alive_2' based on 'mother_alive'
dat['mother_alive_2'] = np.where(dat['mother_alive'] == 1, 1, np.nan)

In [981]:
## 

In [982]:
# Sort the DataFrame by 'mergeid' and 'int_year'
dat = dat.sort_values(by=['mergeid', 'int_year'])


# Create 'lagged_age_mother' using 'shift' to represent the previous period's values
dat['lagged_age_mother'] = dat.groupby('mergeid')['age_mother'].shift(1)

# Create 'mother_dead' based on the specified conditions
dat['mother_dead'] = np.where(dat['age_mother'].isna() & (dat['lagged_age_mother'] > 0), 1, np.nan)

In [983]:
table(dat["mother_dead"])

mother_dead
1.0    377
Name: Count, dtype: int64

In [984]:
dat['lagged_mother_alive'] = dat.groupby('mergeid')['mother_alive'].shift(1)

# Create 'mother_dead' based on conditions using np.select
conditions = [
    (dat['lagged_mother_alive'] == 0),
    (dat['lagged_mother_alive'] == 1)
]

choices = [1, 0]  # 1 for True, 0 for False

dat['mother_dead_since_last'] = np.select(conditions, choices, np.nan)

In [985]:
table(dat['lagged_mother_alive'])

lagged_mother_alive
0.0    1588
1.0    1172
Name: Count, dtype: int64

In [986]:
table(dat['mother_dead_since_last'])

mother_dead_since_last
0.0    1172
1.0    1588
Name: Count, dtype: int64

In [987]:
dat['mother_dead_since_last'].describe()

count    2760.000000
mean        0.575362
std         0.494377
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: mother_dead_since_last, dtype: float64

## More age info?

In [1167]:
# Create 'mother_alive' based on 'dn026_1' using np.select
dat['mother_alive'] = np.select(conditions_dn026, choices_dn026, default=np.nan)

In [1168]:
# Sort the data by 'mergeid' and 'int_year'
dat = dat.sort_values(by=['mergeid', 'int_year'])

# Group the data by 'mergeid' and transform to get the first non-NaN value of 'age_mother'
dat['age_mother_first'] = dat.groupby('mergeid')['age_mother'].transform('first')
dat['int_year_mother_first'] = dat.groupby('mergeid')['int_year'].transform('first')

In [1169]:
# Calculate the first non-NaN value in 'age_mother_first' within each group
first_age_mother = dat.groupby("mergeid")["age_mother_first"].transform("first")

# Create 'birth_year_mother' based on the calculation
dat["age_year_mother_new"] = (
    dat["int_year"] - dat["int_year_mother_first"] + first_age_mother
)
# dat['age_mother_full'] = dat['int_year'] + (first_age_mother)

# Sort the DataFrame by 'mergeid' and 'int_year'
dat = dat.sort_values(by=["mergeid", "int_year"])


def first_non_empty(series):
    # Helper function to get the first non-empty value
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan


# Create 'int_year_mother_first' to record the 'int_year' when 'age_mother_first' is first non-empty
dat["int_year_mother_first"] = dat.groupby("mergeid")["int_year"].transform(
    lambda x: first_non_empty(dat["int_year"].where(~dat["age_mother_first"].isna()))
)

In [1170]:
# Group the data by 'mergeid'
grouped = dat.groupby('mergeid')

# Determine the most common non-empty value in 'dn027_1' for each 'mergeid'
most_common_value = grouped['dn027_1'].apply(lambda x: x.dropna().mode().iloc[0] if not x.dropna().empty else np.nan)

# Assign the most common value to all rows within the 'mergeid' group
dat['age_mother_death'] = dat['mergeid'].map(most_common_value)

# Fill any remaining NaN values with np.nan
dat['age_mother_death'].fillna(np.nan, inplace=True)

In [1171]:
dat = dat.sort_values(by=["mergeid", "int_year"])

# Initialize an auxiliary variable 'death_transition' to track the transition from 1 to 0
dat["death_transition"] = (dat["mother_alive"] == 0) & (
    dat.groupby("mergeid")["mother_alive"].shift(1) == 1
)

# Calculate 'year_mother_death' based on the first transition from 1 to 0 within each 'mergeid'
dat["year_mother_death"] = dat.groupby("mergeid")["int_year"].transform(
    lambda x: x.where(dat["death_transition"]).min()
)

# Fill remaining NaN values in 'year_mother_death' with np.nan
# dat["year_mother_death"].fillna(np.nan, inplace=True)


# Identify the first observation in the panel for each 'mergeid'
#first_observation_mask = (
#    dat.groupby("mergeid")["int_year"].transform("first") == dat["int_year"]
#)
#
## Further filter for rows where 'mother_alive' is 0
#first_observation_mother_alive_zero_mask = first_observation_mask & (
#    dat["mother_alive"] == 0
#)
#
## Replace values in 'year_mother_death' with 'int_year - 1' for the first observations
#dat.loc[first_observation_mother_alive_zero_mask, "year_mother_death"] = (
#    dat["int_year"] - 1
#)

In [1172]:
# Assuming 'dat' is a pandas DataFrame

# Sort the DataFrame by 'mergeid' and 'int_year'
dat = dat.sort_values(by=["mergeid", "int_year"])

# Identify the first observation in the panel for each 'mergeid'
first_observation_mask = (
    dat.groupby("mergeid")["int_year"].transform("first") == dat["int_year"]
)

# Identify the next observation in the panel for each 'mergeid'
next_observation_mask = (
    dat.groupby("mergeid")["int_year"].transform("first") == dat["int_year"] + 1
)

# Filter for rows where 'mother_alive' is NaN and the conditions are met
# nan_mother_alive_mask = dat["mother_alive"].isna() & first_observation_mask
nan_mother_alive_mask = (
    dat["mother_alive"].isna() & first_observation_mask & next_observation_mask
)

# Replace 'mother_alive' with 1 for the specified rows
dat.loc[nan_mother_alive_mask & (dat["mother_alive"].shift(1) == 1), "mother_alive"] = 1

# Replace 'mother_alive' with 0 for the specified rows
dat.loc[
    nan_mother_alive_mask & (dat["mother_alive"].shift(1) == 0),
    "mother_alive",
] = 0

In [1173]:
first_occurrence_condition = (
    (
        (dat["age_year_mother_new"].notna() & dat["age_mother_death"].notna())
        & (dat["age_year_mother_new"] > dat["age_mother_death"])
    )
    .groupby(dat["mergeid"])
    .idxmax()
)
# Identify the next occurrence of "mother_alive == 1" per "mergeid"
next_occurrence_condition = (dat["mother_alive"] != 1).groupby(dat["mergeid"]).shift(-1)
# Replace 'mother_alive' with 0 for rows where it is NaN and the conditions are met
dat.loc[
    (dat["mother_alive"].isna())
    & (dat.index.isin(first_occurrence_condition))
    & (next_occurrence_condition),
    "mother_alive",
] = 0

In [1174]:
# # Sort the DataFrame by 'mergeid' and 'int_year'
# dat = dat.sort_values(by=["mergeid", "int_year"])
#
#
# # Create a mask for rows where 'mother_alive' is NaN and the preceding row's 'mother_alive' is 0
# nan_mother_alive_mask = dat["mother_alive"].isna() & (dat["mother_alive"].shift(1) == 0)
#
# # Set 'mother_alive' to 0 for the identified rows
# dat.loc[nan_mother_alive_mask, "mother_alive"] = 0


# Sort the DataFrame by 'mergeid' and 'int_year'
dat = dat.sort_values(by=["mergeid", "int_year"])

# Define a custom function to handle grouping within 'mergeid'
def custom_condition(group):
    return group["mother_alive"].isna() & (group["mother_alive"].shift(1) == 0)


# Apply the custom function within each 'mergeid'
nan_mother_alive_mask = dat.groupby("mergeid").apply(custom_condition)

# Flatten the result to a boolean array
nan_mother_alive_mask = nan_mother_alive_mask.values

# Set 'mother_alive' to 0 for the identified rows
dat.loc[nan_mother_alive_mask, "mother_alive"] = 0

In [1175]:
# Sort the DataFrame by 'mergeid' and 'int_year'
dat = dat.sort_values(by=["mergeid", "int_year"])

# Identify the next occurrence of "mother_alive == 1" per "mergeid"
next_occurrence_condition = (dat.groupby("mergeid")["mother_alive"].shift(-1) == 1)

# Replace 'mother_alive' with 1 for rows where it is NaN and the next occurrence condition is met
dat.loc[(dat["mother_alive"].isna()) & next_occurrence_condition, "mother_alive"] = 1

In [1176]:
table(dat["mother_alive"]), dat["mother_alive"].describe()

(mother_alive
 0.0    4478
 1.0    1908
 Name: Count, dtype: int64,
 count    6386.000000
 mean        0.298779
 std         0.457758
 min         0.000000
 25%         0.000000
 50%         0.000000
 75%         1.000000
 max         1.000000
 Name: mother_alive, dtype: float64)

In [1177]:
dat["birth_year_mother"] = (
    dat.groupby("mergeid")
    .apply(lambda group: group["year_mother_death"] - group["age_mother_death"])
    .reset_index(drop=True)
)

In [1178]:
dat.loc[
    dat.index[0:30],
    [
        "mergeid",
        "int_year",
        "birth_year_mother",
        "age_mother_first",
        "int_year_mother_first",
        #"age_mother",
        "age_year_mother_new",
        "age_mother_death",
        #"year_mother_death",
        "mother_alive",
        #"death_transition",
        # "care_in_year",
        #"dn027_1",
    ],
]

Unnamed: 0,mergeid,int_year,birth_year_mother,age_mother_first,int_year_mother_first,age_year_mother_new,age_mother_death,mother_alive
7424,DE-000132-01,2009.0,,,2009.0,,97.0,1.0
9925,DE-000132-01,2011.0,,,2009.0,,97.0,1.0
12071,DE-000132-01,2013.0,,,2009.0,,97.0,1.0
19745,DE-000132-01,2015.0,,,2009.0,,97.0,1.0
25532,DE-000132-01,2017.0,,,2009.0,,97.0,0.0
25534,DE-000554-01,2017.0,,,2017.0,,,1.0
30460,DE-000554-01,2020.0,,,2017.0,,,1.0
12075,DE-000802-01,2013.0,,,2013.0,,87.0,0.0
19750,DE-000802-01,2015.0,,,2013.0,,87.0,0.0
25536,DE-000802-01,2017.0,,,2013.0,,87.0,0.0


In [870]:
table(dat['birth_year_mother']).sum(), dat['birth_year_mother'].isna().sum()

(1506, 5316)

In [None]:
dat['age_mother_first'].isna().sum()

In [856]:
table(dat['age_mother_first'])

age_mother_first
67.0      4
71.0      4
72.0      2
73.0     11
74.0      5
75.0     35
76.0     31
77.0     52
78.0     70
79.0     78
80.0     85
81.0    127
82.0    124
83.0    108
84.0    102
85.0     88
86.0    135
87.0     97
88.0     74
89.0     59
90.0     67
91.0     33
92.0     30
93.0     46
94.0     15
95.0      9
96.0      5
97.0      3
98.0      7
Name: Count, dtype: int64

In [882]:
dat.loc[
    dat.index[10:50],
    ["mergeid", "int_year", "birth_year_mother", "age_mother_first", "age_mother", "mother_alive", "dn027_1"],
]

Unnamed: 0,mergeid,int_year,birth_year_mother,age_mother_first,age_mother,mother_alive,dn027_1
19752,DE-001237-02,2015.0,,,,,
25538,DE-001237-02,2017.0,,,,0.0,81.0
30463,DE-001237-02,2019.0,,,,,
19753,DE-001350-01,2015.0,,,,,
7426,DE-001381-01,2009.0,,,,,
9,DE-002106-02,2004.0,,,,0.0,93.0
3929,DE-002106-02,2006.0,,,,,
11,DE-002173-02,2004.0,1915.0,89.0,89.0,1.0,
3931,DE-002173-02,2007.0,1918.0,89.0,,1.0,
7434,DE-002173-02,2008.0,1919.0,89.0,,,


5316

In [860]:
table(dat["mother_alive"])

mother_alive
0.0    2732
1.0    1810
Name: Count, dtype: int64

In [861]:
# does not change anything :)
dat.loc[dat["dn027_1"] >=0, "mother_alive"] = 0

In [862]:
table(dat["mother_alive"])

mother_alive
0.0    2732
1.0    1810
Name: Count, dtype: int64

## Missing age info!!

In [863]:
dat.loc[(dat["mother_alive"] == 1) & (dat["age_mother"].isna())]

Unnamed: 0,mergeid,int_year,int_month,gender,mobirth,yrbirth,age_int,hhsize,dn002_,dn003_,dn010_,dn009_,dn014_,dn015_,dn016_,dn026_1,dn026_2,dn033_1,dn033_2,dn027_1,dn027_2,age_mother,age_father,dn030_1,dn030_2,dn032_1,dn032_2,dn012d1,dn012d2,dn012d3,dn012d4,dn012d5,dn012d6,dn012d7,dn012d8,dn012d9,dn012d10,dn012d11,dn012d12,dn012d13,dn012d14,dn012dno,dn012dot,ep005_,ep002_,ep213_1,sp008_,sp018_,sp009_1,sp009_2,sp009_3,sp010d1_1,sp010d1_2,sp010d1_3,sp011_1,sp011_2,sp011_3,sp019d2,sp019d3,sp019d4,sp019d5,sp019d6,sp019d7,isced1997_r,nchild,wave,years_educ,ep328_,ep329_,sl_re011_1,sl_re011_2,sl_re011_3,sl_re011_4,sl_re011_5,sl_re011_6,sl_re011_7,sl_re011_8,sl_re011_9,sl_re011_10,sl_re011_11,sl_re011_12,sl_re011_13,sl_re011_14,sl_re011_15,sl_re011_16,sl_re011_17,sl_re011_18,sl_re011_19,sl_re011_20,sl_re016_1,sl_re016_2,sl_re016_3,sl_re016_4,sl_re016_5,sl_re016_6,sl_re016_7,sl_re016_8,sl_re016_9,sl_re016_10,sl_re016_11,sl_re016_12,sl_re016_13,sl_re016_14,sl_re016_15,sl_re016_16,sl_re016_17,sl_re016_18,sl_re016_19,sl_re016_20,sl_re026_1,sl_re026_2,sl_re026_3,sl_re026_4,sl_re026_5,sl_re026_6,sl_re026_7,sl_re026_8,sl_re026_9,sl_re026_10,sl_re026_11,sl_re026_12,sl_re026_13,sl_re026_14,sl_re026_15,sl_re026_16,sl_re026_17,sl_re026_18,sl_re026_19,sl_re026_20,sl_re018_1,sl_re018_2,sl_re018_3,sl_re018_4,sl_re018_5,sl_re018_6,sl_re018_7,sl_re020_1,sl_re020_2,sl_re020_3,sl_re020_4,sl_re020_5,sl_re020_6,sl_re020_7,dn012d15,dn012d16,dn012d17,dn012d18,dn012d19,dn127_1,dn127_2,dn012d20,age,high_educ,dn012_max,further_educ_max,high_educ_012,high_educ_comb,retired,years_since_retirement,married,in_partnership,has_partner,ever_cared,ever_cared_parents_outside,ever_cared_parents_within,ever_cared_parents,care_in_year,care_experience,mother_alive,health_mother,father_alive,health_father_3,dist_father,dist_mother,parents_live_close,worked_last_period,freq_visits_mother,freq_visits_father,mother_alive_2,lagged_age_mother,mother_dead,lagged_mother_alive,mother_dead_since_last,age_mother_first
9925,DE-000132-01,2011.0,6.0,2,1.0,1953.0,58.0,2,1.0,1953.0,,,,,,1.0,,3.0,,,,,,7.0,,5.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,,,,7.0,0,0,5.0,,1.0,,,,,
12071,DE-000132-01,2013.0,3.0,2,1.0,1953.0,60.0,2,1.0,1953.0,,,,,,1.0,1.0,3.0,5.0,,,,,7.0,7.0,4.0,4.0,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,1.0,2.0,7.0,7.0,0,1,4.0,4.0,1.0,,,1.0,0.0,
19745,DE-000132-01,2015.0,4.0,2,1.0,1953.0,62.0,3,1.0,1953.0,,,,,,1.0,5.0,3.0,,,86.0,,,7.0,,4.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,5.0,2.0,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013.0,,62.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,0.0,,,7.0,0,1,4.0,,1.0,,,1.0,0.0,
25534,DE-000554-01,2017.0,4.0,2,10.0,1961.0,55.0,2,10.0,1961.0,,,,,,1.0,5.0,4.0,,,84.0,,,7.0,,6.0,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,3.0,,7,,,,1978.0,1982.0,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,1982.0,9997.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2012.0,,55.0,,0.0,0,0.0,0,0.0,0.0,,,,,,,,0,0,1.0,1.0,0.0,,,7.0,0,0,6.0,,1.0,,,,,
30460,DE-000554-01,2020.0,1.0,2,10.0,1961.0,58.0,2,10.0,1961.0,,,,,,1.0,,3.0,,,,,,8.0,,6.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,5.0,,,,,,,,,,,,,,,,3.0,,8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0,0,1.0,1.0,,,,8.0,0,1,6.0,,1.0,,,1.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34643,DE-994888-01,2019.0,11.0,2,5.0,1951.0,68.0,2,5.0,1951.0,,,,,,1.0,,3.0,,,,,,2.0,,1.0,,,,,,,,,,,,,,,,,,1.0,5.0,2016.0,1.0,5.0,2.0,29.0,,1.0,0.0,,1.0,4.0,,,,,,,,3.0,,8,,5.0,2016.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,68.0,,0.0,0,0.0,0,1.0,3.0,,,,1.0,1.0,0.0,1.0,1,2,1.0,1.0,,,,2.0,1,0,1.0,,1.0,,,1.0,0.0,
12061,DE-994952-01,2011.0,6.0,2,5.0,1953.0,58.0,2,5.0,1953.0,,,,,,1.0,,4.0,,,,,,2.0,,1.0,,,,,,,,,,,,,,,,,,2.0,,,5.0,1.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,58.0,,0.0,0,0.0,0,0.0,0.0,,,,1.0,0.0,0.0,0.0,0,0,1.0,1.0,,,,2.0,1,0,1.0,,1.0,,,,,
19717,DE-994952-01,2013.0,2.0,2,5.0,1953.0,59.0,2,5.0,1953.0,,,,,,1.0,5.0,4.0,,,77.0,,,2.0,,1.0,,,,,,,,,,,,,,,,,,2.0,,,1.0,5.0,2.0,,,,,,1.0,,,,,,,,,3.0,1.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59.0,,0.0,0,0.0,0,0.0,0.0,,,,1.0,1.0,0.0,1.0,1,1,1.0,1.0,0.0,,,2.0,1,1,1.0,,1.0,,,1.0,0.0,
7416,DE-998320-03,2007.0,4.0,2,1.0,1948.0,59.0,2,1.0,1948.0,,,,,,1.0,5.0,5.0,,,38.0,,,4.0,,2.0,,,,,,,,,,,,,,,,,,2.0,,,1.0,5.0,33.0,,,0.0,,,1.0,,,,,,,,,,,2,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59.0,0.0,0.0,0,0.0,0,0.0,0.0,,,,1.0,0.0,0.0,0.0,0,0,1.0,2.0,0.0,,,4.0,1,0,2.0,,1.0,,,,,
