In [1]:
import pandas as pd

In [2]:
# Read validation_set.csv (decoded with the utf-8-sig codec) and show its shape and contents

data_df = pd.read_csv('validation_set.csv', encoding='utf-8-sig')
print(data_df.shape)
data_df

(1652, 638)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_323,e_326,e_329,e_330,e_332,e_334,e_343,person_id,birth_year,death_year
0,,,,,,,,,,,...,,,,,,,,62632,1732.0,1784.0
1,,,,,,,,,,,...,,,,,,,,34345,1824.0,1890.0
2,,,,,,,,,,,...,,,,,,,,206202,1542.0,
3,,,,,,,,,,,...,,,,,,,,142346,635.0,698.0
4,,,,,,,,,,,...,,,,,,,,131203,1528.0,1591.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,,,,,,,,,,,...,,,,,,,,43633,1039.0,1086.0
1648,,,,,,,,,,,...,,,,,,,,38705,951.0,1019.0
1649,,,,,,,,,,,...,,,,,,,,120815,1622.0,
1650,,,,,,,,,,,...,,,,,,,,59148,1814.0,


In [3]:
# Split data_df column-wise into two frames:
#   data_df_id_by_only - just person_id and birth_year
#   data_df_no_id_by   - every other column

data_df_id_by_only = data_df[['person_id', 'birth_year']]
data_df_no_id_by = data_df.drop(columns=['person_id', 'birth_year'])
print(data_df_no_id_by.shape)
data_df_no_id_by.head()

(1652, 636)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_319,e_320,e_323,e_326,e_329,e_330,e_332,e_334,e_343,death_year
0,,,,,,,,,,,...,,,,,,,,,,1784.0
1,,,,,,,,,,,...,,,,,,,,,,1890.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,698.0
4,,,,,,,,,,,...,,,,,,,,,,1591.0


In [4]:
# Sanity check: shape and first rows of the person_id / birth_year frame
print(data_df_id_by_only.shape)
data_df_id_by_only.head()

(1652, 2)


Unnamed: 0,person_id,birth_year
0,62632,1732.0
1,34345,1824.0
2,206202,1542.0
3,142346,635.0
4,131203,1528.0


In [5]:
# Read the median list from median_list.txt (parsed as CSV, decoded with the utf-8-sig codec);
# the `column` and `median` columns are used by the following cells

median_df = pd.read_csv('median_list.txt', encoding='utf-8-sig')
print(median_df.shape)
median_df.head()

(636, 2)


Unnamed: 0,column,median
0,k_2_by,-279.0
1,k_4_by,-102.0
2,k_6_by,-49.0
3,k_8_by,-272.0
4,k_22_by,-398.5


In [6]:
# Create inferred_birth_year by using the median values from median_list and any available year values in the data
# It will take 1+ minutes to run this cell

# Build the column-label -> median lookup WITHOUT mutating median_df.
# The previous `median_df.set_index('column', inplace=True)` made this cell fail with a
# KeyError when re-run, because 'column' had already been moved into the index.
medians = median_df.set_index('column')['median']

def infer_birth_year(row, medians):
    """Estimate a birth year from whichever median-listed columns are populated.

    For every label in ``medians.index`` that has a value in ``row``, one
    estimate is computed as ``row[label] - medians[label]``. The result is the
    arithmetic mean of those estimates.

    Parameters
    ----------
    row : pandas.Series or mapping
        One record keyed by column label. Labels that are absent from ``row``
        are treated the same as missing values.
    medians : pandas.Series
        Median per column label (index = label); this is the value subtracted
        from the observed entry.

    Returns
    -------
    float or None
        Mean of the usable estimates, or ``None`` when there is none.
    """
    # Accept plain mappings as well as Series (the loop-based version worked with both).
    if not isinstance(row, pd.Series):
        row = pd.Series(row)

    # Align the record to the median labels; labels missing from the record become NaN.
    observed = row.reindex(medians.index)

    # dropna() discards estimates where either side is missing. In particular a NaN
    # median no longer turns the whole average into NaN.
    estimates = (observed - medians).dropna()
    if estimates.empty:
        return None
    return estimates.sum() / len(estimates)

# Row-wise pass: store each record's estimate in a new inferred_birth_year column
data_df_no_id_by["inferred_birth_year"] = data_df_no_id_by.apply(
    lambda record: infer_birth_year(record, medians),
    axis="columns",
)

data_df_no_id_by.head()

Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_320,e_323,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year
0,,,,,,,,,,,...,,,,,,,,,1784.0,1726.5
1,,,,,,,,,,,...,,,,,,,,,1890.0,1823.25
2,,,,,,,,,,,...,,,,,,,,,,1543.0
3,,,,,,,,,,,...,,,,,,,,,698.0,637.0
4,,,,,,,,,,,...,,,,,,,,,1591.0,1533.5


In [7]:
# Calculate all the missing values in the data using the inferred_birth_year and the median values from median_list
# It will take 5:24 minutes to run this cell

def fill_na_values(row, medians):
    """Fill missing median-listed entries of a record from its inferred birth year.

    Every label in ``medians.index`` that is present in ``row`` but holds a
    missing value is set to ``row["inferred_birth_year"] + medians[label]``.
    Existing values are never overwritten.

    Parameters
    ----------
    row : pandas.Series
        One record keyed by column label; must contain ``inferred_birth_year``
        whenever there is something to fill.
    medians : pandas.Series
        Median per column label (index = label).

    Returns
    -------
    pandas.Series
        A filled copy of ``row``. ``row`` itself is returned unchanged when
        there is nothing to fill or the inferred birth year is missing.

    Raises
    ------
    KeyError
        If a value needs filling but ``row`` has no ``inferred_birth_year``.
    """
    # Only consider labels the record actually has; absent labels are skipped
    # (same tolerance as infer_birth_year) instead of raising KeyError.
    labels = medians.index.intersection(row.index)
    gaps = labels[row.loc[labels].isna().to_numpy()]
    if len(gaps) == 0:
        return row

    inferred = row["inferred_birth_year"]
    if pd.isna(inferred):
        return row

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    filled = row.copy()
    # .to_numpy() makes the assignment positional, in the order of `gaps`.
    filled.loc[gaps] = (inferred + medians.loc[gaps]).to_numpy()
    return filled


# Row-wise pass: replace the frame with its gap-filled version
data_df_no_id_by = data_df_no_id_by.apply(
    lambda record: fill_na_values(record, medians),
    axis="columns",
)
data_df_no_id_by.head()

Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_320,e_323,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year
0,1447.5,1624.5,1677.5,1454.5,1328.0,1448.5,1368.0,1443.0,1443.0,1441.0,...,1732.5,1748.0,1740.0,1762.0,1777.5,1741.5,1756.5,1763.5,1784.0,1726.5
1,1544.25,1721.25,1774.25,1551.25,1424.75,1545.25,1464.75,1539.75,1539.75,1537.75,...,1829.25,1844.75,1836.75,1858.75,1874.25,1838.25,1853.25,1860.25,1890.0,1823.25
2,1264.0,1441.0,1494.0,1271.0,1144.5,1265.0,1184.5,1259.5,1259.5,1257.5,...,1549.0,1564.5,1556.5,1578.5,1594.0,1558.0,1573.0,1580.0,1604.0,1543.0
3,358.0,535.0,588.0,365.0,238.5,359.0,278.5,353.5,353.5,351.5,...,643.0,658.5,650.5,672.5,688.0,652.0,667.0,674.0,698.0,637.0
4,1254.5,1431.5,1484.5,1261.5,1135.0,1255.5,1175.0,1250.0,1250.0,1248.0,...,1539.5,1555.0,1547.0,1569.0,1584.5,1548.5,1563.5,1570.5,1591.0,1533.5


In [8]:
# Re-attach person_id and birth_year: column-wise concatenation, with the
# data_df_id_by_only columns placed after those of data_df_no_id_by

data_df_interpolated = pd.concat(
    objs=[data_df_no_id_by, data_df_id_by_only],
    axis='columns',
)
data_df_interpolated.head()

Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year,person_id,birth_year
0,1447.5,1624.5,1677.5,1454.5,1328.0,1448.5,1368.0,1443.0,1443.0,1441.0,...,1740.0,1762.0,1777.5,1741.5,1756.5,1763.5,1784.0,1726.5,62632,1732.0
1,1544.25,1721.25,1774.25,1551.25,1424.75,1545.25,1464.75,1539.75,1539.75,1537.75,...,1836.75,1858.75,1874.25,1838.25,1853.25,1860.25,1890.0,1823.25,34345,1824.0
2,1264.0,1441.0,1494.0,1271.0,1144.5,1265.0,1184.5,1259.5,1259.5,1257.5,...,1556.5,1578.5,1594.0,1558.0,1573.0,1580.0,1604.0,1543.0,206202,1542.0
3,358.0,535.0,588.0,365.0,238.5,359.0,278.5,353.5,353.5,351.5,...,650.5,672.5,688.0,652.0,667.0,674.0,698.0,637.0,142346,635.0
4,1254.5,1431.5,1484.5,1261.5,1135.0,1255.5,1175.0,1250.0,1250.0,1248.0,...,1547.0,1569.0,1584.5,1548.5,1563.5,1570.5,1591.0,1533.5,131203,1528.0


In [9]:
# Write the interpolated frame to validation_set_interpolated.csv (header row included, index column omitted)
data_df_interpolated.to_csv(
    'validation_set_interpolated.csv',
    encoding='utf-8-sig',
    index=False,
)