In [1]:
import pandas as pd

In [2]:
# Load the raw dataset from data.csv.
# The 'utf-8-sig' codec decodes UTF-8 and drops a leading byte-order mark if one is present.
data_df = pd.read_csv(
    filepath_or_buffer='data.csv',
    encoding='utf-8-sig',
)
print(data_df.shape)  # (number of rows, number of columns)
data_df

(31377, 638)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_323,e_326,e_329,e_330,e_332,e_334,e_343,person_id,birth_year,death_year
0,,,,,,,,,,,...,,,,,,,,85175,1752.0,1824.0
1,,,,,,,,,,,...,,,,,,,,124329,1656.0,
2,,,,,,,,,,,...,,,,,,,,199193,1436.0,
3,,,,,,,,,,,...,,,,,,,,199009,1436.0,
4,,,,,,,,,,,...,,,,,,,,83307,1696.0,1761.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31372,,,,,,,,,,,...,,,,,,,,133676,1537.0,1604.0
31373,,,,,,,,,,,...,,,,,,,,46698,894.0,972.0
31374,,,,,,,,,,,...,,,,,,,,68324,1527.0,1574.0
31375,,,,,,,,,,,...,,,,,,,,5064,1018.0,1068.0


In [3]:
# Split data_df column-wise into two frames:
#   * data_df_id_by_only -> just 'person_id' and 'birth_year'
#   * data_df_no_id_by   -> every other column
data_df_id_by_only = data_df.loc[:, ['person_id', 'birth_year']]
data_df_no_id_by = data_df.drop(columns=['person_id', 'birth_year'])
print(data_df_no_id_by.shape)
data_df_no_id_by.head(n=5)

(31377, 636)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_319,e_320,e_323,e_326,e_329,e_330,e_332,e_334,e_343,death_year
0,,,,,,,,,,,...,,,,,,,,,,1824.0
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,1761.0


In [4]:
# Quick look at the identifier frame: its dimensions, then its first five rows.
print(data_df_id_by_only.shape)
data_df_id_by_only.head(n=5)

(31377, 2)


Unnamed: 0,person_id,birth_year
0,85175,1752.0
1,124329,1656.0
2,199193,1436.0
3,199009,1436.0
4,83307,1696.0


In [5]:
# Load the median table from median_list.txt.
# It has one row per data column, with the column's name in 'column' and its median in 'median'.
median_df = pd.read_csv(
    filepath_or_buffer='median_list.txt',
    encoding='utf-8-sig',
)
print(median_df.shape)
median_df.head(n=5)

(636, 2)


Unnamed: 0,column,median
0,k_2_by,-279.0
1,k_4_by,-102.0
2,k_6_by,-49.0
3,k_8_by,-272.0
4,k_22_by,-398.5


In [6]:
# Create inferred_birth_year by using the median values from median_list and any available year values in the data
# It will take 1+ minutes to run this cell

# Index the median table by column name. The guard keeps this cell re-runnable:
# once 'column' has been moved into the index, a second set_index('column')
# would raise a KeyError.
if 'column' in median_df.columns:
    median_df = median_df.set_index('column')

# Series mapping each data column name to its median value.
medians = median_df['median']

def infer_birth_year(row, medians):
    """Estimate a birth year for one row from its non-missing values.

    For every label in ``medians`` whose value in ``row`` is present, the
    label's median is subtracted from the row's value to get one candidate
    birth year. The result is the arithmetic mean of all candidates.

    Parameters
    ----------
    row : pandas.Series
        One record, indexed by column name. Labels that are absent from the
        row are treated like missing values.
    medians : pandas.Series
        Median offset per column name.

    Returns
    -------
    float or None
        The mean of the candidate birth years, or ``None`` when the row has
        no usable value.
    """
    candidates = []
    # items() yields (label, offset) pairs directly, avoiding a label lookup
    # into ``medians`` on every iteration.
    for param, offset in medians.items():
        # A missing median would turn the candidate into NaN and poison the
        # whole average, so such columns are skipped.
        if pd.isna(offset):
            continue
        observed = row.get(param)
        if pd.notna(observed):
            candidates.append(observed - offset)
    if not candidates:
        return None
    return sum(candidates) / len(candidates)

# Row-wise pass: hand each row to infer_birth_year together with the medians
# and store the returned estimate in a new 'inferred_birth_year' column.
data_df_no_id_by["inferred_birth_year"] = data_df_no_id_by.apply(
    lambda person_row: infer_birth_year(person_row, medians),
    axis=1,
)

data_df_no_id_by.head(n=5)

Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_320,e_323,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year
0,,,,,,,,,,,...,,,,,,,,,1824.0,1763.0
1,,,,,,,,,,,...,,,,,,,,,,1660.0
2,,,,,,,,,,,...,,,,,,,,,,1435.0
3,,,,,,,,,,,...,,,,,,,,,,1433.0
4,,,,,,,,,,,...,,,,,,,,,1761.0,1695.0


In [7]:
# Calculate all the missing values in the data using the inferred_birth_year and the median values from median_list
# It will take 5:24 minutes to run this cell

def fill_na_values(row, medians):
    """Fill a row's missing values from its inferred birth year.

    Every label in ``medians`` whose value in ``row`` is missing is set to
    ``row["inferred_birth_year"] + medians[label]``. Values that are already
    present are left untouched.

    Parameters
    ----------
    row : pandas.Series
        One record, indexed by column name. It must contain
        ``"inferred_birth_year"`` and every label of ``medians``.
    medians : pandas.Series
        Median offset per column name.

    Returns
    -------
    pandas.Series
        A filled copy of ``row``. When the inferred birth year is missing,
        nothing can be filled and ``row`` itself is returned unchanged.

    Raises
    ------
    KeyError
        If ``row`` lacks ``"inferred_birth_year"`` or, when filling, a label
        listed in ``medians``.
    """
    base_year = row["inferred_birth_year"]
    # The base year is loop-invariant, so it is checked once up front.
    if pd.isna(base_year):
        return row

    # Work on a copy: pandas does not support functions that mutate the
    # object DataFrame.apply passes in.
    filled = row.copy()
    for param, offset in medians.items():
        if pd.isna(filled[param]):
            filled[param] = base_year + offset
    return filled


# Row-wise pass: run fill_na_values on every row with the medians and rebind
# data_df_no_id_by to the frame assembled from the returned rows.
data_df_no_id_by = data_df_no_id_by.apply(
    lambda person_row: fill_na_values(person_row, medians),
    axis=1,
)
data_df_no_id_by.head(n=5)

Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_320,e_323,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year
0,1484.0,1661.0,1714.0,1491.0,1364.5,1485.0,1404.5,1479.5,1479.5,1477.5,...,1769.0,1784.5,1776.5,1798.5,1814.0,1778.0,1793.0,1800.0,1824.0,1763.0
1,1381.0,1558.0,1611.0,1388.0,1261.5,1382.0,1301.5,1376.5,1376.5,1374.5,...,1666.0,1681.5,1673.5,1695.5,1711.0,1675.0,1690.0,1697.0,1721.0,1660.0
2,1156.0,1333.0,1386.0,1163.0,1036.5,1157.0,1076.5,1151.5,1151.5,1149.5,...,1441.0,1456.5,1448.5,1470.5,1486.0,1450.0,1465.0,1472.0,1496.0,1435.0
3,1154.0,1331.0,1384.0,1161.0,1034.5,1155.0,1074.5,1149.5,1149.5,1147.5,...,1439.0,1454.5,1446.5,1468.5,1484.0,1448.0,1463.0,1470.0,1494.0,1433.0
4,1416.0,1593.0,1646.0,1423.0,1296.5,1417.0,1336.5,1411.5,1411.5,1409.5,...,1701.0,1716.5,1708.5,1730.5,1746.0,1710.0,1725.0,1732.0,1761.0,1695.0


In [8]:
# Re-attach the identifier columns: concatenate column-wise so that 'person_id'
# and 'birth_year' end up to the right of the columns of data_df_no_id_by.
data_df_interpolated = pd.concat(
    objs=[data_df_no_id_by, data_df_id_by_only],
    axis='columns',
)
data_df_interpolated.head(n=5)

Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_326,e_329,e_330,e_332,e_334,e_343,death_year,inferred_birth_year,person_id,birth_year
0,1484.0,1661.0,1714.0,1491.0,1364.5,1485.0,1404.5,1479.5,1479.5,1477.5,...,1776.5,1798.5,1814.0,1778.0,1793.0,1800.0,1824.0,1763.0,85175,1752.0
1,1381.0,1558.0,1611.0,1388.0,1261.5,1382.0,1301.5,1376.5,1376.5,1374.5,...,1673.5,1695.5,1711.0,1675.0,1690.0,1697.0,1721.0,1660.0,124329,1656.0
2,1156.0,1333.0,1386.0,1163.0,1036.5,1157.0,1076.5,1151.5,1151.5,1149.5,...,1448.5,1470.5,1486.0,1450.0,1465.0,1472.0,1496.0,1435.0,199193,1436.0
3,1154.0,1331.0,1384.0,1161.0,1034.5,1155.0,1074.5,1149.5,1149.5,1147.5,...,1446.5,1468.5,1484.0,1448.0,1463.0,1470.0,1494.0,1433.0,199009,1436.0
4,1416.0,1593.0,1646.0,1423.0,1296.5,1417.0,1336.5,1411.5,1411.5,1409.5,...,1708.5,1730.5,1746.0,1710.0,1725.0,1732.0,1761.0,1695.0,83307,1696.0


In [9]:
# Load the validation set from validation_set.csv.
validation_df = pd.read_csv(
    filepath_or_buffer='validation_set.csv',
    encoding='utf-8-sig',
)
print(validation_df.shape)
validation_df.head(n=5)

(1652, 638)


Unnamed: 0,k_2_by,k_4_by,k_6_by,k_8_by,k_22_by,k_23_by,k_24_by,k_25_by,k_26_by,k_28_by,...,e_323,e_326,e_329,e_330,e_332,e_334,e_343,person_id,birth_year,death_year
0,,,,,,,,,,,...,,,,,,,,62632,1732.0,1784.0
1,,,,,,,,,,,...,,,,,,,,34345,1824.0,1890.0
2,,,,,,,,,,,...,,,,,,,,206202,1542.0,
3,,,,,,,,,,,...,,,,,,,,142346,635.0,698.0
4,,,,,,,,,,,...,,,,,,,,131203,1528.0,1591.0


In [10]:
# Create data_df_interpolated_no_validation by removing the rows whose person_id appears in validation_set.csv (this step is currently commented out)



# data_df_interpolated_no_validation = data_df_interpolated[~data_df_interpolated['person_id'].isin(validation_df['person_id'])]
# print(data_df_interpolated.shape)
# print(data_df_interpolated_no_validation.shape)
# data_df_interpolated_no_validation.head()

In [11]:
# Write the interpolated frame to data_interpolated.csv: header row included, index omitted.
data_df_interpolated.to_csv(
    path_or_buf='data_interpolated.csv',
    encoding='utf-8-sig',
    header=True,
    index=False,
)

# Save the data with no validation records to data_interpolated_no_validation.csv
# data_df_interpolated_no_validation.to_csv('data_interpolated_no_validation.csv', index=False, header=True, encoding='utf-8-sig')

In [12]:
# create data_df_interpolated as a copy of data_df
# data_df_interpolated = data_df.copy()

In [13]:
# For each column (except 'birth_year' and 'person_id'), subtract the 'birth_year' column from it and take the median of the differences (the code below uses .median(), not the mean).
# Then fill each column's missing values with that median plus the row's birth year.

# for col in data_df_interpolated.columns:
#     if col not in ['birth_year', 'person_id']:
#         avg = data_df_interpolated[col] - data_df['birth_year']
#         data_df_interpolated[col] = data_df_interpolated[col].fillna(avg.median() + data_df_interpolated['birth_year'])
# data_df_interpolated
