The following script locates the 513 methylation sites listed in [An epigenetic biomarker of aging for lifespan and healthspan](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5940111/) within the Infinium HumanMethylation450K human methylation dataset and saves their row indices to a pickle file.

In [1]:
import pandas as pd

In [2]:
# Read the supplementary table from "An epigenetic biomarker of aging for
# lifespan and healthspan" (the file's second line is skipped) and keep only
# the columns at positions 0 and 2.
levine_df = pd.read_csv(
    '../../data/input/aging.csv',
    skiprows=[1],
).iloc[:, [0, 2]]

# Report the table's dimensions.
print("Dataset shape:", levine_df.shape)

# Preview the first rows.
print(levine_df.head())

Dataset shape: (513, 2)
          CpG   Map Info
0  cg15611364   25806427
1  cg17605084   53177758
2  cg26382071    6485627
3  cg12743894   30301513
4  cg19287114  107046432


In [3]:
# Number of rows trimmed from the end of the manifest. The original notebook
# describes these rows as control data -- NOTE(review): confirm the count
# against the manifest file actually in use.
trailing_rows_dropped = 916

# Read the Infinium HumanMethylation450K manifest, skipping its first 7 lines.
# low_memory=False makes pandas parse the file in a single pass.
infinium_df = (
    pd.read_csv(
        '../../data/input/humanmethylation.csv',
        skiprows=7,
        low_memory=False,
    )
    .iloc[:, [1, 15]]                   # keep only the columns at positions 1 and 15
    .iloc[:-trailing_rows_dropped]      # discard the trailing rows
)

# Report the table's dimensions.
print("Dataset shape:", infinium_df.shape)

# Preview the first rows.
print(infinium_df.head())

Dataset shape: (485512, 2)
         Name Coordinate_36
0  cg00035864       8613009
1  cg00050873       9973356
2  cg00061679      23723559
3  cg00063477      21151183
4  cg00121626      20123684


In [6]:
# Determine indices
# Flag the manifest rows whose 'Name' value appears in the Levine 'CpG' column,
# then collect the index labels of the flagged rows as a plain Python list
# (the list follows the manifest's row order).
is_levine_site = infinium_df['Name'].isin(levine_df['CpG'])
indices_list = infinium_df.loc[is_levine_site].index.tolist()

In [7]:
# Show the number of matched rows on one line and the indices on the next.
print(len(indices_list), indices_list, sep='\n')

513
[12844, 14759, 15518, 19196, 20395, 20466, 21288, 21636, 24192, 24761, 26264, 27070, 27381, 28475, 28493, 28972, 29089, 29725, 30518, 30532, 30706, 31502, 31859, 35741, 35763, 37051, 38165, 40763, 40941, 40978, 41017, 41385, 43100, 43359, 44000, 44086, 45294, 45406, 45482, 50053, 51094, 51370, 52686, 54695, 54699, 54999, 55057, 57028, 57876, 58928, 60855, 61106, 61853, 61955, 64778, 66430, 68735, 70006, 70289, 72759, 74021, 77807, 78619, 79770, 81542, 82750, 86927, 87352, 87827, 89558, 90808, 92201, 92634, 93817, 94308, 96573, 96663, 97317, 97548, 97561, 97894, 98704, 99669, 100120, 100403, 101078, 102440, 104030, 104104, 104942, 105583, 106133, 106461, 106685, 106820, 107043, 107982, 108202, 108206, 108645, 109089, 109220, 109291, 109385, 111091, 111104, 112074, 114001, 114600, 115376, 115649, 117380, 117527, 118158, 118677, 119416, 119794, 120753, 123973, 126542, 127786, 128758, 129137, 129344, 129424, 129965, 130073, 132968, 134254, 135662, 137435, 138203, 139746, 141687, 144267

In [None]:
import pickle

# Destination of the serialized index list.
output_path = '../../data/output/indices_list.pkl'

# Write the list in binary mode; the context manager closes the file
# even if pickling fails.
with open(output_path, 'wb') as pkl_file:
    pickle.dump(indices_list, pkl_file)