The following script selects the 353 methylation sites (CpG probes) listed in [DNA methylation age of human tissues and cell types](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4015143/), looks them up in the Infinium HumanMethylation450K human methylation dataset, and saves their row indices to a pickle file.

In [1]:
import pandas as pd

In [3]:
# Read the supplementary table from "DNA methylation age of human tissues and
# cell types", skipping the first three lines of the file and keeping only the
# first column (the probe identifiers).
# NOTE(review): the surviving column is headed '(Intercept)', which suggests
# skiprows=3 consumes the file's real header row -- confirm against the CSV.
horvath_df = pd.read_csv('../../data/input/horvath.csv', skiprows=3).iloc[:, [0]]

# Report the frame's dimensions, then preview its first rows
print("Dataset shape:", horvath_df.shape)
print(horvath_df.head())

Dataset shape: (353, 1)
  (Intercept)
0  cg00075967
1  cg00374717
2  cg00864867
3  cg00945507
4  cg01027739


In [4]:
# Read the Infinium HumanMethylation450K manifest (first seven lines skipped),
# keep the columns at positions 1 and 15, and drop the last 916 rows.
# NOTE(review): the 916 trailing rows are presumably the control-probe section
# of the manifest -- confirm the count if the input file ever changes.
infinium_df = (
    pd.read_csv('../../data/input/humanmethylation.csv', skiprows=7, low_memory=False)
    .iloc[:, [1, 15]]  # columns shown as Name / Coordinate_36 in the preview
    .iloc[:-916]       # trim the trailing non-probe rows
)

# Report the frame's dimensions, then preview its first rows
print("Dataset shape:", infinium_df.shape)
print(infinium_df.head())

Dataset shape: (485512, 2)
         Name Coordinate_36
0  cg00035864       8613009
1  cg00050873       9973356
2  cg00061679      23723559
3  cg00063477      21151183
4  cg00121626      20123684


In [5]:
# Flag every manifest row whose probe name appears in the Horvath site list,
# then collect the index labels of the flagged rows (in manifest order).
in_horvath = infinium_df['Name'].isin(horvath_df['(Intercept)'])
indices_list = infinium_df.loc[in_horvath].index.tolist()

In [6]:
# Show how many sites were matched, followed by the matched indices themselves
print(len(indices_list), indices_list, sep='\n')

353
[14759, 16069, 21881, 23857, 25045, 25201, 26698, 27862, 28972, 29194, 29425, 29887, 31616, 35099, 37711, 38218, 39873, 43272, 44616, 44661, 44662, 45218, 45266, 46080, 47864, 48863, 50643, 51872, 53071, 55695, 56356, 57539, 61573, 61853, 63032, 64398, 66887, 69673, 70005, 72596, 72759, 73748, 74891, 75642, 77286, 77420, 80571, 83055, 84976, 85685, 87352, 90761, 95845, 96597, 96859, 97489, 97592, 98072, 99657, 101606, 105054, 105209, 105522, 106562, 106685, 108291, 109291, 110501, 111071, 111104, 112328, 112633, 114124, 114623, 115404, 116437, 116567, 116835, 117527, 120823, 121041, 123415, 123486, 126267, 130883, 131143, 131951, 132023, 133625, 134073, 135578, 137055, 138571, 142976, 146039, 147092, 148246, 153652, 156460, 158129, 158404, 159771, 160843, 162789, 165685, 166651, 168184, 169879, 172685, 178374, 178482, 182842, 184070, 184343, 184852, 185282, 187731, 193373, 194365, 198330, 198579, 198608, 201114, 201597, 201877, 204453, 204944, 205438, 205972, 209814, 215544, 215591

In [None]:
import pickle

# Persist the matched row indices as a pickle file for later use
with open('../../data/output/indices_list_horvath.pkl', 'wb') as pkl_file:
    pickle.dump(indices_list, pkl_file)