In [1]:
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor

In [2]:
DMR_FEATURE_NAMES = ['2mo', '9mo', '18mo', '9mo-2mo', '18mo-9mo', '18mo-2mo', 'log2(gene_length)', 'log2(r_length)', 'log2(r_length/gene_length)', 'log2(distance)']
CG_GENEBODY_FEATURE_NAMES = ['2mo', '9mo', '18mo', '9mo-2mo', '18mo-9mo', '18mo-2mo', 'log2(gene_length)','(9mo-2mo)*log2(gene_length)',  '(18mo-2mo)*log2(gene_length)', 
                            '(18mo-9mo)*log2(gene_length)','DMG','pvalue']
CH_GENEBODY_FEATURE_NAMES = ['2mo', '9mo', '18mo', '9mo-2mo', '18mo-9mo', '18mo-2mo', 'log2(gene_length)','(9mo-2mo)*log2(gene_length)',  '(18mo-2mo)*log2(gene_length)', 
                            '(18mo-9mo)*log2(gene_length)','DMG','pvalue']
ATAC_FEATURE_NAMES = ['2mo', '9mo', '18mo', 'log2(9mo/2mo)', 'log2(18mo/9mo)', 'log2(18mo/2mo)', 'log2(gene_length)', 'log2(distance)','DAR']
HIC_FEATURE_NAMES = [ 'Tanova', '2mo.Q', '9mo.Q', '18mo.Q','9mo-2mo.Q','18mo-9mo.Q', '18mo-2mo.Q',
                        'log2(gene_length)', 'log2(anchor1_distance)','log2(anchor2_distance)','Diff_Loop'] #'Qanova', 'Eanova',,'2mo.T', '9mo.T', '18mo.T','9mo-2mo.T', '18mo-9mo.T', '18mo-2mo.T', 
ABC_DMR_NAMES = ['2mo.activity', '2mo.contact', '2mo.abc_score', '9mo.activity','9mo.contact', '9mo.abc_score', 
                    '18mo.activity', '18mo.contact','18mo.abc_score', 'log2(eg_distance)','log2(gene_length)','log2(contact_distance)']
ABC_peak_NAMES = ['2mo.activity', '2mo.contact', '2mo.abc_score', '9mo.activity','9mo.contact', '9mo.abc_score', 
                    '18mo.activity', '18mo.contact','18mo.abc_score', 'log2(eg_distance)','log2(gene_length)','log2(contact_distance)']


In [4]:

DATA_FEATURE_NAMES = {
    'dmr': DMR_FEATURE_NAMES,
    'mcg_genebody': CG_GENEBODY_FEATURE_NAMES,
    'mch_genebody': CH_GENEBODY_FEATURE_NAMES,
    'atac': ATAC_FEATURE_NAMES,
    'hic_loop': HIC_FEATURE_NAMES,
    'hic_abc_dmr':ABC_DMR_NAMES,
    'hic_abc_peak':ABC_peak_NAMES 
}

DATA_FEATURE_NAMES_LIST = list(DATA_FEATURE_NAMES.keys())

In [23]:
ct = 'Oligo_NN'

In [24]:
def get_dmr_feat(ct):
    dmr = pd.read_csv(f'ml_input/{ct}/{ct}.aDMR_gene.csv')
    dmr_feat = dmr
    dmr_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    dmr_feat['9mo-2mo'] = dmr_feat['9mo'] - dmr_feat['2mo']
    dmr_feat['18mo-9mo'] = dmr_feat['18mo'] - dmr_feat['9mo']
    dmr_feat['18mo-2mo'] = dmr_feat['18mo'] - dmr_feat['2mo']
    dmr_feat['log2(gene_length)'] = np.log2((dmr_feat['gene_end'] - dmr_feat['gene_start']).abs().astype(np.float64))
    dmr_feat['log2(r_length)'] = np.log2((dmr_feat['end'] - dmr_feat['start']).abs().astype(np.float64))
    dmr_feat['log2(r_length/gene_length)'] = np.log2((dmr_feat['end'] - dmr_feat['start'])/(dmr_feat['gene_end'] - dmr_feat['gene_start']))
    dmr_feat['log2(distance)'] = np.log2((dmr_feat['gene_start'] - dmr_feat['start']).abs().astype(np.float64))
    dmr_feat = dmr_feat[['gene', *DMR_FEATURE_NAMES]]
    assert dmr_feat.isna().sum().sum() == 0
    assert dmr_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed dmr data')
    return dmr_feat

dmr_feat= get_dmr_feat(ct)
dmr_feat.head()

Processed dmr data


Unnamed: 0,gene,2mo,9mo,18mo,9mo-2mo,18mo-9mo,18mo-2mo,log2(gene_length),log2(r_length),log2(r_length/gene_length),log2(distance)
0,Rgs20,0.65,0.6,0.9,-0.05,0.3,0.25,17.32956,5.0,-12.32956,17.20649
1,Sulf1,0.36,0.52,0.56,0.16,0.04,0.2,17.399703,7.924813,-9.474891,16.88709
2,Sulf1,0.43,0.59,0.64,0.16,0.05,0.21,17.399703,5.857981,-11.541723,17.37638
3,Eya1,0.68,0.62,0.47,-0.06,-0.15,-0.21,17.148487,2.321928,-14.826558,17.0741
4,Eya1,0.61,0.37,0.45,-0.24,0.08,-0.16,17.148487,2.321928,-14.826558,17.076962


In [25]:
def get_genebody_feat(ct):
    mcg_genebody = pd.read_csv(f'ml_input/{ct}/{ct}.mCG_genebody_gene.csv')
    mcg_genebody_feat = mcg_genebody
    mcg_genebody_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    mcg_genebody_feat['9mo-2mo'] = mcg_genebody_feat['9mo'] - mcg_genebody_feat['2mo']
    mcg_genebody_feat['18mo-9mo'] = mcg_genebody_feat['18mo'] - mcg_genebody_feat['9mo']
    mcg_genebody_feat['18mo-2mo'] = mcg_genebody_feat['18mo'] - mcg_genebody_feat['2mo']
    mcg_genebody_feat['log2(gene_length)'] = np.log2(mcg_genebody_feat['gene_length'])
    mcg_genebody_feat['(9mo-2mo)*log2(gene_length)'] = mcg_genebody_feat['9mo-2mo'] * mcg_genebody_feat['log2(gene_length)']
    mcg_genebody_feat['(18mo-9mo)*log2(gene_length)'] = mcg_genebody_feat['18mo-9mo'] * mcg_genebody_feat['log2(gene_length)']
    mcg_genebody_feat['(18mo-2mo)*log2(gene_length)'] = mcg_genebody_feat['18mo-2mo'] * mcg_genebody_feat['log2(gene_length)']
    mcg_genebody_feat = mcg_genebody_feat[['gene', *CG_GENEBODY_FEATURE_NAMES]]
    mcg_genebody_feat= mcg_genebody_feat.dropna()
    assert mcg_genebody_feat.isna().sum().sum() == 0
    assert mcg_genebody_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed mCG genebody data')
    return mcg_genebody_feat

mcg_genebody_feat= get_genebody_feat(ct)
mcg_genebody_feat.head()

Processed mCG genebody data


Unnamed: 0,gene,2mo,9mo,18mo,9mo-2mo,18mo-9mo,18mo-2mo,log2(gene_length),(9mo-2mo)*log2(gene_length),(18mo-2mo)*log2(gene_length),(18mo-9mo)*log2(gene_length),DMG,pvalue
0,4932422M17Rik,0.30568,0.373184,0.407544,0.067504,0.034361,0.101865,11.365229,0.767203,1.157718,0.390515,-1,2.498424e-14
1,Phox2a,0.603334,0.66002,0.720904,0.056687,0.060884,0.11757,12.107544,0.686336,1.42349,0.737154,-1,6.708909e-18
2,C130021I20Rik,0.369544,0.423282,0.469904,0.053738,0.046623,0.100361,12.333435,0.662773,1.237792,0.57502,-1,5.804052000000001e-17
3,AI606473,0.354029,0.415628,0.454339,0.061599,0.03871,0.100309,11.745254,0.723495,1.178158,0.454663,-1,5.628330000000001e-17
4,Hoxc12,0.360452,0.427698,0.47947,0.067245,0.051772,0.119017,10.854868,0.729939,1.291918,0.561979,-1,1.9686609999999998e-19


In [32]:
mcg_genebody_feat[mcg_genebody_feat['gene'] == 'Il33']

Unnamed: 0,gene,2mo,9mo,18mo,9mo-2mo,18mo-9mo,18mo-2mo,log2(gene_length),(9mo-2mo)*log2(gene_length),(18mo-2mo)*log2(gene_length),(18mo-9mo)*log2(gene_length),DMG,pvalue
185,Il33,1.06013,0.934792,0.882611,-0.125338,-0.052182,-0.17752,15.119752,-1.895078,-2.684051,-0.788973,1,3.518584e-121


In [26]:
def get_mch_genebody_feat(ct):
    mch_genebody = pd.read_csv(f'ml_input/{ct}/{ct}.mCH_genebody_gene.csv')
    mch_genebody_feat = mch_genebody
    mch_genebody_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    mch_genebody_feat['9mo-2mo'] = mch_genebody_feat['9mo'] - mch_genebody_feat['2mo']
    mch_genebody_feat['18mo-9mo'] = mch_genebody_feat['18mo'] - mch_genebody_feat['9mo']
    mch_genebody_feat['18mo-2mo'] = mch_genebody_feat['18mo'] - mch_genebody_feat['2mo']
    mch_genebody_feat['log2(gene_length)'] = np.log2(mch_genebody_feat['gene_length'])
    mch_genebody_feat['(9mo-2mo)*log2(gene_length)'] = mch_genebody_feat['9mo-2mo'] * mch_genebody_feat['log2(gene_length)']
    mch_genebody_feat['(18mo-9mo)*log2(gene_length)'] = mch_genebody_feat['18mo-9mo'] * mch_genebody_feat['log2(gene_length)']
    mch_genebody_feat['(18mo-2mo)*log2(gene_length)'] = mch_genebody_feat['18mo-2mo'] * mch_genebody_feat['log2(gene_length)']
    mch_genebody_feat = mch_genebody_feat[['gene', *CH_GENEBODY_FEATURE_NAMES]]
    mch_genebody_feat= mch_genebody_feat.dropna()
    assert mch_genebody_feat.isna().sum().sum() == 0
    assert mch_genebody_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed mCH genebody data')
    return mch_genebody_feat

mch_genebody_feat= get_mch_genebody_feat(ct)
mch_genebody_feat.head()

Processed mCH genebody data


Unnamed: 0,gene,2mo,9mo,18mo,9mo-2mo,18mo-9mo,18mo-2mo,log2(gene_length),(9mo-2mo)*log2(gene_length),(18mo-2mo)*log2(gene_length),(18mo-9mo)*log2(gene_length),DMG,pvalue
0,Tcerg1l,0.936379,1.014998,1.051771,0.078619,0.036773,0.115392,17.526163,1.377889,2.022385,0.644495,-1,6.793986e-08
1,Myo1e,1.349795,1.429854,1.450262,0.080059,0.020408,0.100467,17.553869,1.405346,1.763582,0.358236,-1,2.634381e-05
2,Serpina10,0.696458,0.794098,0.819632,0.09764,0.025534,0.123174,14.023928,1.369302,1.727389,0.358087,-1,1.997087e-05
3,St3gal4,1.399687,1.499819,1.536338,0.100133,0.036519,0.136652,16.149509,1.617096,2.206861,0.589765,-1,1.587289e-05
4,AL607142.1,2.403439,2.569563,2.813969,0.166124,0.244406,0.41053,14.822621,2.462388,6.085125,3.622737,-1,1.9722410000000002e-18


In [33]:
mch_genebody_feat[mch_genebody_feat['gene'] == 'Il33']

Unnamed: 0,gene,2mo,9mo,18mo,9mo-2mo,18mo-9mo,18mo-2mo,log2(gene_length),(9mo-2mo)*log2(gene_length),(18mo-2mo)*log2(gene_length),(18mo-9mo)*log2(gene_length),DMG,pvalue
1051,Il33,1.256478,1.086544,1.04196,-0.169933,-0.044584,-0.214517,15.119752,-2.569351,-3.243449,-0.674098,1,3.025557e-18


In [27]:
def get_atac_feat(ct):
    atac = pd.read_csv(f'ml_input/{ct}/{ct}.peak_gene.csv')
    atac_feat = atac
    atac_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    atac_feat['log2(9mo/2mo)'] = np.log2(atac_feat['9mo'] + 1e-10) - np.log2(atac_feat['2mo'] + 1e-10)
    atac_feat['log2(18mo/9mo)'] = np.log2(atac_feat['18mo'] + 1e-10) - np.log2(atac_feat['9mo'] + 1e-10)
    atac_feat['log2(18mo/2mo)'] = np.log2(atac_feat['18mo'] + 1e-10) - np.log2(atac_feat['2mo'] + 1e-10)
    atac_feat['log2(gene_length)'] = np.log2((atac_feat['gene_end'] - atac_feat['gene_start']).abs().astype(np.float64) + 1e-10)
    atac_feat['log2(distance)'] = np.log2(atac_feat['distance'] + 1e-10)
    atac_feat = atac_feat[['gene', *ATAC_FEATURE_NAMES]]
    #check if any na or inf 
    assert atac_feat.isna().sum().sum() == 0
    assert atac_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed atac data')
    return atac_feat

atac_feat= get_atac_feat(ct)
atac_feat.head()

Processed atac data


Unnamed: 0,gene,2mo,9mo,18mo,log2(9mo/2mo),log2(18mo/9mo),log2(18mo/2mo),log2(gene_length),log2(distance),DAR
0,Xkr4,4.201271,2.351554,2.886472,-0.837212,0.295693,-0.541519,18.841064,13.10689,-1.0
1,Xkr4,0.685732,0.487387,0.547499,-0.492577,0.167787,-0.324789,18.841064,13.674413,0.0
2,Xkr4,1.663133,0.736417,0.729998,-1.175308,-0.01263,-1.187939,18.841064,15.941758,0.0
3,Xkr4,0.561618,0.416236,0.402244,-0.432187,-0.04933,-0.481518,18.841064,15.929004,0.0
4,Xkr4,3.608625,2.458281,2.852952,-0.5538,0.214806,-0.338994,18.841064,16.875557,0.0


In [34]:
atac_feat[atac_feat['gene'] == 'Il33']

Unnamed: 0,gene,2mo,9mo,18mo,log2(9mo/2mo),log2(18mo/9mo),log2(18mo/2mo),log2(gene_length),log2(distance),DAR
74393,Il33,6.879039,9.395541,9.374516,0.449769,-0.003232,0.446537,15.273359,10.743151,1.0
74394,Il33,2.649842,3.792369,3.605297,0.517193,-0.072981,0.444212,15.273359,11.507795,1.0
74395,Il33,1.635207,2.959898,3.124839,0.856074,0.078234,0.934309,15.273359,11.891784,1.0
74396,Il33,1.418007,3.280079,3.534532,1.209866,0.107788,1.317655,15.273359,12.26092,1.0
74397,Il33,0.781921,1.291398,1.284946,0.72384,-0.007226,0.716613,15.273359,13.170238,0.0
74398,Il33,0.837772,1.188228,1.199283,0.504182,0.01336,0.517542,15.273359,13.423904,0.0
74399,Il33,0.875006,1.398125,1.523312,0.676128,0.123719,0.799846,15.273359,13.315999,0.0
74400,Il33,0.933961,1.632924,1.623873,0.806024,-0.008019,0.798005,15.273359,13.796648,0.0
74401,Il33,0.940166,1.362549,1.411578,0.53532,0.051,0.586321,15.273359,14.057146,0.0
74402,Il33,0.890521,1.394567,1.303568,0.647097,-0.097352,0.549745,15.273359,14.132901,0.0


In [28]:
def get_hic_feat(ct):
    hic = pd.read_csv(f'ml_input/{ct}/{ct}.Loop_gene.csv.gz')
    hic_feat = hic
    hic_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    hic_feat['9mo-2mo.Q'] = hic_feat['9mo.Q'] - hic_feat['2mo.Q']
    hic_feat['18mo-9mo.Q'] = hic_feat['18mo.Q'] - hic_feat['9mo.Q']
    hic_feat['18mo-2mo.Q'] = hic_feat['18mo.Q'] - hic_feat['2mo.Q']
    hic_feat['9mo-2mo.T'] = hic_feat['9mo.T'] - hic_feat['2mo.T']
    hic_feat['18mo-9mo.T'] = hic_feat['18mo.T'] - hic_feat['9mo.T']
    hic_feat['18mo-2mo.T'] = hic_feat['18mo.T'] - hic_feat['2mo.T']
    hic_feat['log2(gene_length)'] = np.log2(hic_feat['gene_length'] )
    hic_feat['log2(anchor1_distance)'] = np.log2(hic_feat['anchor1_distance'] + 10000) #10000 i the loop resolution
    hic_feat['log2(anchor2_distance)'] = np.log2(hic_feat['anchor2_distance'] + 10000)

    hic_feat = hic_feat[['gene', *HIC_FEATURE_NAMES]]
    assert hic_feat.isna().sum().sum() == 0
    assert hic_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed hic loop data')
    return hic_feat

hic_feat= get_hic_feat(ct)
hic_feat.head()

Processed hic loop data


Unnamed: 0,gene,Tanova,2mo.Q,9mo.Q,18mo.Q,9mo-2mo.Q,18mo-9mo.Q,18mo-2mo.Q,log2(gene_length),log2(anchor1_distance),log2(anchor2_distance),Diff_Loop
0,Xkr4,1.342627,0.004626,0.00353,0.00418,-0.001097,0.00065,-0.000446,18.841064,13.762901,17.341684,0
1,Xkr4,2.40551,0.004112,0.003147,0.003994,-0.000965,0.000847,-0.000118,18.841064,13.762901,17.426027,0
2,Xkr4,2.424875,0.003668,0.003451,0.0039,-0.000217,0.000449,0.000232,18.841064,13.762901,17.505711,0
3,Xkr4,1.582044,0.004151,0.003263,0.004204,-0.000889,0.000941,5.3e-05,18.841064,13.762901,17.652978,0
4,Xkr4,0.688259,0.00371,0.003151,0.004154,-0.000559,0.001003,0.000444,18.841064,13.762901,17.721333,0


In [35]:
hic_feat[hic_feat['gene'] == 'Il33']

Unnamed: 0,gene,Tanova,2mo.Q,9mo.Q,18mo.Q,9mo-2mo.Q,18mo-9mo.Q,18mo-2mo.Q,log2(gene_length),log2(anchor1_distance),log2(anchor2_distance),Diff_Loop
604499,Il33,4.493082,0.002475,0.00299,0.002795,0.000515,-0.000195,0.00032,15.273359,13.634584,18.209501,0
604501,Il33,2.393821,0.002789,0.003425,0.003549,0.000636,0.000124,0.00076,15.273359,14.471548,18.161102,0
604502,Il33,2.576397,0.002809,0.003558,0.003381,0.000749,-0.000177,0.000572,15.273359,13.634584,18.161102,0
604505,Il33,1.731375,0.002565,0.002779,0.003655,0.000213,0.000876,0.00109,15.273359,14.714568,18.111024,0
604506,Il33,0.195316,0.003173,0.003104,0.003735,-7e-05,0.000631,0.000562,15.273359,14.471548,18.111024,0
604507,Il33,0.175255,0.003561,0.00354,0.00385,-2.1e-05,0.00031,0.000289,15.273359,13.634584,18.111024,0
604512,Il33,0.290103,0.003423,0.003302,0.003771,-0.000121,0.000469,0.000348,15.273359,14.471548,18.059144,0
604513,Il33,0.708938,0.004024,0.003707,0.00429,-0.000317,0.000584,0.000266,15.273359,13.634584,18.059144,0
604516,Il33,0.656374,0.00348,0.003514,0.003818,3.4e-05,0.000304,0.000338,15.273359,14.471548,18.005328,0
604517,Il33,0.842623,0.004392,0.00397,0.004523,-0.000422,0.000552,0.000131,15.273359,13.634584,18.005328,0


In [29]:
def get_abc_dmr_feat(ct):
    abc_dmr = pd.read_csv(f'ml_input/{ct}/{ct}.abc_enhancer.DMR_gene.csv').fillna(0)
    abc_dmr_feat = abc_dmr
    abc_dmr_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    abc_dmr_feat['log2(eg_distance)'] = np.log2(np.minimum(abs(abc_dmr_feat['start'] - abc_dmr_feat['gene_start']), abs(abc_dmr_feat['start'] - abc_dmr_feat['gene_end'])) + 10000)
    abc_dmr_feat['log2(gene_length)'] = np.log2(abc_dmr_feat['gene_end'] - abc_dmr_feat['gene_start'])
    abc_dmr_feat['log2(contact_distance)'] = np.log2(abs(abc_dmr_feat['end'] - abc_dmr_feat['start']) + 10000)
    abc_dmr_feat = abc_dmr_feat[['gene', *ABC_DMR_NAMES]]
    assert abc_dmr_feat.isna().sum().sum() == 0
    assert abc_dmr_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed abc dmr data')
    return abc_dmr_feat

abc_dmr_feat= get_abc_dmr_feat(ct)
abc_dmr_feat.head()

Processed abc dmr data


Unnamed: 0,gene,2mo.activity,2mo.contact,2mo.abc_score,9mo.activity,9mo.contact,9mo.abc_score,18mo.activity,18mo.contact,18mo.abc_score,log2(eg_distance),log2(gene_length),log2(contact_distance)
0,Xkr4,0.920623,16.0,0.032218,0.0,0.0,0.0,0.918402,28.0,0.050795,16.972318,18.828722,13.302782
1,Xkr4,0.638158,15.0,0.020937,0.648549,15.0,0.021145,0.698622,18.0,0.02484,17.392376,18.828722,13.34596
2,Xkr4,0.823144,15.0,0.027007,0.778812,15.0,0.025392,0.878011,18.0,0.031218,17.398435,18.828722,13.294764
3,Xkr4,0.693041,14.0,0.021222,0.0,0.0,0.0,0.0,0.0,0.0,17.68342,18.828722,13.332736
4,Xkr4,0.675085,24.0,0.035438,0.727537,15.0,0.02372,0.732392,23.0,0.033274,17.713823,18.828722,13.349558


In [36]:
abc_dmr_feat[abc_dmr_feat['gene'] == 'Il33']

Unnamed: 0,gene,2mo.activity,2mo.contact,2mo.abc_score,9mo.activity,9mo.contact,9mo.abc_score,18mo.activity,18mo.contact,18mo.abc_score,log2(eg_distance),log2(gene_length),log2(contact_distance)
93716,Il33,0.528512,44.0,0.023338,0.599825,62.0,0.029967,0.637267,64.0,0.033086,16.556446,15.119752,13.31939
93717,Il33,0.561667,41.0,0.023111,0.0,0.0,0.0,0.0,0.0,0.0,16.357689,15.119752,13.366322
93718,Il33,0.807369,33.0,0.026739,0.0,0.0,0.0,0.0,0.0,0.0,16.040525,15.119752,13.338458
93719,Il33,0.962441,21.0,0.020284,0.948346,29.0,0.022161,0.0,0.0,0.0,14.984997,15.119752,13.341797
93720,Il33,0.961083,21.0,0.020255,0.916693,29.0,0.021421,0.0,0.0,0.0,14.824213,15.119752,13.315291
93721,Il33,0.782645,40.0,0.031418,0.914846,42.0,0.030962,0.899552,47.0,0.034298,13.333994,15.119752,13.377753
93722,Il33,0.546898,40.0,0.021954,0.826472,42.0,0.027971,0.847568,47.0,0.032316,13.447212,15.119752,13.362218
134601,Il33,0.0,0.0,0.0,0.602665,49.0,0.023796,0.0,0.0,0.0,16.90926,15.119752,13.343047
134602,Il33,0.0,0.0,0.0,0.597155,49.0,0.023578,0.0,0.0,0.0,16.878769,15.119752,13.336646
134603,Il33,0.0,0.0,0.0,0.574907,49.0,0.0227,0.0,0.0,0.0,16.861063,15.119752,13.338179


In [30]:
def get_abc_peak_feat(ct):
    abc_peak =  pd.read_csv(f'ml_input/{ct}/{ct}.abc_enhancer.peak_gene.csv').fillna(0)
    abc_peak_feat = abc_peak
    abc_peak_feat.rename(columns={'gene_name': 'gene'}, inplace=True)
    abc_peak_feat['log2(eg_distance)'] = np.log2(np.minimum(abs(abc_peak_feat['start'] - abc_peak_feat['gene_start']), abs(abc_peak_feat['start'] - abc_peak_feat['gene_end'])) + 10000)
    abc_peak_feat['log2(gene_length)'] = np.log2(abc_peak_feat['gene_end'] - abc_peak_feat['gene_start'])
    abc_peak_feat['log2(contact_distance)'] = np.log2(abs(abc_peak_feat['end'] - abc_peak_feat['start']) + 10000)
    abc_peak_feat = abc_peak_feat[['gene', *ABC_peak_NAMES]]
    assert abc_peak_feat.isna().sum().sum() == 0
    assert abc_peak_feat.isin([np.inf, -np.inf]).sum().sum() == 0
    print('Processed abc peak data')
    return abc_peak_feat

abc_peak_feat= get_abc_peak_feat(ct)
abc_peak_feat.head()

Processed abc peak data


Unnamed: 0,gene,2mo.activity,2mo.contact,2mo.abc_score,9mo.activity,9mo.contact,9mo.abc_score,18mo.activity,18mo.contact,18mo.abc_score,log2(eg_distance),log2(gene_length),log2(contact_distance)
0,Xkr4,11.936699,10.0,0.035034,8.346058,12.0,0.03147,9.832627,12.0,0.034418,17.071127,18.828722,13.358239
1,Xkr4,11.800173,15.0,0.05195,9.004209,15.0,0.042439,9.840076,18.0,0.051667,17.394555,18.828722,13.358239
2,Xkr4,4.691523,16.0,0.022031,0.0,0.0,0.0,4.543866,20.0,0.026509,17.347656,18.828722,13.358239
3,Xkr4,45.844744,16.0,0.215288,38.816642,16.0,0.195151,40.410608,17.0,0.200394,13.293328,18.828722,13.358239
4,Xkr4,13.885295,16.0,0.065206,10.711842,16.0,0.053854,12.186499,17.0,0.060432,13.377482,18.828722,13.358239


In [38]:
df = pd.read_csv(f'ml_input/{ct}/{ct}.luisa_RNA_DEG.csv', index_col =0)
gene2value = df[['DEG']]

In [41]:
 DATA = {}

FEATURE_LOADING_FUNCTIONS = {
    'dmr': get_dmr_feat,
    'mcg_genebody': get_genebody_feat,
    'mch_genebody': get_mch_genebody_feat,
    'atac': get_atac_feat,
    'hic_loop': get_hic_feat,
    'hic_abc_dmr': get_abc_dmr_feat,
    'hic_abc_peak': get_abc_peak_feat
}

tasks = [(name, FEATURE_LOADING_FUNCTIONS[name]) for name in DATA_FEATURE_NAMES_LIST]

In [43]:
with ThreadPoolExecutor() as executor:
    # Submit all tasks and store futures
    futures = {
        name: executor.submit(func, ct) 
        for name, func in tasks
    }
    
    # Get results as they complete and store in DATA
    for name, future in futures.items():
        DATA[name] = future.result()

Processed dmr data
Processed mCH genebody data
Processed mCG genebody data
Processed atac data
Processed abc dmr data
Processed abc peak data
Processed hic loop data


In [48]:
X = {}
# Step 1: Prepare the data
print('Preparing data')

def prepare_data(feature_type):
    features = DATA[feature_type]
    feature_names = DATA_FEATURE_NAMES[feature_type]
    list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
    list_feat = list_feat.reindex(index_order, fill_value=[[0] * len(feature_names)])
    return list_feat.values.tolist()


Preparing data


In [58]:
feature_type = 'dmr'
features = DATA[feature_type]
feature_names = DATA_FEATURE_NAMES[feature_type]
list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
list_feat = list_feat.reindex(index_order, fill_value=[[0] * len(feature_names)])

  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())


In [None]:
with ThreadPoolExecutor() as executor:
    # Submit all tasks and store futures
    futures = {
        feature_type: executor.submit(prepare_data, feature_type) 
        for feature_type in DATA
    }
    
    # Get results as they complete and store in DATA
    for feature_type, future in futures.items():
        X[feature_type] = future.result()

# y = gene2value['DEG'].values.tolist()
# y = np.array([int(i) for i in y])

y = gene2value.values

In [64]:
def load_data(ct):
    df = pd.read_csv(f'ml_input/{ct}/{ct}.luisa_RNA_DEG.csv', index_col =0)
    #df.set_index('gene', inplace=True)

    # non_zero_genes = df[df['DEG'] != 0].index
    # df = df[df.index.isin(non_zero_genes)]
    #gene2value = df[['-log10(fdr)','log2(old/young)', 'DEG']]
    gene2value = df[['DEG']]

    # df = df[df.index.isin(non_zero_genes)]
    # use all pleak/loop, adding columns as pvalue, anova, 
 
    DATA = {}

    FEATURE_LOADING_FUNCTIONS = {
        'dmr': get_dmr_feat,
        'mcg_genebody': get_genebody_feat,
        'mch_genebody': get_mch_genebody_feat,
        'atac': get_atac_feat,
        'hic_loop': get_hic_feat,
        'hic_abc_dmr': get_abc_dmr_feat,
        'hic_abc_peak': get_abc_peak_feat
    }
    
    # Define tasks to run in parallel
    tasks = [
        (name, FEATURE_LOADING_FUNCTIONS[name]) for name in DATA_FEATURE_NAMES_LIST
    ]

    # Run tasks in parallel using ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        # Submit all tasks and store futures
        futures = {
            name: executor.submit(func, ct) 
            for name, func in tasks
        }
        
        # Get results as they complete and store in DATA
        for name, future in futures.items():
            DATA[name] = future.result()

    index_order = gene2value.index.tolist()
    # Train a sequence model on dmr_feat to predict gene2value['log2(old/young)']
    # Each gene has a sequence of 4 features, 2mo, 9mo, 18mo, old-young
    # The sequence length is not fixed, so we need to use a dynamic model
    # Let's use a commonly used sequence prediction model for sentence classification
    # like LSTM or Transformer

    X = {}
    # Step 1: Prepare the data
    print('Preparing data')
    def prepare_data(feature_type):
        features = DATA[feature_type]
        feature_names = DATA_FEATURE_NAMES[feature_type]
        list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
        list_feat = list_feat.reindex(index_order, fill_value=[[0] * len(feature_names)])
        return list_feat.values.tolist()
    with ThreadPoolExecutor() as executor:
        # Submit all tasks and store futures
        futures = {
            feature_type: executor.submit(prepare_data, feature_type) 
            for feature_type in DATA
        }
        
        # Get results as they complete and store in DATA
        for feature_type, future in futures.items():
            X[feature_type] = future.result()

    # y = gene2value['DEG'].values.tolist()
    # y = np.array([int(i) for i in y])

    y = gene2value.values

    return {
        'y': y,
        'X': X,
    }

In [65]:
data = load_data(ct)

Processed dmr data
Processed mCH genebody data
Processed mCG genebody data
Processed atac data
Processed abc dmr data
Processed abc peak data
Processed hic loop data
Preparing data


  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())
  list_feat = features.groupby('gene').apply(lambda x: x[feature_names].values.tolist())


In [74]:
def get_balanced_data(data):
    # Separate the data into zero and non-zero y values
    y = data['y']
    # zero_indices = np.where(y[:, 2] == 0)[0]
    # non_zero_indices = np.where(y[:, 2] != 0)[0]
    zero_indices = np.where(y == 0)[0]
    non_zero_indices = np.where(y != 0)[0]
    print(f'zero: {len(zero_indices)}, non-zero: {len(non_zero_indices)}')

    # Sample len(non_zero_indices) indices from each group
    n_samples = len(non_zero_indices)
    sampled_zero_indices = np.random.choice(zero_indices, n_samples // 2, replace=False)
    sampled_non_zero_indices = np.random.choice(non_zero_indices, n_samples, replace=False)

    # Combine the sampled indices
    sampled_indices = np.concatenate([sampled_zero_indices, sampled_non_zero_indices])

    # Create balanced dataset
    X_balanced = {}

    print('Getting balanced data')
    def index_features(feature_type):
        features = data['X'][feature_type]
        return [features[i] for i in sampled_indices]

    with ThreadPoolExecutor() as executor:
        # Submit all tasks and store futures
        futures = {
            feature_type: executor.submit(index_features, feature_type) 
            for feature_type in data['X']
        }
        # Get results as they complete and store in X_balanced
        for feature_type, future in futures.items():
            X_balanced[feature_type] = future.result()
    y_balanced = data['y'][sampled_indices, :]
    return X_balanced, y_balanced


X_balanced, y_balanced = get_balanced_data(data)

zero: 4009, non-zero: 1537
Getting balanced data


In [82]:
test_flat = [item for sublist in X_balanced['dmr'] for item in sublist]
test_array = np.array(test_flat)

In [67]:
# Normalization function
def normalize_features(train_data, test_data):
    # Flatten the lists for easier processing
    # train data: [N, L, F] -> [N*L, F]
    train_flat = [item for sublist in train_data for item in sublist]
    test_flat = [item for sublist in test_data for item in sublist]
    
    # Convert to numpy arrays
    train_array = np.array(train_flat)
    test_array = np.array(test_flat)
    
    # Normalize all features using min-max scaling based on train data
    min_vals = np.min(train_array, axis=0)
    max_vals = np.max(train_array, axis=0)
    train_normalized = (train_array - min_vals) / (max_vals - min_vals)
    test_normalized = (test_array - min_vals) / (max_vals - min_vals)
    

    assert np.all(max_vals - min_vals != 0), np.where(max_vals - min_vals == 0)
    
    # Reconstruct the data structure
    def reconstruct_data(normalized_array, original_data):
        normalized_data = []
        idx = 0
        for sublist in original_data:
            normalized_sublist = []
            for _ in sublist:
                normalized_sublist.append(normalized_array[idx].tolist())
                idx += 1
            normalized_data.append(normalized_sublist)
        return normalized_data
    
    train_normalized = reconstruct_data(train_normalized, train_data)
    test_normalized = reconstruct_data(test_normalized, test_data)
    
    return train_normalized, test_normalized


