In [13]:
# This script is for analysing if feature and target are paralogs. The code can be integrated into figures.py
# Author: Yiyun

import sys
sys.path.append('../')

from src.ceres_infer.analyses import *
from ast import literal_eval

In [34]:
### Load data and setting
# set color
src_colors = {'CERES':(214/255, 39/255, 40/255, 1.0), #red
              'RNA-seq':(31/255, 119/255, 180/255, 1.0), #blue
              'CN':(255/255, 127/255, 14/255, 1.0), #orange
              'Mut':(44/255, 160/255, 44/255, 1.0), #green
              'Lineage':(188/255, 189/255, 34/255, 1.0), #yellow
              'nan':(220/255, 220/255, 220/255, 1.0)} #grey
# output directory
dir_out = '../out'

# load summary data
dir_in_res = '../out/20.0216 feat/reg_rf_boruta'
dir_in_anlyz = os.path.join(dir_in_res, 'anlyz_filtered')
df_featSummary = pd.read_csv(os.path.join(dir_in_anlyz, 'feat_summary.csv')) #feature summary
df_featSummary['feat_sources'] = df_featSummary['feat_sources'].apply(literal_eval)
df_featSummary['feat_genes'] = df_featSummary['feat_genes'].apply(literal_eval)

In [43]:
### Modified code
# parse geneset data and read line by line
def parseGeneset_m(fname):
    genesets = dict()
    f = open(fname)
    for x in f:
        gs_name = re.sub('\\t\\t.*\\n','',x)
        genes = re.sub('.*\\t\\t','',x).replace('\t\n','').split(sep='\t')
        genes = np.hstack(genes)
        genesets[gs_name] = genes
    f.close()
    
    return genesets

def isInSameGS_m(target, features, genesets):
    #check if both target and feature is in the same geneset
    #target is one value; features can be an array
    #requires: genesets
    if(not isinstance(features, list)):
        features = [features]
        print(features)
    isInBools = [(len(set(features).intersection(gs))>0) and (target not in features) and (target in gs)  for _,gs in genesets.items()]
    return sum(isInBools)>0

# not generating feat suammry; add paralog file; delete combined geneset
def anlyz_varExp_feats_m(feat_summary, gs_dir = '../out/', outdir_sub='../out'):
    # analyze features
    if(np.logical_not(os.path.exists(outdir_sub))): os.mkdir(outdir_sub)
    
    # paralog analysis
    genesets = parseGeneset_m('%s/paralog.txt' % gs_dir)
    sameGs_counts,sameGs_src_counts,feat_summary_annot = getGrpCounts(isInSameGS_m, isInSameGS_sources, feat_summary, genesets)
    plotGrpCounts(sameGs_counts, sameGs_src_counts, feat_summary_annot, 'in paralog list', outdir_sub)

In [44]:
### Just copied the code because importing wasn't successful
def gen_feat_pies(sameGrp_counts, sameGrp_src_counts, feat_summary_annot, dir_out, fnames, labels):
    # pie chart of counts in/not in group
    c = sameGrp_counts.loc[sameGrp_counts.importanceRank == 'top10', 'count'][0]
    df_counts = pd.Series({labels[0]: c,
                           labels[1]: feat_summary_annot.shape[0] - c})
    plotCountsPie(df_counts,
                  None,
                  fnames[0],
                  dir_out)

    # pie chart of feature source
    c = sameGrp_src_counts.loc[sameGrp_src_counts.importanceRank == 'top10',]
    df_counts = pd.Series(c['count'].values, index=c['source'])
    plotCountsPie(df_counts,
                  None,
                  fnames[1],
                  dir_out,
                  colors=[src_colors[s] for s in df_counts.index])

    #heatmap
    s1 = feat_summary_annot.columns.str.startswith('inSame')
    s2 = ~feat_summary_annot.columns.str.contains('top')
    df = feat_summary_annot.loc[:, s1 & s2]

    plt.figure()
    fig, ax = plt.subplots(figsize=(6, 5))
    ax = sns.heatmap(df, yticklabels=False, xticklabels=list(range(1, 11)), vmin=-1, vmax=1, cmap='RdBu', cbar=False)
    ax.set(xlabel='$\it{n}$th Feature', ylabel='Target genes')
    plt.tight_layout()
    plt.savefig("%s/%s_heatmap.png" % (dir_out, fnames[1]))
    plt.close()

In [45]:
# Paralog feature analyze
anlyz_varExp_feats_m(df_featSummary,gs_dir = '../out', outdir_sub='../out')

['FASN']
['FASN']
['ADSL']
['ACSL1']
['ACTR1B']
['XRN1']
['PAICS']
['ADSL']
['SLC4A7']
['ARNT']
['CHCHD4']
['EXOSC10']
['PKM']
['DYRK1A']
['RPIA']
['UBE2C']
['USP7']
['FKBPL']
['AP2S1']
['AP2M1']
['PAK2']
['SMARCB1']
['BRD7']
['BRAT1']
['AHR']
['ACTR2']
['WRB']
['PPAT']
['NBAS']
['TMX2']
['AK1']
['ATP5PB']
['ATP6AP2']
['ATP6AP1']
['ATP6V0E2']
['ACOT4']
['ATP6V1G2']
['USP22']
['PCGF1']
['UBA6']
['MAPK1']
['WDR73']
['TAF8']
['SMARCD1']
['FARSB']
['HUWE1']
['RINT1']
['SPATA5L1']
['CTGF']
['PDCD10']
['UMPS']
['KEAP1']
['G3BP1']
['RUNX1']
['WDR7']
['WDR83']
['FBXO42']
['MED16']
['CCND1']
['CCND2']
['NLRC3']
['CDK2']
['KPNB1']
['ANKRD65']
['C15orf41']
['AFDN']
['SGO1']
['PKN2']
['RTCB']
['SCAF8']
['CCNE1']
['CCDC82']
['CDK6']
['CCNC']
['TP53']
['SPINT1']
['SDC3']
['CASP8']
['SNRPB']
['CHMP2A']
['KDF1']
['VPS37A']
['TACC3']
['GNB1']
['CNOT10']
['NDUFS5']
['COG1']
['COG5']
['DET1']
['COPG2']
['ILK']
['STRAP']
['CNEP1R1']
['CTNND1']
['TCF7L2']
['UMPS']
['VHL']
['KEAP1']
['SCD']
['COX6C']
['CDC7

['NDUFB6']
['NDUFS5']
['NDUFS1']
['NDUFC1']
['NDUFS1']
['NDUFB5']
['NDUFA10']
['NDUFA8']
['NDUFA6']
['LZTR1']
['RNF146']
['CDC42']
['PAXIP1']
['RFK']
['PFKP']
['RBM19']
['ELP1']
['SHOC2']
['PPP1R14C']
['TAP1']
['TFRC']
['DNAJA2']
['OSTC']
['HIRA']
['GTPBP10']
['PPIEL']
['STAG2']
['PPAT']
['PARD3']
['DCDC2']
['STAG2']
['GAL3ST4']
['TAOK1']
['KLF2P1']
['CIT']
[nan]
['HACD4']
['CYC1']
['PAICS']
['PFDN5']
['RPE']
['TPI1']
['UGP2']
['SIN3B']
['PIK3CA']
['PGM3']
['CCDC115']
['DSCC1']
['GTPBP10']
['ZNF559']
['RPL37']
['RPP25']
['SYNE4']
['POLG']
['DHFR']
['NMNAT1']
['ZBTB47']
['MDM2']
['DNAJC7']
['PPP2CB']
['CWF19L2']
['MDM2']
['PPP6R3']
['CYB5R4']
['EPCAM']
['THAP1']
['PSMB8-AS1']
[nan]
['PSMG4']
['PSMG4']
['SECISBP2']
['COG6']
['UBOX5']
['PLEKHH3']
['FERMT2']
['TRMT61A']
['SOS1']
['UBAP1']
['LDLRAP1']
['DENND4C']
['BIK']
['RIC1']
['WASHC3']
['KRTCAP3']
['PTK2']
['RAD1']
['BRAF']
['PSMD7']
['LARS2']
[nan]
['DLX6-AS1']
['IPPK']
['GNA13']
['RAB6A']
['ARHGEF12']
['MLST8']
['RNASEH2B']
['MDC1']


['POLR2J']
['RNASEH2A']
['MED25']
['GMPS']
['PISD']
['KBTBD2']
['RSU1']
['PAICS']
['VPS37A']
[nan]
['FDX2']
['SDHA']
['IL1RN']
[nan]
['ELMO2']
['DMKN']
['ELMO2']
['RGP1']
['COL6A2']
['KATNA1']
['UBA6']
['ITGAV']
[nan]
['KIRREL1']
['TP53BP1']
['SLC52A3']
[nan]
[nan]
['TCF7L2']
['LAMTOR1']
['MRPS25']
['PPME1']
[nan]
['DLST']
['MRM3']
['MRPL17']
['ATXN2']
['H2AFX']
['MAGOHB']
['TRAF2']
['AFDN']
['AC110285.1']
['PTK2B']
[nan]
['MRPL39']
['TSPYL5']
['HMGCR']
['PRR12']
['SREBF1']
['SREBF1']
['BCL2L2']
['PPM1D']
['MED12']
['TAF5L']
['MED19']
['MED16']
['MED19']
['MED12']
['MED12']
['NCBP2']
['DRC3']
['MLLT10']
[nan]
['DPH5']
[nan]
[nan]
['LAMTOR4']
['DEPDC5']
['EMC3']
['CXCL16']
['MRPL23']
['MRPL17']
['MRPL46']
['MARS2']
['MRPL46']
['ATIC']
['MRPS24']
['MVK']
['DTYMK']
['RHOC']
['MYCNUT']
['DPYSL5']
['AC244452.3']
['ADSL']
['RAB18']
['BNIP1']
['TP53']
['RPS27L']
['BCL2L2-PABPN1']
['ILK']
['NDUFB8']
['NDUFC2']
['NDUFAF4']
['NDUFA6']
['NDUFA2']
['NDUFB10']
['NDUFS5']
['NDUFV2']
['NDUFAF4']
[nan

['SNORD130']
[nan]
['B3GALT6']
[nan]
[nan]
[nan]
[nan]
[nan]
['MSH5']
['POLL']
['CLPX']
[nan]
[nan]
['TIMM10']
['CALU']
['EXT2']
[nan]
['SERPINB13']
[nan]
[nan]
['AJUBA']
[nan]
['FGD4']
['FPGS']
[nan]
[nan]
['SLC7A5']
[nan]
['NFU1']
[nan]
['TFRC']
['SLC2A1']
[nan]
['SCD']
[nan]
['RAB25']
[nan]
['PDX1']
['MRPS5']
['ELOA3D']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['HMGCR']
['LINC02532']
['AC046185.3']
['GFM2']
[nan]
[nan]
[nan]
[nan]
[nan]
['ZNHIT3']
[nan]
[nan]
['OSTC']
['CRK']
['ADSL']
[nan]
[nan]
[nan]
['CAD']
['FBXO10']
[nan]
['AL035661.1']
['RNF223']
['FERMT2']
['VPS54']
['TGFB1I1']
[nan]
['SDC1']
[nan]
[nan]
['RB1CC1']
['ENSA']
['TMPRSS4']
[nan]
[nan]
[nan]
['VPS41']
['MRPL21']
['PPP6R3']
[nan]
['GLRX5']
[nan]
['MRPS24']
[nan]
['TRAF3IP2']
['LOC102724020']
[nan]
['TRIO']
['ROPN1B']
[nan]
[nan]
['MRPL17']
[nan]
['ADSL']
[nan]
['EMC3']
[nan]
['KLHDC3']
['MDM2']
['MED16']
['MED13L']
['MED1']
['MED24']
['MED13L']
['MED1']
['MED15']
[nan]
['USP43']
['MEAF6']
[nan]
['OXSM']
[nan]
[nan]
['ND

[nan]
[nan]
[nan]
['VIL1']
['IMPDH2']
[nan]
[nan]
[nan]
['PET117']
[nan]
[nan]
[nan]
['MANF']
[nan]
['ADSL']
['SDHB']
[nan]
[nan]
[nan]
[nan]
[nan]
['TAF5L']
[nan]
[nan]
['PPAT']
['ROPN1B']
[nan]
[nan]
[nan]
[nan]
[nan]
['HSD17B10']
[nan]
[nan]
['PGK1']
['DIRAS1']
['ADGRF1']
[nan]
['TTTY14']
[nan]
[nan]
['ARHGAP35']
[nan]
[nan]
[nan]
['MBTPS1']
[nan]
['ATP6AP1']
['GPRC5C']
[nan]
[nan]
['LOC101929431']
[nan]
['B3GAT3']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['IL16']
[nan]
[nan]
[nan]
[nan]
['AL118505.1']
['SYNC']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['ATIC']
[nan]
[nan]
['UGP2']
[nan]
['NDUFA10']
[nan]
['ADSS']
[nan]
[nan]
[nan]
[nan]
['AC105219.3']
[nan]
['UBL3']
['EARS2']
['LINC01902']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['PMVK']
['KLHL6-AS1']
[nan]
['MRPL10']
[nan]
[nan]
[nan]
[nan]
[nan]
['SLC25A42']
[nan]
[nan]
['PIK3CA']
['RAPGEF1']
['GART']
[nan]
[nan]
[nan]
['ADSS']
[nan]
[nan]
['RAC1']
[nan]
[nan]
[nan]
['LOXL2']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['AC108860.2']
[nan]
[nan]


[nan]
[nan]
[nan]
[nan]
['NDUFB9']
['NDUFS1']
['NDUFB9']
['NDUFB3']
['NDUFAF4']
['NDUFA2']
['NDUFC1']
['NDUFB8']
['NDUFA10']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['DPH3']
[nan]
[nan]
[nan]
['ADSL']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['DHFR']
[nan]
['BASP1P1']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['NDUFV2']
['NAMPT']
[nan]
[nan]
[nan]
['PAICS']
[nan]
[nan]
[nan]
[nan]
[nan]
['IBA57']
[nan]
[nan]
[nan]
[nan]
[nan]
['UMPS']
['CAD']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['IKBIP']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['TAOK1']
[nan]
[nan]
[nan]
['MRPS24']
[nan]
[nan]
['TRPM7']
[nan]
[nan]
[nan]
[nan]
[nan]
['RHNO1']
[nan]
[nan]
['EIF2S3']
['CAD']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['TFRC']
['UMPS']
[nan]
[nan]
[nan]
[nan]
['PDGFC']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[n

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [46]:
# on same gene
feat_summary_annot_para = pd.read_csv('../out/inparaloglist/feat_summary_annot.csv', header=0, index_col=0)
sameGrp_counts, sameGrp_src_counts = getGrpCounts_fromFeatSummaryAnnot(feat_summary_annot_para)
gen_feat_pies(sameGrp_counts,sameGrp_src_counts,feat_summary_annot_para,
              dir_out, ['fig2-para',  'fig2-para'], ['Are paralog','Not paralog'])

<Figure size 432x288 with 0 Axes>