In [None]:
import pandas as pd
import numpy as np
import hidef
import os
import difflib
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from scipy.stats.stats import pearsonr, spearmanr

In [None]:
path = 'Input_Data/Feature_Files/' #Replace by path to feature files

In [None]:
keep_files = []
for f in [x for x in os.listdir(path) if x.startswith("synsig1720")]:
    if f == 'synsig1720_normalized_pFAM_domain.csv' or f == 'synsig1720_normalized_mentha_source_feature.csv' or f == 'synsig1720_normalized_chr_no_source_feature.csv':
        continue
    file = pd.read_csv(path+f)
    if len(file['Genes'].unique()) == len(file) == 1720:
        keep_files.append(f)
    #print(f, len(file))

In [None]:
len(keep_files)

48

In [None]:
pearson_features = ['cerebral_cortex_hpa_isoform_exp', 'colon_hpa_isoform_exp', 'ovary_hpa_isoform_exp', 'breast_hpa_isoform_exp', 'lung_hpa_isoform_exp', 'salivary gland_hpa_isoform_exp', 'seminal vesicle_hpa_isoform_exp',
    'lymph node_hpa_isoform_exp', 'placenta_hpa_isoform_exp', 'kidney_hpa_isoform_exp', 'cervix, uterine_hpa_isoform_exp', 'adrenal gland_hpa_isoform_exp', 'thyroid gland_hpa_isoform_exp', 
    'stomach 1_hpa_isoform_exp', 'gallbladder_hpa_isoform_exp', 'duodenum_hpa_isoform_exp', 'fallopian tube_hpa_isoform_exp','endometrium 1_hpa_isoform_exp', 'skin 1_hpa_isoform_exp', 
    'spleen_hpa_isoform_exp', 'gtex_rna_tissue_expression', 'gtex_no_brain_exp', 'appendix_hpa_isoform_exp', 'heart muscle_hpa_isoform_exp', 'small intestine_hpa_isoform_exp', 'epididymis_hpa_isoform_exp', 'testis_hpa_isoform_exp',
    'liver_hpa_isoform_exp', 'esophagus_hpa_isoform_exp', 'urinary bladder_hpa_isoform_exp', 'skeletal muscle_hpa_isoform_exp', 'tonsil_hpa_isoform_exp', 'prostate_hpa_isoform_exp', 
    'parathyroid gland_hpa_isoform_exp','adipose tissue_hpa_isoform_exp', 'smooth muscle_hpa_isoform_exp', 'rectum_hpa_isoform_exp', 'bone marrow_hpa_isoform_exp', 
    'HIP_RNA', 'DFC_RNA', 'V1C_RNA', 'AMY_RNA', 'MD_RNA', 'STR_RNA', 'CBC_RNA']
    
subtraction_features=[ 'Phosphosite_hu_no', 'qPhos_site_number', 'Ensembl_isoform_no', 'Ensembl_aa_length', 'pFAM_domain_number', 'protein_mass', "trans_count", 'gc_content', 'trans_len', 'gene_length', 'exon_no', 'cds_length']
jaccard_features=['chr_no_source_feature', 'pFAM_domain', 'mentha_source_feature']
kernel_features=['mentha_kernel', 'bioplex_kernel']

In [None]:
len(subtraction_features)

12

In [None]:
dict_feature_to_type = defaultdict(str)
for p in pearson_features:
    dict_feature_to_type[p] = 'pearson'
for s in subtraction_features:
    dict_feature_to_type[s] = 'subtraction'
for j in jaccard_features:
    dict_feature_to_type[j] = 'jaccard'
for k in kernel_features:
    dict_feature_to_type[k] = 'kernel'

In [None]:
dict_feature_to_type

defaultdict(str,
            {'cerebral_cortex_hpa_isoform_exp': 'pearson',
             'colon_hpa_isoform_exp': 'pearson',
             'ovary_hpa_isoform_exp': 'pearson',
             'breast_hpa_isoform_exp': 'pearson',
             'lung_hpa_isoform_exp': 'pearson',
             'salivary gland_hpa_isoform_exp': 'pearson',
             'seminal vesicle_hpa_isoform_exp': 'pearson',
             'lymph node_hpa_isoform_exp': 'pearson',
             'placenta_hpa_isoform_exp': 'pearson',
             'kidney_hpa_isoform_exp': 'pearson',
             'cervix, uterine_hpa_isoform_exp': 'pearson',
             'adrenal gland_hpa_isoform_exp': 'pearson',
             'thyroid gland_hpa_isoform_exp': 'pearson',
             'stomach 1_hpa_isoform_exp': 'pearson',
             'gallbladder_hpa_isoform_exp': 'pearson',
             'duodenum_hpa_isoform_exp': 'pearson',
             'fallopian tube_hpa_isoform_exp': 'pearson',
             'endometrium 1_hpa_isoform_exp': 'pearson',
       

In [None]:
dict_file_to_type = defaultdict(str)
for f in keep_files:
    
    f_ = f.replace('synsig1720_normalized_', '')
    
    type_ = dict_feature_to_type[difflib.get_close_matches(f_, list(dict_feature_to_type.keys()))[0]]
    #print(dict_feature_to_type[difflib.get_close_matches(f_, list(dict_feature_to_type.keys()))])
    print(f, type_)
    dict_file_to_type[f] = type_

synsig1720_normalized_lymph node_hpa_isoform_exp.csv pearson
synsig1720_normalized_skeletal muscle_hpa_isoform_exp.csv pearson
synsig1720_normalized_placenta_hpa_isoform_exp.csv pearson
synsig1720_normalized_gc_content.csv subtraction
synsig1720_normalized_Ensembl_isoform_no.csv subtraction
synsig1720_normalized_rectum_hpa_isoform_exp.csv pearson
synsig1720_normalized_prostate_hpa_isoform_exp.csv pearson
synsig1720_normalized_cervix, uterine_hpa_isoform_exp.csv pearson
synsig1720_normalized_stomach 1_hpa_isoform_exp.csv pearson
synsig1720_normalized_parathyroid gland_hpa_isoform_exp.csv pearson
synsig1720_normalized_trans_count.csv subtraction
synsig1720_normalized_breast_hpa_isoform_exp.csv pearson
synsig1720_normalized_pFAM_domain_number.csv subtraction
synsig1720_normalized_appendix_hpa_isoform_exp.csv pearson
synsig1720_normalized_ovary_hpa_isoform_exp.csv pearson
synsig1720_normalized_gtex_rna_tissue_expression.csv pearson
synsig1720_normalized_protein_mass.csv subtraction
synsig1

In [None]:
dict_file_to_type

defaultdict(str,
            {'synsig1720_normalized_lymph node_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_skeletal muscle_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_placenta_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_gc_content.csv': 'subtraction',
             'synsig1720_normalized_Ensembl_isoform_no.csv': 'subtraction',
             'synsig1720_normalized_rectum_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_prostate_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_cervix, uterine_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_stomach 1_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_parathyroid gland_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_trans_count.csv': 'subtraction',
             'synsig1720_normalized_breast_hpa_isoform_exp.csv': 'pearson',
             'synsig1720_normalized_pFAM_domai

In [None]:
columns = ['Gene1', 'Gene2'] + list([ x.replace('synsig1720_normalized_', '').replace('.csv', '') for x in dict_file_to_type.keys()])

In [None]:
columns

['Gene1',
 'Gene2',
 'lymph node_hpa_isoform_exp',
 'skeletal muscle_hpa_isoform_exp',
 'placenta_hpa_isoform_exp',
 'gc_content',
 'Ensembl_isoform_no',
 'rectum_hpa_isoform_exp',
 'prostate_hpa_isoform_exp',
 'cervix, uterine_hpa_isoform_exp',
 'stomach 1_hpa_isoform_exp',
 'parathyroid gland_hpa_isoform_exp',
 'trans_count',
 'breast_hpa_isoform_exp',
 'pFAM_domain_number',
 'appendix_hpa_isoform_exp',
 'ovary_hpa_isoform_exp',
 'gtex_rna_tissue_expression',
 'protein_mass',
 'smooth muscle_hpa_isoform_exp',
 'spleen_hpa_isoform_exp',
 'cds_length',
 'adipose tissue_hpa_isoform_exp',
 'urinary bladder_hpa_isoform_exp',
 'seminal vesicle_hpa_isoform_exp',
 'tonsil_hpa_isoform_exp',
 'small intestine_hpa_isoform_exp',
 'Phosphosite_hu_no',
 'trans_len',
 'fallopian tube_hpa_isoform_exp',
 'epididymis_hpa_isoform_exp',
 'gallbladder_hpa_isoform_exp',
 'testis_hpa_isoform_exp',
 'kidney_hpa_isoform_exp',
 'endometrium 1_hpa_isoform_exp',
 'bone marrow_hpa_isoform_exp',
 'duodenum_hpa_is

In [None]:
len(columns)

50

In [None]:
dfs = {}
for f in keep_files:
    df = pd.read_csv(path+f)
    dfs[f] = df
    print(f, len(df))

synsig1720_normalized_lymph node_hpa_isoform_exp.csv 1720
synsig1720_normalized_skeletal muscle_hpa_isoform_exp.csv 1720
synsig1720_normalized_placenta_hpa_isoform_exp.csv 1720
synsig1720_normalized_gc_content.csv 1720
synsig1720_normalized_Ensembl_isoform_no.csv 1720
synsig1720_normalized_rectum_hpa_isoform_exp.csv 1720
synsig1720_normalized_prostate_hpa_isoform_exp.csv 1720
synsig1720_normalized_cervix, uterine_hpa_isoform_exp.csv 1720
synsig1720_normalized_stomach 1_hpa_isoform_exp.csv 1720
synsig1720_normalized_parathyroid gland_hpa_isoform_exp.csv 1720
synsig1720_normalized_trans_count.csv 1720
synsig1720_normalized_breast_hpa_isoform_exp.csv 1720
synsig1720_normalized_pFAM_domain_number.csv 1720
synsig1720_normalized_appendix_hpa_isoform_exp.csv 1720
synsig1720_normalized_ovary_hpa_isoform_exp.csv 1720
synsig1720_normalized_gtex_rna_tissue_expression.csv 1720
synsig1720_normalized_protein_mass.csv 1720
synsig1720_normalized_smooth muscle_hpa_isoform_exp.csv 1720
synsig1720_normal

In [None]:
synsig_genes = dfs[f].Genes.values

In [None]:
synsig_genes

array(['AAK1', 'ABCA2', 'ABCC8', ..., 'ZDHHC17', 'ZDHHC5', 'ZFYVE9'],
      dtype=object)

In [None]:
df_scores = pd.DataFrame(columns = columns)

g1 = []
g2 = []
for i in range(len(synsig_genes)):
    for j in range(i+1, len(synsig_genes)):
        g1.append(synsig_genes[i])
        g2.append(synsig_genes[j])

        
df_scores['Gene1'] = g1
df_scores['Gene2'] = g2

In [None]:
df_scores

Unnamed: 0,Gene1,Gene2,lymph node_hpa_isoform_exp,skeletal muscle_hpa_isoform_exp,placenta_hpa_isoform_exp,gc_content,Ensembl_isoform_no,rectum_hpa_isoform_exp,prostate_hpa_isoform_exp,"cervix, uterine_hpa_isoform_exp",...,gene_length,lung_hpa_isoform_exp,heart muscle_hpa_isoform_exp,Ensembl_aa_length,skin 1_hpa_isoform_exp,salivary gland_hpa_isoform_exp,colon_hpa_isoform_exp,thyroid gland_hpa_isoform_exp,adrenal gland_hpa_isoform_exp,qPhos_site_number
0,AAK1,ABCA2,,,,,,,,,...,,,,,,,,,,
1,AAK1,ABCC8,,,,,,,,,...,,,,,,,,,,
2,AAK1,ABCE1,,,,,,,,,...,,,,,,,,,,
3,AAK1,ABHD17A,,,,,,,,,...,,,,,,,,,,
4,AAK1,ABHD17B,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478335,ZC3HAV1,ZDHHC5,,,,,,,,,...,,,,,,,,,,
1478336,ZC3HAV1,ZFYVE9,,,,,,,,,...,,,,,,,,,,
1478337,ZDHHC17,ZDHHC5,,,,,,,,,...,,,,,,,,,,
1478338,ZDHHC17,ZFYVE9,,,,,,,,,...,,,,,,,,,,


## Combining features using Pearson/Subtraction/Jaccard:

In [None]:
# EXPENSIVE OPERATION- no need to run. I have already created the pairwise combined features df named as
# df_combined_scores_final

In [None]:
'''
for n, f in enumerate(keep_files):
    
    #f = f.replace('synsig1233_normalized_', '')
    df = dfs[f]
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
    df = df.loc[:, ~df.columns.str.startswith('Norm_Symbol')]
    
    
    
    scores = []
    col_names = df.columns.values.tolist()
    col_names.remove('Genes')
    print(n, ' : ', f, dict_file_to_type[f], type(df.iloc[0][col_names].values), col_names)
    print(df.head())
    if dict_file_to_type[f] == 'pearson':
        
        for i in range(len(synsig_genes)):
            if i %100 == 0:
                print(i, scores[-10:])
            for j in range(i+1, len(synsig_genes)):
                #print(df.iloc[i].Genes, df.loc[j].Genes)
                v1 = df.iloc[i][col_names].values
                v2 = df.iloc[j][col_names].values
                if len(v1) == 0 or len(v2) == 0:
                    scores.append(0)
                else:
                    scores.append(pearsonr(v1, v2)[0])
                
    
    elif dict_file_to_type[f] == 'subtraction':
        #print(type(df.iloc[i][col_names].values[0]))
        for i in range(len(synsig_genes)):
            if i %100 == 0:
                print(i, scores[-10:])
            for j in range(i+1, len(synsig_genes)):
                #print(df.iloc[i].Genes, df.iloc[j].Genes)
                v1 = df.iloc[i][col_names].values[0]
                v2 = df.iloc[j][col_names].values[0]
                if v1 == None or v2 == None or v1== np.nan or v2 == np.nan:
                    scores.append(0)
                else:
                    scores.append(abs(v1 -v2))
            

                
    elif dict_file_to_type[f] == 'jaccard':
        
        for i in range(len(synsig_genes)):
            if i %100 == 0:
                print(i, scores[-10:])
            for j in range(i+1, len(synsig_genes)):
                v1 = set(df.iloc[i][col_names])
                v2 = set(df.iloc[j][col_names])
                if len(v1) == 0 or len(v2) ==0:
                    scores.append(0)
                else:
                    scores.append(float(len(v1.intersection(v2)) / len(v1.union(v2))))
                
    
    
    print(f, 'DONE')
    print('-'*100)
    df_scores[f.replace('synsig1720_normalized_', '').replace('.csv','')] = scores
    
df = df_scores.to_csv('df_combined_scores_final.csv', index =False)
'''

0  :  synsig1720_normalized_lymph node_hpa_isoform_exp.csv pearson <class 'numpy.ndarray'> ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
     Genes          0         1         2         3         4         5  \
0     AAK1   8.557520  4.965836  2.061052  1.481046  0.646388  0.552694   
1    ABCA2   4.909448  4.448198  4.238102  2.579962  2.423234  1.411910   
2    ABCC8   0.169585  0.065895  0.049322  0.048913  0.047664  0.032208   
3    ABCE1  38.718300  6.581758  3.631327  1.920452  0.77811



KeyboardInterrupt: 

In [3]:
df

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,"cervix, uterine_hpa_isoform_exp.csv",spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score
0,AAK1,ABCA2,0.965849,24.71,194664.0,0.966208,0.973625,0.813203,788.400000,897.000000,...,0.955330,0.957260,0.989616,11.076923,0.941387,0,0.966947,0.873830,0.985086,9.474456e-06
1,AAK1,ABCC8,0.948068,8.07,131971.0,0.849677,0.951296,0.774551,113.525000,2913.590909,...,0.858091,0.911618,0.993592,15.799145,0.936537,0,0.976120,0.958711,0.820609,2.275136e-07
2,AAK1,ABCE1,0.902037,7.21,185116.0,0.910468,0.917700,0.854478,230.250000,4159.000000,...,0.868469,0.852064,0.924167,0.776923,0.903924,1,0.924981,0.925420,0.882363,7.591042e-06
3,AAK1,ABHD17A,0.953884,23.21,207615.0,0.990977,0.985408,0.948779,340.750000,5006.833333,...,0.884200,0.985801,0.983851,2.589744,0.979764,1,0.953165,0.975387,0.970386,1.482897e-06
4,AAK1,ABHD17B,0.801094,3.97,167873.0,0.882328,0.874644,0.909778,284.000000,4532.500000,...,0.810877,0.879425,0.873878,2.423077,0.808666,1,0.879305,0.871912,0.833293,1.454431e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478335,ZC3HAV1,ZDHHC5,0.994175,1.97,32770.0,0.983448,0.967616,0.979307,338.650000,2319.900000,...,0.973729,0.975584,0.993708,3.625000,0.971927,3,0.910172,0.973318,0.955837,1.890174e-05
1478336,ZC3HAV1,ZFYVE9,0.997563,2.57,138387.0,0.998993,0.983044,0.983673,683.083333,722.833333,...,0.996603,0.998251,0.997564,2.550000,0.992429,1,0.968362,0.960276,0.992455,1.116394e-05
1478337,ZDHHC17,ZDHHC5,0.919718,9.27,56673.0,0.967958,0.884244,0.908784,237.100000,304.066667,...,0.960717,0.937776,0.925313,0.437500,0.936424,1,0.909210,0.936352,0.938986,4.551768e-06
1478338,ZDHHC17,ZFYVE9,0.958484,4.73,114484.0,0.993647,0.909834,0.972343,1258.833333,2738.666667,...,0.991451,0.977793,0.962664,6.612500,0.965084,1,0.955141,0.914600,0.977961,8.688810e-05
