In [1]:
import pandas as pd
import numpy as np
import hidef
import os
import difflib
from collections import defaultdict
from sklearn import metrics
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from scipy.stats.stats import pearsonr, spearmanr
import requests
import gzip
import networkx as nx
import sys
import ddot
from ddot import Ontology

In [2]:
ref_ont = pd.read_csv('Input_Data/Ref_Ont/Reference_Syngo_CC_ontology.txt', sep = '\t') # PATH to Reference ont.
ref_ont['EdgeType'] = ref_ont['EdgeType'].map(lambda x: 'default' if x == 'Child-Parent' else 'gene')

In [4]:
ref_ont

Unnamed: 0,Parent,Child,EdgeType
0,synapse,synaptic membrane,default
1,synaptic membrane,anchored component of synaptic membrane,default
2,synaptic membrane,extrinsic component of synaptic membrane,default
3,synaptic membrane,integral component of synaptic membrane,default
4,synapse,presynapse,default
...,...,...,...
1761,postsynapse,NGDN,gene
1762,presynaptic mitochondria,integral component of postsynaptic mitochondri...,default
1763,postsynaptic mitochondria,integral component of presynaptic mitochondria...,default
1764,perisynaptic space,perisynaptic extracellular matrix,default


In [5]:
ref_ont_ddot = Ontology.from_table(
        table= ref_ont,
        parent=0,
        child=1,
        is_mapping=lambda x: x[2]=='gene',
)



In [6]:
ref_ont_ddot

974 genes, 151 terms, 1462 gene-term relations, 152 term-term relations
node_attributes: []
edge_attributes: []

In [7]:
ref_ont_ddot.genes

['AAK1',
 'ABHD17A',
 'ABHD17B',
 'ABHD6',
 'ABI1',
 'ABI2',
 'ABL1',
 'ABL2',
 'ABLIM3',
 'ABR',
 'ACAN',
 'ACHE',
 'ACTB',
 'ACTBL2',
 'ACTC1',
 'ACTG1',
 'ACTN2',
 'ACTR2',
 'ACTR3',
 'ADAM10',
 'ADAM22',
 'ADAM23',
 'ADCY1',
 'ADCY8',
 'ADD1',
 'ADD2',
 'ADD3',
 'ADGRA1',
 'ADGRB1',
 'ADGRB3',
 'ADGRL1',
 'ADGRL2',
 'ADORA1',
 'ADORA2A',
 'ADORA2B',
 'ADORA3',
 'ADRA1A',
 'ADRA2A',
 'ADRA2C',
 'ADRB2',
 'AGAP2',
 'AGAP3',
 'AGRN',
 'AKAP1',
 'AKAP5',
 'AKAP9',
 'AKR1A1',
 'ALDOC',
 'ALK',
 'AMPH',
 'ANKS1B',
 'ANO1',
 'ANO2',
 'ANXA1',
 'ANXA5',
 'AP1G1',
 'AP3D1',
 'APBA1',
 'APBB1',
 'APC2',
 'APH1A',
 'APOA4',
 'APOE',
 'APP',
 'ARC',
 'ARF1',
 'ARFGAP1',
 'ARFGAP3',
 'ARFGEF2',
 'ARHGAP39',
 'ARHGAP44',
 'ARHGDIA',
 'ARHGEF15',
 'ARHGEF2',
 'ARHGEF9',
 'ARPC2',
 'ARPC5L',
 'ATAD1',
 'ATG9A',
 'ATP1A3',
 'ATP2B1',
 'ATP2B2',
 'ATP2B3',
 'ATP2B4',
 'ATP6AP1',
 'ATP6V0A1',
 'ATP6V0A4',
 'ATP6V0C',
 'ATP6V0D1',
 'ATP6V1B1',
 'ATP6V1B2',
 'ATP6V1C1',
 'ATP6V1D',
 'ATP6V1E1',
 'ATP6V

In [8]:
ref_ont_ddot_matrix = ref_ont_ddot.flatten()[0]
ref_ont_genes_ddot = ref_ont_ddot.flatten()[1]

In [9]:
rows = []
for r in range(len(ref_ont_ddot_matrix)):
    for c in range(r+1, len(ref_ont_ddot_matrix)):
        rows.append([ref_ont_genes_ddot[r], ref_ont_genes_ddot[c], ref_ont_ddot_matrix[r][c]])

In [10]:
rows

[['AAK1', 'ABHD17A', -0.0],
 ['AAK1', 'ABHD17B', -0.0],
 ['AAK1', 'ABHD6', -0.0],
 ['AAK1', 'ABI1', -0.0],
 ['AAK1', 'ABI2', -0.0],
 ['AAK1', 'ABL1', -0.0],
 ['AAK1', 'ABL2', -0.0],
 ['AAK1', 'ABLIM3', -0.0],
 ['AAK1', 'ABR', -0.0],
 ['AAK1', 'ACAN', -0.0],
 ['AAK1', 'ACHE', -0.0],
 ['AAK1', 'ACTB', -0.0],
 ['AAK1', 'ACTBL2', -0.0],
 ['AAK1', 'ACTC1', -0.0],
 ['AAK1', 'ACTG1', 1.0089147],
 ['AAK1', 'ACTN2', -0.0],
 ['AAK1', 'ACTR2', -0.0],
 ['AAK1', 'ACTR3', -0.0],
 ['AAK1', 'ADAM10', -0.0],
 ['AAK1', 'ADAM22', -0.0],
 ['AAK1', 'ADAM23', 1.0089147],
 ['AAK1', 'ADCY1', -0.0],
 ['AAK1', 'ADCY8', 1.0089147],
 ['AAK1', 'ADD1', 1.0089147],
 ['AAK1', 'ADD2', 1.0089147],
 ['AAK1', 'ADD3', 1.0089147],
 ['AAK1', 'ADGRA1', -0.0],
 ['AAK1', 'ADGRB1', -0.0],
 ['AAK1', 'ADGRB3', -0.0],
 ['AAK1', 'ADGRL1', 1.0089147],
 ['AAK1', 'ADGRL2', -0.0],
 ['AAK1', 'ADORA1', 1.0089147],
 ['AAK1', 'ADORA2A', 1.0089147],
 ['AAK1', 'ADORA2B', -0.0],
 ['AAK1', 'ADORA3', -0.0],
 ['AAK1', 'ADRA1A', 1.0089147],
 ['AA

In [11]:
df_syngo_similarity = pd.DataFrame(rows, columns = ['Gene1', 'Gene2', 'Similarity'])
df_syngo_similarity['Similarity'] = df_syngo_similarity['Similarity'].apply(lambda x: abs(x))
df_syngo_similarity

Unnamed: 0,Gene1,Gene2,Similarity
0,AAK1,ABHD17A,0.000000
1,AAK1,ABHD17B,0.000000
2,AAK1,ABHD6,0.000000
3,AAK1,ABI1,0.000000
4,AAK1,ABI2,0.000000
...,...,...,...
473846,YWHAH,YWHAZ,0.000000
473847,YWHAH,ZDHHC5,0.000000
473848,YWHAQ,YWHAZ,0.000000
473849,YWHAQ,ZDHHC5,0.000000


In [12]:
df_syngo = pd.read_excel('Input_Data/Syngo_CC/syngo_ontologies.xlsx')
df_syngo = df_syngo[df_syngo['GO domain'] == 'BP']
df_syngo

Unnamed: 0,GO term ID,GO domain,GO term name,GO term name - hierarchical structure,GO parent term ID,genes - hgnc_id,genes - hgnc_symbol
150,SYNGO:synprocess,BP,process in the synapse,process in the synapse,,HGNC:1774;HGNC:17072;HGNC:3953;HGNC:2159;HGNC:...,CDK5;ERC1;NCS1;CNR1;MARCKSL1;P2RY1;P2RY2;ADORA...
151,SYNGO:presynprocess,BP,process in the presynapse,├─ process in the presynapse,SYNGO:synprocess,HGNC:1774;HGNC:17072;HGNC:3953;HGNC:2159;HGNC:...,CDK5;ERC1;NCS1;CNR1;MARCKSL1;P2RY1;P2RY2;ADORA...
152,GO:0099509,BP,regulation of presynaptic cytosolic calcium le...,│ ├─ regulation of presynaptic cytosolic c...,SYNGO:presynprocess,HGNC:1774;HGNC:17072;HGNC:3953;HGNC:2159;HGNC:...,CDK5;ERC1;NCS1;CNR1;MARCKSL1;P2RY1;P2RY2;ADORA...
153,GO:1905056,BP,ATPase-coupled calcium ion transmembrane trans...,│ │ ├─ ATPase-coupled calcium ion transme...,GO:0099509,HGNC:816,ATP2B3
154,GO:1905054,BP,calcium-induced calcium release activity invol...,│ │ ├─ calcium-induced calcium release ac...,GO:0099509,,
...,...,...,...,...,...,...,...
407,GO:0098928,BP,presynaptic signaling pathway,├─ presynaptic signaling pathway,SYNGO:pathway,HGNC:2494,CTBP1
408,GO:0099526,BP,presynapse to nucleus signaling pathway,│ └─ presynapse to nucleus signaling pa...,GO:0098928,HGNC:2494,CTBP1
409,GO:0098926,BP,postsynaptic signaling pathway,└─ postsynaptic signaling pathway,SYNGO:pathway,HGNC:712;HGNC:7765;HGNC:1464;HGNC:16062;HGNC:1...,ARRB2;NF1;CAMK4;CRTC1;ABI1;ANKS1B;RNF10;STAT3;...
410,GO:0099527,BP,postsynapse to nucleus signaling pathway,└─ postsynapse to nucleus signaling p...,GO:0098926,HGNC:1464;HGNC:16062;HGNC:11320;HGNC:24600;HGN...,CAMK4;CRTC1;ABI1;ANKS1B;RNF10;STAT3;NSMF;HTT;K...


In [15]:
df_syngo[['GO term ID', 'GO parent term ID', 'genes - hgnc_symbol']]
df_syngo = df_syngo.fillna('')

In [16]:
syngo_genes = []
for i, genelist in df_syngo['genes - hgnc_symbol'].items():
    for g in genelist.split(';'):
        if genelist != '':
            syngo_genes.append(g)

syngo_genes.sort()
syngo_genes = np.unique(syngo_genes)
syngo_genes

array(['ABHD17A', 'ABHD17B', 'ABHD17C', 'ABHD6', 'ABI1', 'ABI2', 'ABI3',
       'ABL1', 'ABL2', 'ABR', 'ACHE', 'ACTB', 'ACTG1', 'ACTN1', 'ACTR3',
       'ADAM10', 'ADAM22', 'ADCY1', 'ADCY8', 'ADD2', 'ADGRB1', 'ADGRB3',
       'ADGRL2', 'ADGRL3', 'ADORA1', 'ADORA2A', 'ADORA2B', 'ADORA3',
       'ADRA1A', 'ADRA2A', 'ADRB1', 'ADRB2', 'AGAP3', 'AGRN', 'AKAP12',
       'AKAP5', 'AKAP7', 'AKAP9', 'ALK', 'AMPH', 'ANKS1B', 'AP1G1',
       'AP1S2', 'AP2B1', 'AP2M1', 'AP3D1', 'APBA1', 'APBA2', 'APBB1',
       'APOE', 'APP', 'ARC', 'ARF1', 'ARF4', 'ARF6', 'ARHGAP22',
       'ARHGAP39', 'ARHGAP44', 'ARHGEF15', 'ARHGEF7', 'ARHGEF9',
       'ARL6IP5', 'ARL8B', 'ARRB2', 'ATAD1', 'ATP2B2', 'ATP2B3',
       'ATP6AP1', 'ATP6V0A1', 'ATP6V0A4', 'ATP6V0C', 'ATP6V0D1',
       'ATP6V1B1', 'ATP6V1B2', 'ATP6V1C1', 'ATP6V1D', 'ATP6V1E1',
       'ATP6V1F', 'ATP6V1G1', 'ATP6V1G2', 'ATP6V1H', 'BACE1', 'BAIAP2',
       'BAIAP3', 'BCAN', 'BCR', 'BDNF', 'BEGAIN', 'BIN1', 'BSN', 'BTBD9',
       'C1QL1', 'C1QL2', 'C1QL

In [17]:
len(syngo_genes)

803

In [3]:
df_combined_scores_1720 = pd.read_csv('df_combined_scores_final.csv')

In [4]:
df_combined_scores_1720

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,"cervix, uterine_hpa_isoform_exp.csv",spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score
0,AAK1,ABCA2,0.965849,24.71,194664.0,0.966208,0.973625,0.813203,788.400000,897.000000,...,0.955330,0.957260,0.989616,11.076923,0.941387,0,0.966947,0.873830,0.985086,9.474456e-06
1,AAK1,ABCC8,0.948068,8.07,131971.0,0.849677,0.951296,0.774551,113.525000,2913.590909,...,0.858091,0.911618,0.993592,15.799145,0.936537,0,0.976120,0.958711,0.820609,2.275136e-07
2,AAK1,ABCE1,0.902037,7.21,185116.0,0.910468,0.917700,0.854478,230.250000,4159.000000,...,0.868469,0.852064,0.924167,0.776923,0.903924,1,0.924981,0.925420,0.882363,7.591042e-06
3,AAK1,ABHD17A,0.953884,23.21,207615.0,0.990977,0.985408,0.948779,340.750000,5006.833333,...,0.884200,0.985801,0.983851,2.589744,0.979764,1,0.953165,0.975387,0.970386,1.482897e-06
4,AAK1,ABHD17B,0.801094,3.97,167873.0,0.882328,0.874644,0.909778,284.000000,4532.500000,...,0.810877,0.879425,0.873878,2.423077,0.808666,1,0.879305,0.871912,0.833293,1.454431e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478335,ZC3HAV1,ZDHHC5,0.994175,1.97,32770.0,0.983448,0.967616,0.979307,338.650000,2319.900000,...,0.973729,0.975584,0.993708,3.625000,0.971927,3,0.910172,0.973318,0.955837,1.890174e-05
1478336,ZC3HAV1,ZFYVE9,0.997563,2.57,138387.0,0.998993,0.983044,0.983673,683.083333,722.833333,...,0.996603,0.998251,0.997564,2.550000,0.992429,1,0.968362,0.960276,0.992455,1.116394e-05
1478337,ZDHHC17,ZDHHC5,0.919718,9.27,56673.0,0.967958,0.884244,0.908784,237.100000,304.066667,...,0.960717,0.937776,0.925313,0.437500,0.936424,1,0.909210,0.936352,0.938986,4.551768e-06
1478338,ZDHHC17,ZFYVE9,0.958484,4.73,114484.0,0.993647,0.909834,0.972343,1258.833333,2738.666667,...,0.991451,0.977793,0.962664,6.612500,0.965084,1,0.955141,0.914600,0.977961,8.688810e-05


In [18]:
training = pd.merge(df_combined_scores_1720, df_syngo_similarity, how="inner", on=["Gene1", "Gene2"])
training

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score,Similarity
0,AAK1,ABHD17A,0.953884,23.21,207615.0,0.990977,0.985408,0.948779,340.750000,5006.833333,...,0.985801,0.983851,2.589744,0.979764,1,0.953165,0.975387,0.970386,0.000001,0.000000
1,AAK1,ABHD17B,0.801094,3.97,167873.0,0.882328,0.874644,0.909778,284.000000,4532.500000,...,0.879425,0.873878,2.423077,0.808666,1,0.879305,0.871912,0.833293,0.000001,0.000000
2,AAK1,ABI1,0.969118,3.51,101858.0,0.988954,0.950733,0.979989,123.750000,3230.916667,...,0.953674,0.971893,2.538462,0.968425,0,0.919675,0.955133,0.983546,0.000005,0.000000
3,AAK1,ABI2,0.916182,2.45,96863.0,0.937884,0.962939,0.889408,293.029412,1341.600000,...,0.851155,0.902074,0.131410,0.932659,1,0.961966,0.915923,0.892078,0.000005,0.000000
4,AAK1,ABL1,0.934013,2.52,42623.0,0.978149,0.942251,0.957479,206.500000,3184.166667,...,0.948257,0.957789,1.076923,0.926499,2,0.955248,0.979460,0.944632,0.000058,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220111,YWHAH,YWHAZ,0.850606,4.02,23784.0,0.890810,0.986038,0.760589,47.986842,319.833333,...,0.828059,0.950628,2.326087,0.949402,0,0.959916,0.912606,0.886788,0.001101,0.000000
220112,YWHAH,ZDHHC5,0.979230,0.49,20354.0,0.983771,0.869575,0.998791,253.850000,458.266667,...,0.994938,0.890937,4.125000,0.940737,0,0.915946,0.986189,0.981532,0.000018,0.000000
220113,YWHAQ,YWHAZ,0.772859,0.55,10165.0,0.821083,0.848605,0.797151,35.263158,4.833333,...,0.754610,0.816318,0.426087,0.877555,0,0.820303,0.857875,0.844242,0.001186,0.000000
220114,YWHAQ,ZDHHC5,0.997364,5.06,13595.0,0.998587,0.996053,0.995017,170.600000,143.266667,...,0.998440,0.995044,2.225000,0.994055,0,0.997325,0.996946,0.993390,0.000023,0.000000


In [19]:
testing = []
for i, row in df_combined_scores_1720.iterrows():
    if row['Gene1'] in ref_ont_genes_ddot and row['Gene2'] in ref_ont_genes_ddot:
        continue
    else:
        testing.append(row)

In [20]:
testing = pd.DataFrame(testing, columns = df_combined_scores_1720.columns)
testing['Similarity'] = 0
testing

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score,Similarity
0,AAK1,ABCA2,0.965849,24.71,194664.0,0.966208,0.973625,0.813203,788.400000,897.000000,...,0.957260,0.989616,11.076923,0.941387,0,0.966947,0.873830,0.985086,9.474456e-06,0
1,AAK1,ABCC8,0.948068,8.07,131971.0,0.849677,0.951296,0.774551,113.525000,2913.590909,...,0.911618,0.993592,15.799145,0.936537,0,0.976120,0.958711,0.820609,2.275136e-07,0
2,AAK1,ABCE1,0.902037,7.21,185116.0,0.910468,0.917700,0.854478,230.250000,4159.000000,...,0.852064,0.924167,0.776923,0.903924,1,0.924981,0.925420,0.882363,7.591042e-06,0
9,AAK1,ABLIM1,0.946876,0.08,120596.0,0.994884,0.992168,0.994326,15.954545,2870.192308,...,0.898963,0.996494,7.723982,0.974713,1,0.975881,0.956694,0.994901,1.932182e-05,0
11,AAK1,ACACB,0.971575,4.83,64721.0,0.968102,0.983206,0.924845,431.642857,1204.300000,...,0.882839,0.893047,7.143590,0.969082,4,0.955744,0.937574,0.889824,1.190014e-06,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478335,ZC3HAV1,ZDHHC5,0.994175,1.97,32770.0,0.983448,0.967616,0.979307,338.650000,2319.900000,...,0.975584,0.993708,3.625000,0.971927,3,0.910172,0.973318,0.955837,1.890174e-05,0
1478336,ZC3HAV1,ZFYVE9,0.997563,2.57,138387.0,0.998993,0.983044,0.983673,683.083333,722.833333,...,0.998251,0.997564,2.550000,0.992429,1,0.968362,0.960276,0.992455,1.116394e-05,0
1478337,ZDHHC17,ZDHHC5,0.919718,9.27,56673.0,0.967958,0.884244,0.908784,237.100000,304.066667,...,0.937776,0.925313,0.437500,0.936424,1,0.909210,0.936352,0.938986,4.551768e-06,0
1478338,ZDHHC17,ZFYVE9,0.958484,4.73,114484.0,0.993647,0.909834,0.972343,1258.833333,2738.666667,...,0.977793,0.962664,6.612500,0.965084,1,0.955141,0.914600,0.977961,8.688810e-05,0


In [21]:
len(training)+len(testing)

1478340

In [22]:
training

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score,Similarity
0,AAK1,ABHD17A,0.953884,23.21,207615.0,0.990977,0.985408,0.948779,340.750000,5006.833333,...,0.985801,0.983851,2.589744,0.979764,1,0.953165,0.975387,0.970386,0.000001,0.000000
1,AAK1,ABHD17B,0.801094,3.97,167873.0,0.882328,0.874644,0.909778,284.000000,4532.500000,...,0.879425,0.873878,2.423077,0.808666,1,0.879305,0.871912,0.833293,0.000001,0.000000
2,AAK1,ABI1,0.969118,3.51,101858.0,0.988954,0.950733,0.979989,123.750000,3230.916667,...,0.953674,0.971893,2.538462,0.968425,0,0.919675,0.955133,0.983546,0.000005,0.000000
3,AAK1,ABI2,0.916182,2.45,96863.0,0.937884,0.962939,0.889408,293.029412,1341.600000,...,0.851155,0.902074,0.131410,0.932659,1,0.961966,0.915923,0.892078,0.000005,0.000000
4,AAK1,ABL1,0.934013,2.52,42623.0,0.978149,0.942251,0.957479,206.500000,3184.166667,...,0.948257,0.957789,1.076923,0.926499,2,0.955248,0.979460,0.944632,0.000058,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220111,YWHAH,YWHAZ,0.850606,4.02,23784.0,0.890810,0.986038,0.760589,47.986842,319.833333,...,0.828059,0.950628,2.326087,0.949402,0,0.959916,0.912606,0.886788,0.001101,0.000000
220112,YWHAH,ZDHHC5,0.979230,0.49,20354.0,0.983771,0.869575,0.998791,253.850000,458.266667,...,0.994938,0.890937,4.125000,0.940737,0,0.915946,0.986189,0.981532,0.000018,0.000000
220113,YWHAQ,YWHAZ,0.772859,0.55,10165.0,0.821083,0.848605,0.797151,35.263158,4.833333,...,0.754610,0.816318,0.426087,0.877555,0,0.820303,0.857875,0.844242,0.001186,0.000000
220114,YWHAQ,ZDHHC5,0.997364,5.06,13595.0,0.998587,0.996053,0.995017,170.600000,143.266667,...,0.998440,0.995044,2.225000,0.994055,0,0.997325,0.996946,0.993390,0.000023,0.000000


## Random Forest

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
import xgboost
from xgboost import XGBRegressor

In [28]:
regr = RandomForestRegressor(random_state=0)

In [29]:
training = training.sample(frac=1)

In [30]:
training_mod = training.loc[:, ~training.columns.str.startswith('Gene')]

In [39]:
len(training_mod)

220116

In [40]:
len(training)

220116

In [41]:
X_train = training_mod.iloc[:, ~training_mod.columns.str.startswith('Similarity')][:180000]
Y_train = training['Similarity'].iloc[:180000]

X_valid = training_mod.iloc[:, ~training_mod.columns.str.startswith('Similarity')][180000:]
Y_valid = training['Similarity'].iloc[180000:]

In [42]:
print( len(X_train), len(Y_train), len(X_valid), len(Y_valid))

180000 180000 40116 40116


In [43]:
X_train = X_train.fillna(0)
X_valid = X_valid.fillna(0)

In [62]:
X = training_mod.loc[:, ~training_mod.columns.str.startswith('Similarity')]
Y = training['Similarity']

X = X.fillna(0)

regr = RandomForestRegressor(random_state=0)

In [65]:
testing_mod = testing.loc[:, ~testing.columns.str.startswith('Gene')]
X_test = testing_mod.loc[:, ~testing_mod.columns.str.startswith('Similarity')]
Y_test = testing_mod['Similarity']

In [67]:
regr.fit(X,Y)

RandomForestRegressor(random_state=0)

In [68]:
X_test = X_test.fillna(0)

In [69]:
Y_pred_test = regr.predict(X_test) 

In [70]:
Y_pred_test

array([0.59748608, 1.34364303, 0.77903168, ..., 0.76583904, 0.7950998 ,
       0.77148595])

In [71]:
testing['Similarity'] = Y_pred_test

In [72]:
testing

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score,Similarity
0,AAK1,ABCA2,0.965849,24.71,194664.0,0.966208,0.973625,0.813203,788.400000,897.000000,...,0.957260,0.989616,11.076923,0.941387,0,0.966947,0.873830,0.985086,9.474456e-06,0.597486
1,AAK1,ABCC8,0.948068,8.07,131971.0,0.849677,0.951296,0.774551,113.525000,2913.590909,...,0.911618,0.993592,15.799145,0.936537,0,0.976120,0.958711,0.820609,2.275136e-07,1.343643
2,AAK1,ABCE1,0.902037,7.21,185116.0,0.910468,0.917700,0.854478,230.250000,4159.000000,...,0.852064,0.924167,0.776923,0.903924,1,0.924981,0.925420,0.882363,7.591042e-06,0.779032
9,AAK1,ABLIM1,0.946876,0.08,120596.0,0.994884,0.992168,0.994326,15.954545,2870.192308,...,0.898963,0.996494,7.723982,0.974713,1,0.975881,0.956694,0.994901,1.932182e-05,0.807302
11,AAK1,ACACB,0.971575,4.83,64721.0,0.968102,0.983206,0.924845,431.642857,1204.300000,...,0.882839,0.893047,7.143590,0.969082,4,0.955744,0.937574,0.889824,1.190014e-06,0.516389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478335,ZC3HAV1,ZDHHC5,0.994175,1.97,32770.0,0.983448,0.967616,0.979307,338.650000,2319.900000,...,0.975584,0.993708,3.625000,0.971927,3,0.910172,0.973318,0.955837,1.890174e-05,0.502457
1478336,ZC3HAV1,ZFYVE9,0.997563,2.57,138387.0,0.998993,0.983044,0.983673,683.083333,722.833333,...,0.998251,0.997564,2.550000,0.992429,1,0.968362,0.960276,0.992455,1.116394e-05,0.572945
1478337,ZDHHC17,ZDHHC5,0.919718,9.27,56673.0,0.967958,0.884244,0.908784,237.100000,304.066667,...,0.937776,0.925313,0.437500,0.936424,1,0.909210,0.936352,0.938986,4.551768e-06,0.765839
1478338,ZDHHC17,ZFYVE9,0.958484,4.73,114484.0,0.993647,0.909834,0.972343,1258.833333,2738.666667,...,0.977793,0.962664,6.612500,0.965084,1,0.955141,0.914600,0.977961,8.688810e-05,0.795100


In [73]:
final = training.append(testing)

In [74]:
final

Unnamed: 0,Gene1,Gene2,ovary_hpa_isoform_exp.csv,gc_content.csv,gene_length.csv,lung_hpa_isoform_exp.csv,rectum_hpa_isoform_exp.csv,duodenum_hpa_isoform_exp.csv,Ensembl_aa_length.csv,trans_len.csv,...,spleen_hpa_isoform_exp.csv,parathyroid gland_hpa_isoform_exp.csv,exon_no.csv,smooth muscle_hpa_isoform_exp.csv,pFAM_domain_number.csv,tonsil_hpa_isoform_exp.csv,lymph node_hpa_isoform_exp.csv,placenta_hpa_isoform_exp.csv,mentha_score,Similarity
78921,CRTC1,EEF1D,0.911294,5.384483,79624.414773,0.909009,0.990739,0.892771,374.206044,3617.028736,...,0.928527,0.929479,8.123786,0.963443,1,0.972491,0.905014,0.963849,3.139725e-06,0.000000
202188,RPH3A,RPL35A,0.957233,1.080000,322081.000000,0.918492,0.849794,0.983259,158.016667,551.210526,...,0.955491,0.916978,5.168350,0.777159,1,0.792449,0.932293,0.963471,6.166159e-07,1.008915
217033,SLITRK3,YWHAZ,0.898471,4.660000,26476.000000,0.980706,0.983804,0.986812,527.596491,1607.166667,...,0.704731,0.755241,2.826087,0.943104,0,0.936742,0.970116,0.757447,2.780650e-05,1.698959
104693,EEF1A2,EPN1,0.854231,8.390000,23461.000000,0.969111,0.922063,0.954929,184.000000,4469.250000,...,0.963153,0.980260,1.428571,0.842820,2,0.851858,0.967755,0.962509,3.505518e-05,0.000000
200640,RGS9,RPL28,0.961343,6.040000,108986.000000,0.843984,0.768607,0.900640,502.311111,1158.166667,...,0.946695,0.896341,8.090909,0.791515,3,0.852505,0.803359,0.834593,2.227689e-06,1.008915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478335,ZC3HAV1,ZDHHC5,0.994175,1.970000,32770.000000,0.983448,0.967616,0.979307,338.650000,2319.900000,...,0.975584,0.993708,3.625000,0.971927,3,0.910172,0.973318,0.955837,1.890174e-05,0.502457
1478336,ZC3HAV1,ZFYVE9,0.997563,2.570000,138387.000000,0.998993,0.983044,0.983673,683.083333,722.833333,...,0.998251,0.997564,2.550000,0.992429,1,0.968362,0.960276,0.992455,1.116394e-05,0.572945
1478337,ZDHHC17,ZDHHC5,0.919718,9.270000,56673.000000,0.967958,0.884244,0.908784,237.100000,304.066667,...,0.937776,0.925313,0.437500,0.936424,1,0.909210,0.936352,0.938986,4.551768e-06,0.765839
1478338,ZDHHC17,ZFYVE9,0.958484,4.730000,114484.000000,0.993647,0.909834,0.972343,1258.833333,2738.666667,...,0.977793,0.962664,6.612500,0.965084,1,0.955141,0.914600,0.977961,8.688810e-05,0.795100


In [75]:
final = final[['Gene1', 'Gene2', 'Similarity']]

In [76]:
final.to_csv('combined_similarity_matrix_1720.csv', index= False)

In [77]:
final[final['Similarity'] > 0.95].to_csv('combined_similarity_matrix_1720_0.95.txt' , header=None, sep = '\t',index= False)

In [78]:
max(final['Similarity'])

8.927778244018555