In [2]:
# Mount Google Drive into the Colab filesystem so later cells can read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/MyDrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/MyDrive


In [3]:
import pandas as pd
!pip install mygene
import mygene
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def get_gene_symbols_from_proteins(list_of_ensembl_ids):
    """Map Ensembl protein IDs to gene symbols using the MyGene.info client.

    Parameters
    ----------
    list_of_ensembl_ids : iterable of str
        Ensembl protein IDs; they are queried with scope ``ensembl.protein``
        and restricted to ``species='human'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Ensembl_ID`` with a single ``Symbol`` column. IDs for
        which no symbol came back are omitted. If the service returns more
        than one hit for the same ID, only the first hit that carries a symbol
        is kept, so the index is unique and safe to join on.
    """
    query_ids = list(list_of_ensembl_ids)
    if not query_ids:
        # Nothing to look up: skip the network round trip entirely.
        return pd.DataFrame(columns=['Symbol'],
                            index=pd.Index([], name='Ensembl_ID'))

    mg = mygene.MyGeneInfo()
    # returnall=True makes querymany return a dict; the hits live under 'out'.
    res = mg.querymany(query_ids,
                       scopes='ensembl.protein',
                       fields='symbol',
                       species='human',
                       returnall=True)

    # One [query, symbol] pair per hit; symbol is None when the hit has none.
    id_symbol_pairs = [[hit['query'], hit.get('symbol')] for hit in res['out']]
    node_names = pd.DataFrame(id_symbol_pairs,
                              columns=['Ensembl_ID', 'Symbol']).set_index('Ensembl_ID')

    # Drop unresolved IDs first so a later hit with a symbol can still win,
    # then collapse repeated hits: a non-unique index would multiply rows in
    # any join performed against this table.
    node_names = node_names.dropna(axis=0)
    return node_names[~node_names.index.duplicated(keep='first')]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)
Installing collected packages: biothings-client, mygene
Successfully installed biothings-client-0.2.6 mygene-3.2.2


In [4]:
# Load the space-separated interaction table from the mounted Drive.
ppi_table_path = '/content/MyDrive/MyDrive/nithya/data.txt'
string_ppis = pd.read_csv(ppi_table_path, sep=' ')
string_ppis

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,155
1,9606.ENSP00000000233,9606.ENSP00000314067,197
2,9606.ENSP00000000233,9606.ENSP00000263116,222
3,9606.ENSP00000000233,9606.ENSP00000361263,181
4,9606.ENSP00000000233,9606.ENSP00000409666,270
...,...,...,...
11938493,9606.ENSP00000485678,9606.ENSP00000354800,213
11938494,9606.ENSP00000485678,9606.ENSP00000308270,151
11938495,9606.ENSP00000485678,9606.ENSP00000335660,181
11938496,9606.ENSP00000485678,9606.ENSP00000300127,154


In [5]:
# Minimum combined_score an interaction needs to be kept as "high confidence".
MIN_COMBINED_SCORE = 850

# .copy() so the ID clean-up below never writes into a view of string_ppis.
high_conf_string_ppis = string_ppis[string_ppis['combined_score'] >= MIN_COMBINED_SCORE].copy()

# IDs look like '9606.ENSP00000000233'; keep only the part after the first dot.
for id_column in ('protein1', 'protein2'):
    stripped_ids = high_conf_string_ppis[id_column].str.split('.').str[1]
    if stripped_ids.isna().any():
        # Fail loudly rather than carrying NaN identifiers into the symbol lookup.
        raise ValueError(f"column {id_column!r} contains IDs without a '<prefix>.' part")
    high_conf_string_ppis[id_column] = stripped_ids

high_conf_string_ppis

Unnamed: 0,protein1,protein2,combined_score
160,ENSP00000000233,ENSP00000263245,877
187,ENSP00000000233,ENSP00000440005,969
401,ENSP00000000233,ENSP00000356737,914
565,ENSP00000000233,ENSP00000328551,936
944,ENSP00000000233,ENSP00000429900,879
...,...,...,...
11938186,ENSP00000485663,ENSP00000436049,999
11938211,ENSP00000485663,ENSP00000248342,999
11938226,ENSP00000485663,ENSP00000416255,908
11938234,ENSP00000485663,ENSP00000220849,999


In [6]:
# Every distinct protein ID appearing on either side of an interaction.
# pd.concat replaces Series.append, which was removed in pandas 2.0; the
# order (all of protein1, then protein2) is the same as before.
ens_names = pd.concat(
    [high_conf_string_ppis['protein1'], high_conf_string_ppis['protein2']]
).unique()

# Look up the gene symbol for each protein ID (network call to MyGene.info).
ens_to_symbol = get_gene_symbols_from_proteins(ens_names)
ens_to_symbol

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-12885...done.
Finished.
365 input query terms found no hit:
	['ENSP00000035383', 'ENSP00000062104', 'ENSP00000155858', 'ENSP00000204615', 'ENSP00000205890', 'ENS


Unnamed: 0_level_0,Symbol
Ensembl_ID,Unnamed: 1_level_1
ENSP00000000233,ARF5
ENSP00000000412,M6PR
ENSP00000001008,FKBP4
ENSP00000001146,CYP26B1
ENSP00000002125,NDUFAF7
...,...
ENSP00000485586,NXF2
ENSP00000485615,LOC102724200
ENSP00000485627,WASHC1
ENSP00000485659,MUC5AC


In [7]:
# Attach the gene symbol of each partner. Inner joins drop interactions in
# which either protein has no symbol in ens_to_symbol.
p1_incl = high_conf_string_ppis.join(ens_to_symbol, on='protein1', how='inner')
# 'Symbol' already exists after the first join, so the second lookup's column
# is suffixed and arrives as 'Symbol_p2'.
both_incl = p1_incl.join(ens_to_symbol, on='protein2', how='inner', rsuffix='_p2')

# Rename by column name rather than by position, so a change in column order
# cannot silently swap the labels; a missing column fails in the selection below.
string_edgelist_symbols = (
    both_incl
    .drop(columns=['protein1', 'protein2'])
    .rename(columns={
        'combined_score': 'confidence',
        'Symbol': 'partner1',
        'Symbol_p2': 'partner2',
    })
)
string_ppi_final = string_edgelist_symbols[['partner1', 'partner2', 'confidence']]
string_ppi_final

Unnamed: 0,partner1,partner2,confidence
160,ARF5,ARFGAP3,877
927595,CLTA,ARFGAP3,924
1120046,COPB1,ARFGAP3,935
1489555,KDELR2,ARFGAP3,934
3865243,ARF4,ARFGAP3,889
...,...,...,...
11882644,POU2AF3,POU2AF2,873
11897431,DNAAF11,CFAP298,856
11907953,SPATA31A7,SPATA32,869
11914156,CIITA,ZXDC,906


In [8]:
from tqdm import tqdm

# Strip stray whitespace once per column instead of repeatedly per row.
# (The former bare `string_ppi_final.reset_index()` call discarded its result
# and therefore did nothing, so it has been removed.)
partner1_symbols = string_ppi_final['partner1'].str.strip()
partner2_symbols = string_ppi_final['partner2'].str.strip()

# edges: one [partner1, partner2] pair per interaction, in table order.
edges = [
    [p1, p2]
    for p1, p2 in tqdm(zip(partner1_symbols, partner2_symbols),
                       total=len(string_ppi_final))
]

# all_proteins: every distinct symbol that appears on either side of an edge.
all_proteins = set(partner1_symbols) | set(partner2_symbols)

   

278602it [00:26, 10467.03it/s]


In [9]:
import random

# Random integer score in [1, 10] for every protein.
# A dedicated, seeded generator plus sorted iteration makes the assignment
# reproducible across kernel restarts (set iteration order is not stable
# between Python processes) without touching the global random state.
PROTEIN_SCORE_SEED = 0
score_rng = random.Random(PROTEIN_SCORE_SEED)

protein_score_map = {
    protein: score_rng.randint(1, 10)
    for protein in sorted(all_proteins)
}


100%|██████████| 12397/12397 [00:00<00:00, 426388.40it/s]


In [10]:
# NOTE: despite its name, index_to_protein maps protein symbol -> 1-based index.
# Iterating in sorted order keeps the numbering identical from run to run;
# iterating the raw set would renumber proteins between kernel sessions.
index_to_protein = {
    protein: index
    for index, protein in enumerate(sorted(all_proteins), start=1)
}

len(index_to_protein)

12397

In [15]:
# Re-express every symbol-level edge as a pair of integer node indices.
indexed_edges = [
    [index_to_protein[source], index_to_protein[target]]
    for source, target in edges
]
len(indexed_edges)

278602

In [16]:
import csv

# Symbol-level edge list, one "partner1,partner2" record per line (no header).
# newline='' is the csv-module convention; `with` closes the file even on error.
with open('network_1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(edges)

# Tab-separated copy. The CSV has no header row, so read it with header=None
# (otherwise the first edge is taken as column names) and keep every field as
# literal text so no symbol is reinterpreted as a missing value.
df = pd.read_csv('network_1.csv', header=None, dtype=str, keep_default_na=False)
df.to_csv('network_1.tsv', index=False, header=False, sep="\t")

In [17]:
# Index-level edge list, one "index1,index2" record per line (no header).
# newline='' is the csv-module convention; `with` closes the file even on error.
with open('network_1_edge_list.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(indexed_edges)

# Tab-separated copy. header=None stops the first edge from being consumed as
# column names (two equal indices there would be renamed to e.g. "5" / "5.1").
df = pd.read_csv('network_1_edge_list.csv', header=None, dtype=str, keep_default_na=False)
df.to_csv('network_1_edge_list.tsv', index=False, header=False, sep="\t")

In [19]:
# Lookup table, one "index,protein" record per line (no header).
# newline='' is the csv-module convention; `with` closes the file even on error.
with open('network_1_index_gene.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([index, protein] for protein, index in index_to_protein.items())

# Tab-separated copy. header=None keeps the first record as data, and reading
# as plain text prevents any symbol from being parsed as a missing value.
df = pd.read_csv('network_1_index_gene.csv', header=None, dtype=str, keep_default_na=False)
df.to_csv('network_1_index_gene.tsv', index=False, header=False, sep="\t")

In [20]:
# Per-protein scores, one "protein,score" record per line (no header).
# newline='' is the csv-module convention; `with` closes the file even on error.
with open('scores_1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([protein, score] for protein, score in protein_score_map.items())

# Tab-separated copy. header=None keeps the first record as data, and reading
# as plain text prevents any symbol from being parsed as a missing value.
df = pd.read_csv('scores_1.csv', header=None, dtype=str, keep_default_na=False)
df.to_csv('scores_1.tsv', index=False, header=False, sep="\t")

In [22]:
# Second score set: a random value from {0.1, 0.2, ..., 1.0} per protein,
# written as "protein,score" records (no header).
# A dedicated seeded generator and sorted iteration make the file reproducible
# across kernel restarts (set iteration order is not stable between processes).
SCORES_2_SEED = 1
scores_2_rng = random.Random(SCORES_2_SEED)

# newline='' is the csv-module convention; `with` closes the file even on error.
with open('scores_2.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(
        [protein, scores_2_rng.randint(1, 10) / 10.0]
        for protein in sorted(all_proteins)
    )

# Tab-separated copy. header=None keeps the first record as data, and reading
# as plain text leaves every field exactly as it was written.
df = pd.read_csv('scores_2.csv', header=None, dtype=str, keep_default_na=False)
df.to_csv('scores_2.tsv', index=False, header=False, sep="\t")