##### Parsing Biogrid data

In [1]:
import os
import pandas as pd

directory_path = 'files/construction_data/BioGRID'

file_extension = '.txt'

dataframes = []
enterez_id_pairs = set()

for file in os.listdir(directory_path):
    if file.endswith(file_extension):
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path, delimiter='\t')
        df = df[~df['Experimental System'].isin(['Co-localization', 'Genetic interference'
                                      'Synthetic Rescue', 'Synthetic Growth Defect',
                                      'Synthetic Lethality'])]
        df = df[(df['SWISS-PROT Accessions Interactor A'] != '-') & (df['SWISS-PROT Accessions Interactor B'] != '-')]
        
        dataframes.append(df)

In [2]:
dataframes[0].columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism ID Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names',
       'Ontology

In [3]:
official_names_mapping = {}
for df in dataframes:
    for index, row in df.iterrows():
        id_a = str(row['SWISS-PROT Accessions Interactor A'])
        id_b = str(row['SWISS-PROT Accessions Interactor B'])
        
        if id_a not in official_names_mapping:
            official_names_mapping[id_a] = row['Official Symbol Interactor A']
            
        if id_b not in official_names_mapping:
            official_names_mapping[id_b] = row['Official Symbol Interactor B']
        
        if id_a != id_b:
            pair = tuple(sorted([id_a, id_b]))
            enterez_id_pairs.add(pair)

In [4]:
enterez_id_pairs

{('O00170', 'O14576'),
 ('O00170', 'O14733'),
 ('O00170', 'O14874'),
 ('O00170', 'O15197'),
 ('O00170', 'O15392'),
 ('O00170', 'O43493'),
 ('O00170', 'O60260'),
 ('O00170', 'O60346'),
 ('O00170', 'O60885'),
 ('O00170', 'O75161'),
 ('O00170', 'O75170'),
 ('O00170', 'O75665'),
 ('O00170', 'O75953'),
 ('O00170', 'O76071'),
 ('O00170', 'O94761'),
 ('O00170', 'O94966'),
 ('O00170', 'O95140'),
 ('O00170', 'O95278'),
 ('O00170', 'O95999'),
 ('O00170', 'P00533'),
 ('O00170', 'P04049'),
 ('O00170', 'P04090'),
 ('O00170', 'P04626'),
 ('O00170', 'P04629'),
 ('O00170', 'P06239'),
 ('O00170', 'P06241'),
 ('O00170', 'P06733'),
 ('O00170', 'P06748'),
 ('O00170', 'P07900'),
 ('O00170', 'P07947'),
 ('O00170', 'P07949'),
 ('O00170', 'P08235'),
 ('O00170', 'P08238'),
 ('O00170', 'P09769'),
 ('O00170', 'P09936'),
 ('O00170', 'P0DTC9'),
 ('O00170', 'P0DTD1'),
 ('O00170', 'P0DTD2'),
 ('O00170', 'P10398'),
 ('O00170', 'P11279'),
 ('O00170', 'P11801'),
 ('O00170', 'P12931'),
 ('O00170', 'P13385'),
 ('O00170',

In [5]:
len(enterez_id_pairs)

164

### Parsing IntaAct

Nije potrebno dodatno filtriranje, svo filitriranje odradjeno prilikom preuzimanja podataka.

In [6]:
directory_path = 'files/construction_data/IntAct'
file_extension = '.txt'
dataframes = []

for file in os.listdir(directory_path):
    if file.endswith(file_extension):
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path, delimiter='\t')
        dataframes.append(df)

In [7]:
import re

def get_gene_name(text):
    print(text)
    pattern1 = re.compile(r'uniprotkb:([a-zA-Z0-9_-]+?)\(gene name\)')
    pattern2 = re.compile(r'psi-mi:([a-zA-Z0-9_-]+?)\(display_short\)')
    match1 = pattern1.search(text)
    if match1:
        return match1.group(1)
    else:
        match2 = pattern2.search(text)
        result = match2.group(1)
        if result.endswith('_human'):
            result = result[:-6]
        return result

def get_swiss_id(text):
    pattern = re.compile(r'uniprotkb:(\w+)')
    match = pattern.search(text)
    if match:
        return match.group(1)
    else:
        return None

In [8]:
for df in dataframes:
    for index, row in df.iterrows():
        id_a = get_swiss_id(row['# ID(s) interactor A'])
        id_b = get_swiss_id(row['ID(s) interactor B'])
        
        if id_a == None or id_b == None:
            continue
        
        if id_a not in official_names_mapping:
            official_names_mapping[id_a] = get_gene_name(row['Alias(es) interactor A'])
            
        if id_b not in official_names_mapping:
            official_names_mapping[id_b] = get_gene_name(row['Alias(es) interactor B'])
        
        if id_a != id_b:
            pair = tuple(sorted([id_a, id_b]))
            enterez_id_pairs.add(pair)

psi-mi:pde2a_human(display_short)|psi-mi:cGMP-dependent 3',5'-cyclic phosphodiesterase(display_long)|uniprotkb:PDE2A(gene name)|uniprotkb:Cyclic GMP-stimulated phosphodiesterase(gene name synonym)
psi-mi:max_human(display_short)|psi-mi:Protein max(display_long)|uniprotkb:MAX(gene name)|uniprotkb:Myc-associated factor X(gene name synonym)|uniprotkb:Class D basic helix-loop-helix protein 4(gene name synonym)|uniprotkb:BHLHD4(gene name synonym)|IntAct:NM_002382(author assigned name)|IntAct:4149(author assigned name)
psi-mi:ebna3_ebvb9(display_short)|psi-mi:Epstein-Barr nuclear antigen 3(display_long)|uniprotkb:Epstein-Barr nuclear antigen 3A(gene name synonym)|uniprotkb:BLRF3-BERF1(orf name)|uniprotkb:EBNA3(gene name)
psi-mi:rmp_human(display_short)|psi-mi:Unconventional prefoldin RPB5 interactor 1(display_long)|uniprotkb:URI1(gene name)|uniprotkb:C19orf2(gene name synonym)|uniprotkb:NNX3(gene name synonym)|uniprotkb:PPP1R19(gene name synonym)|uniprotkb:Protein NNX3(gene name synonym)|uni

In [9]:
len(enterez_id_pairs)

182

In [10]:
with open("files/network.sif", "w") as file:
    for pair in enterez_id_pairs:
            first = official_names_mapping[pair[0]]
            second = official_names_mapping[pair[1]]
            
            file.write(f"{first}\t-\t{second}\n")