##### Parsing BioGRID data

In [1]:
import os
import pandas as pd

# Load every tab-separated '.txt' file found in directory_path, drop the rows
# that should not contribute to the network, and collect the resulting
# dataframes in `dataframes`.
directory_path = 'files/construction_data'

file_extension = '.txt'

# Rows whose 'Experimental System' equals one of these values are dropped.
# Every entry needs its own trailing comma: Python concatenates adjacent
# string literals, which would silently merge two names into a single value
# that never matches anything.
excluded_systems = [
    'Co-localization',
    'Genetic interference',
    'Synthetic Rescue',
    'Synthetic Growth Defect',
    'Synthetic Lethality',
]

dataframes = []
# Filled in by a later step with (id, id) tuples of interacting genes.
enterez_id_pairs = set()

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and anything derived from "first file seen") reproducible.
for file in sorted(os.listdir(directory_path)):
    if not file.endswith(file_extension):
        continue

    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path, sep='\t')
    df = df[~df['Experimental System'].isin(excluded_systems)]
    # Keep only rows where neither Entrez ID column holds the literal '-'.
    df = df[(df['Entrez Gene Interactor A'] != '-') & (df['Entrez Gene Interactor B'] != '-')]

    dataframes.append(df)

In [15]:
# Maps each Entrez gene ID (as a string) to the first official symbol seen
# for it while scanning the loaded dataframes in order.
official_names_mapping = {}

for df in dataframes:
    # Walk the four needed columns in lockstep rather than df.iterrows():
    # iterrows builds a Series for every row, which is slow on large tables
    # and does not preserve column dtypes across the row.
    interactions = zip(
        df['Entrez Gene Interactor A'],
        df['Entrez Gene Interactor B'],
        df['Official Symbol Interactor A'],
        df['Official Symbol Interactor B'],
    )
    for raw_a, raw_b, symbol_a, symbol_b in interactions:
        id_a = str(raw_a)
        id_b = str(raw_b)

        # First symbol recorded for an ID wins; later rows never overwrite it.
        official_names_mapping.setdefault(id_a, symbol_a)
        official_names_mapping.setdefault(id_b, symbol_b)

        # Self-interactions do not produce a pair.
        if id_a == id_b:
            continue

        # Order the two IDs numerically so (a, b) and (b, a) collapse into one
        # pair. The ID strings themselves are stored (not re-rendered from
        # the integers), so every pair element is a key of
        # official_names_mapping. int() raises ValueError on a non-numeric ID.
        enterez_id_pairs.add(tuple(sorted((id_a, id_b), key=int)))

In [17]:
# Write one line per collected gene pair to files/network.sif in the form
# "<symbol A><TAB>-<TAB><symbol B>".
with open("files/network.sif", "w", encoding="utf-8") as sif_file:
    # A set has no stable iteration order; sorting the pairs makes the output
    # file identical from run to run for the same input.
    for source_id, target_id in sorted(enterez_id_pairs):
        first = official_names_mapping[source_id]
        second = official_names_mapping[target_id]

        # Pairs where either side carries the 'No symbol found' value are
        # left out of the file.
        if first != 'No symbol found' and second != 'No symbol found':
            sif_file.write(f"{first}\t-\t{second}\n")