# Fetch and prepare human interactome data

This Jupyter notebook fetches TSV data from [The Human Reference Protein Interactome Mapping Project](http://www.interactome-atlas.org) and extracts connectivity information from it for constructing a protein-protein interaction (PPI) network.

In [None]:
import os

import pandas as pd

## HuRI

In [None]:
# Load TSV data from website
url = 'http://www.interactome-atlas.org/data/HuRI.tsv'
df = pd.read_csv(url, sep='\t', header=None)

# Check consistency: every row must hold exactly two identifiers that both
# start with 'ENSG'. An explicit exception is raised instead of using
# `assert`, because assertions are removed when Python runs with -O and the
# check would then be skipped silently.
for row in df.itertuples(index=False, name=None):
    # Unpacking fails with ValueError if the table does not have two columns.
    p1, p2 = row
    # str() keeps the check well-defined for non-string cells such as NaN.
    if not (str(p1).startswith('ENSG') and str(p2).startswith('ENSG')):
        raise ValueError(f'Unexpected identifiers in HuRI data: {p1!r}, {p2!r}')

# Save to networks directory
filepath = os.path.join('..', 'networks', 'HuRI.tsv')
# newline='' lets pandas control the line terminator itself; without it,
# text-mode newline translation can double the carriage return on Windows.
with open(filepath, 'w', newline='') as file_handle:
    df.to_csv(file_handle, sep='\t', index=False, header=False)

## HI-union

In [None]:
# Load TSV data from website (243 MB, takes some time)
# Only the first two columns are read; they are labelled via `column_names`.
url = 'http://www.interactome-atlas.org/data/HI-union.tsv'
column_names = ['Protein1', 'Protein2']
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=column_names,
    usecols=[0, 1],
)

# Keep only rows whose first column is not the literal 'sequence'
# (rows that contain only 'sequence    identification)' are dropped)
first_column = column_names[0]
mask = df[first_column].ne('sequence')
df = df.loc[mask]

# Remove prefix 'uniprotkb:' but leave 'ensembl:' as it is
def remove_prefix(word):
    """Return `word` without a leading 'uniprotkb:' prefix.

    Words that start with 'ensembl:' are returned unchanged.

    Parameters
    ----------
    word : str
        Identifier that starts with either 'uniprotkb:' or 'ensembl:'.

    Returns
    -------
    str
        The identifier with 'uniprotkb:' removed, or the unchanged
        'ensembl:' identifier.

    Raises
    ------
    ValueError
        If `word` starts with neither accepted prefix.
    """
    prefix1 = 'uniprotkb:'
    prefix2 = 'ensembl:'
    if word.startswith(prefix1):
        # Slice the prefix off by length. str.lstrip() treats its argument as
        # a set of characters, so it would also eat leading characters of the
        # identifier itself (e.g. 'uniprotkb:protein' -> 'ein').
        return word[len(prefix1):]
    if word.startswith(prefix2):
        return word
    raise ValueError('Word does not contain any accepted prefix:', word)

# Normalise the identifiers in both protein columns with remove_prefix()
for col in column_names:
    df[col] = [remove_prefix(word) for word in df[col].tolist()]

# Save to networks directory
filepath = os.path.join('..', 'networks', 'HI-union-minimal.tsv')
# newline='' lets pandas control the line terminator itself; without it,
# text-mode newline translation can double the carriage return on Windows.
with open(filepath, 'w', newline='') as file_handle:
    df.to_csv(file_handle, sep='\t', index=False, header=False)