1. Print the first six lines of the interaction datafile within the python/R session
2. Compute the frequencies of interaction types in the database, print in order of decreasing frequency
3. Compute the total number of protein-protein interactions in the database
4. Compute the total number of proteins that participate in protein-protein interactions
5. Compute the number of unique protein-protein interactions (undirected)
6. Optional: make it more interesting: do not use the file system for saving the SIF file or compressed SIF file

For protein-protein interactions: “interacts-with”, “neighbor-of”, and “in-complex-with”

In [8]:
from urllib.request import urlopen
import gzip
import timeit

baseURL = "http://www.pathwaycommons.org/archives/PC2/current/"
filename = "PathwayCommons.8.All.BINARY_SIF.hgnc.txt.sif.gz"
outFilePath = "pc.sif"
interaction_types_ppi = set(["interacts-with","in-complex-with","neighbor-of"])

start_time = timeit.default_timer()

zfd = urlopen(baseURL + filename)
fd = gzip.GzipFile(fileobj=zfd, mode="r")

# initialize the SIF file interaction counter
intctr = 0
linectr = 0
from collections import defaultdict

interactions = set()
proteins = set()
intnamectr = defaultdict(int)

for line in fd:
    if linectr < 6:
        print(line)
        
    linectr += 1
    
    [prot1, interaction_type, prot2] = line.decode("utf-8").rstrip("\n").split("\t")
    intnamectr[interaction_type] += 1
    if interaction_type in interaction_types_ppi:
        intctr += 1
        proteins |= set([prot1, prot2])
        interactions.add(min(prot1, prot2) + "-" + max(prot1, prot2))       
        
elapsed = timeit.default_timer() - start_time

print(elapsed)

b'A1BG\tcontrols-expression-of\tA2M\n'
b'A1BG\tinteracts-with\tABCC6\n'
b'A1BG\tcontrols-phosphorylation-of\tAKT1\n'
b'A1BG\tcontrols-state-change-of\tAKT1\n'
b'A1BG\tinteracts-with\tANXA7\n'
b'A1BG\tinteracts-with\tCDKN1A\n'
5.516805504099466


In [10]:
print(intctr)

523498


In [11]:
len(proteins)

17020

In [12]:
len(interactions)

491784

In [13]:
from operator import itemgetter
sorted(intnamectr.items(), key=itemgetter(1), reverse=True)

[('interacts-with', 369895),
 ('in-complex-with', 153603),
 ('chemical-affects', 135268),
 ('catalysis-precedes', 120948),
 ('controls-expression-of', 110013),
 ('controls-state-change-of', 106156),
 ('controls-production-of', 18482),
 ('consumption-controlled-by', 16816),
 ('controls-phosphorylation-of', 15636),
 ('used-to-produce', 13705),
 ('controls-transport-of', 6960),
 ('reacts-with', 3607),
 ('controls-transport-of-chemical', 2847)]

In [71]:
import pandas
zfd = urlopen(baseURL + filename)
fd = gzip.GzipFile(fileobj=zfd, mode="r")
df = pandas.read_csv(fd, sep="\t", names=["species1","interaction_type","species2"])

In [54]:
print(df.head())
print(df.shape[0])

  species1             interaction_type species2
0     A1BG       controls-expression-of      A2M
1     A1BG               interacts-with    ABCC6
2     A1BG  controls-phosphorylation-of     AKT1
3     A1BG     controls-state-change-of     AKT1
4     A1BG               interacts-with    ANXA7
1073936


In [24]:
df.interaction_type.unique()

array(['controls-expression-of', 'interacts-with',
       'controls-phosphorylation-of', 'controls-state-change-of',
       'in-complex-with', 'catalysis-precedes', 'controls-production-of',
       'controls-transport-of', 'controls-transport-of-chemical',
       'chemical-affects', 'consumption-controlled-by', 'reacts-with',
       'used-to-produce'], dtype=object)

In [63]:
ppirows = df.interaction_type.isin(interaction_types_ppi)
sum(ppirows)

523498

In [64]:
newlist = df["species1"][ppirows].tolist() + df["species2"][ppirows].tolist()
len(set(newlist))

17020

In [68]:
len(set(df["species1"][ppirows] + "-" + df["species2"][ppirows]))

491784

In [70]:
df["interaction_type"].value_counts()

interacts-with                    369895
in-complex-with                   153603
chemical-affects                  135268
catalysis-precedes                120948
controls-expression-of            110013
controls-state-change-of          106156
controls-production-of             18482
consumption-controlled-by          16816
controls-phosphorylation-of        15636
used-to-produce                    13705
controls-transport-of               6960
reacts-with                         3607
controls-transport-of-chemical      2847
Name: interaction_type, dtype: int64