In [1]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import seaborn as sns

import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()

import glob, os

# Virus Intact Database

42,154 pairwise interactions from <a href="https://www.ebi.ac.uk/legacy-intact/query/annot:%22dataset:virus%22?conversationContext=7" target="\blank">here</a>.

In [441]:
virus_human = pd.read_csv("Data/virus_human.csv")
human_human = pd.read_csv("Data/human_important_for_virus.csv")

In [445]:
virus_human[["Protein A", "Protein B", "Protein A DB", "Protein B DB"]]

Unnamed: 0,Protein A,Protein B,Protein A DB,Protein B DB
0,K9N643,O00505,uniprotkb,uniprotkb
1,K9N643,O00629,uniprotkb,uniprotkb
2,K9N643,O00629,uniprotkb,uniprotkb
3,K9N643,O00629,uniprotkb,uniprotkb
4,P52294,K9N643,uniprotkb,uniprotkb
...,...,...,...,...
27019,P04608,Q99733,uniprotkb,uniprotkb
27020,P04608,O75607,uniprotkb,uniprotkb
27021,P04608,O15355,uniprotkb,uniprotkb
27022,P04608,O43865,uniprotkb,uniprotkb


# Get list of human proteins with which to do enrichment. 

# Include proteins from the `human_human` dataframe too. Compare to all human proteins

In [556]:
prots_A = virus_human.loc[virus_human["Taxname A"] == "Homo sapiens"]["Protein A"].values
prots_B = virus_human.loc[virus_human["Taxname B"] == "Homo sapiens"]["Protein B"].values

virus_human_proteins = set(prots_A).union(prots_B)
human_human_proteins = set(human_human["Protein A"]).union(set(human_human["Protein B"]))

human_proteins = list(virus_human_proteins.union(human_human_proteins))
print(f"{len(human_proteins)} human proteins")
pd.DataFrame(human_proteins).to_csv("Data/human_proteins.txt", index=False, header=None, sep="\t")

# converted UniProt to UniProt KB ID to update names. This is a full dataframe, not a text file
human_fixed_df = pd.read_csv("Data/human_proteins_fixed.tab", sep="\t")

# indicates that the second column is what we want
sum(human_proteins_fixed_df.iloc[:, 0].values != human_proteins_fixed_df.iloc[:, 1].values)

# save the updated UniProt IDs to a new text file
human_proteins_lst = human_proteins_fixed_df["Entry"].values
pd.DataFrame(human_proteins_lst).to_csv("Data/human_proteins_updated.txt", sep="\t", index=False, header=None)

9061 human proteins


358

In [673]:
len(human_fixed_df)

8673

In [602]:
human_proteins_lst = pd.read_csv("Data/human_proteins_updated.txt", sep="\t", header=None).iloc[:, 0].values
print(len(human_proteins_lst), len(np.unique(human_proteins_lst)))

human_proteins_all = pd.read_csv("Data/human_proteins_all.gz", sep="\t", compression="gzip")
print(len(human_proteins_all["Entry"].unique()), len(human_proteins_all))
human_proteins_all[["Entry"]].to_csv("Data/human_proteins_all.txt", sep="\t", index=False, header=None)

8673 8673
79052 79052


In [647]:
def get_genes(df, col):
    
    gene_names = []

    for _, row in df.iterrows():

        if not pd.isnull(row["Gene names"]):
            gene_names.append(row["Gene names"].split(" ")[0])
        else:
            gene_names.append(row[col].split("_")[0])
            
    return gene_names

In [671]:
gene_names_all = get_genes(human_proteins_all, "Entry name")

# phew, that makes more sense
print(f"{len(np.unique(gene_names_all))} genes")

gene_names = get_genes(human_proteins_fixed_df, "Entry name")

# phew, that makes more sense
print(f"{len(np.unique(gene_names))} genes")

#pd.DataFrame(np.unique(gene_names_all)).to_csv("Data/human_genes_all.txt", sep="\t", header=None, index=False)

# take the intersection because all the genes from the small list should be in the large database. Probably just mislabeling or something
#pd.DataFrame(set(gene_names).intersection(gene_names_all)).to_csv("Data/human_genes.txt", sep="\t", header=None, index=False)

21340 genes
8045 genes


In [674]:
human_proteins_all = pd.read_csv("Data/human_proteins_all.gz", sep="\t", compression="gzip")

print(f"{len(human_proteins_all["Entry"].unique())} proteins in the human interactome")


SyntaxError: invalid syntax (1031757919.py, line 3)

In [676]:
len(human_proteins_all["Entry"].unique())

79052

In [661]:
genes = pd.read_csv("Data/human_genes.txt", sep="\t", header=None).iloc[:, 0].values
genes_all = pd.read_csv("Data/human_genes_all.txt", sep="\t", header=None).iloc[:, 0].values

In [663]:
# all significant at alpha = 0.05 even after FDR
go_function = pd.read_csv("Data/GO_genes_function.tsv", sep="\t")
go_process = pd.read_csv("Data/GO_genes_process.tsv", sep="\t")

In [666]:
go_process.shape

(1692, 10)

In [668]:
go_process.Description.values

array(['cellular metabolic process', 'metabolic process',
       'macromolecule metabolic process', ...,
       'ribosomal subunit export from nucleus',
       'positive regulation of type I interferon-mediated signaling pathway',
       'positive regulation of double-strand break repair via homologous recombination'],
      dtype=object)

In [669]:
len(go_process.loc[go_process["FDR q-value"] < 0.05])

1692

In [670]:
go_process["Genes"][0].split(", ")

['[AFG3L2  -  afg3 atpase family member 3-like 2 (s. cerevisiae)',
 'POP1  -  processing of precursor 1',
 'ribonuclease p/mrp subunit (s. cerevisiae)',
 'MSL3  -  male-specific lethal 3 homolog (drosophila)',
 'AGAP3  -  arfgap with gtpase domain',
 'ankyrin repeat and ph domain 3',
 'AADAC  -  arylacetamide deacetylase',
 'SF3A3  -  splicing factor 3a',
 'subunit 3',
 '60kda',
 'HNRNPA0  -  heterogeneous nuclear ribonucleoprotein a0',
 'ABAT  -  4-aminobutyrate aminotransferase',
 'SEC61B  -  sec61 beta subunit',
 'PDIA5  -  protein disulfide isomerase family a',
 'member 5',
 'ABCF1  -  atp-binding cassette',
 'sub-family f (gcn20)',
 'member 1',
 'OS9  -  osteosarcoma amplified 9',
 'endoplasmic reticulum lectin',
 'ABL1  -  c-abl oncogene 1',
 'non-receptor tyrosine kinase',
 'ABCA4  -  atp-binding cassette',
 'sub-family a (abc1)',
 'member 4',
 'ABL2  -  c-abl oncogene 2',
 'non-receptor tyrosine kinase',
 'ERP29  -  endoplasmic reticulum protein 29',
 'ACACA  -  acetyl-coa carb