In [2]:
# Mount Google Drive at /content/drive so files stored on Drive can be
# accessed from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Create a dataset with different protein families 

In [4]:
import os

# Switch to the notebooks folder of the project on the mounted Drive.
# The absolute path (Drive is mounted at /content/drive) keeps this cell
# re-runnable: a relative chdir only succeeds the first time, before the
# working directory has been changed.
os.chdir("/content/drive/MyDrive/protein-embedding-02456-2020/notebooks")

In [5]:
! pip install Biopython

Collecting Biopython
[?25l  Downloading https://files.pythonhosted.org/packages/76/02/8b606c4aa92ff61b5eda71d23b499ab1de57d5e818be33f77b01a6f435a8/biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 5.8MB/s 
Installing collected packages: Biopython
Successfully installed Biopython-1.78


In [7]:
# import general libraries
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 

In [8]:
# Read the tab-separated UniProt beta-lactamase export (annotation columns plus
# the sequence of every entry).
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which prevents mixed-dtype columns.
# NOTE(review): the truncated warning in the recorded output looks like the
# DtypeWarning raised by chunked inference - confirm it disappears.
BL_uniprot = pd.read_csv("../data/uniprot-beta-lactamase.tab", sep='\t', low_memory=False)
BL_uniprot.shape

  interactivity=interactivity, compiler=compiler, result=result)


(502432, 27)

In [9]:
# Add a "seq_len" column holding the length of each sequence, then discard
# entries with extreme lengths: only rows with 20 < seq_len <= 500 are kept.
BL_uniprot = (
    BL_uniprot
    .assign(seq_len=BL_uniprot["Sequence"].str.len())
    .loc[lambda frame: frame["seq_len"].gt(20) & frame["seq_len"].le(500)]
)
BL_uniprot.shape

(414788, 28)

In [10]:
# Show the ten most frequent "Protein families" annotations in BL_uniprot,
# used to decide which families to sample from.
BL_uniprot["Protein families"].value_counts().head(10)

Metallo-beta-lactamase superfamily, Glyoxalase II family                                                                       19417
Class-A beta-lactamase family                                                                                                  17665
Hcp beta-lactamase family                                                                                                       7384
Class-C beta-lactamase family                                                                                                   6138
Class-D beta-lactamase family                                                                                                   3712
Metallo-beta-lactamase superfamily, Class-B beta-lactamase family                                                               3512
LysR transcriptional regulatory family                                                                                          2104
Binding-protein-dependent transport system permease family           

## Create unbiased dataset

We pick 900 sequences (`fam_size`) at random from each selected family to create a new dataset in which every beta-lactamase family is equally represented.

In [11]:
# Number of sequences drawn from every family, so that all families end up
# with the same number of rows.
fam_size = 900

# Families included in the balanced beta-lactamase set.
families = [
  "Metallo-beta-lactamase superfamily, Glyoxalase II family",
  "Class-A beta-lactamase family",
  "Hcp beta-lactamase family",
  "Class-C beta-lactamase family",
  "Class-D beta-lactamase family",
  "Metallo-beta-lactamase superfamily, Class-B beta-lactamase family",
]
# Currently disabled candidates:
#   "LysR transcriptional regulatory family",
#   "Binding-protein-dependent transport system permease family",
#   "Peptidase S12 family, YfeW subfamily"

# Seeded, local random generator: the drawn subset is the same on every
# "Restart & Run All" and does not depend on the global NumPy random state.
rng = np.random.default_rng(42)

# Positional (iloc-style) row numbers of every family. One vectorised
# comparison per family replaces a Python-level scan over all rows;
# missing annotations compare unequal and are therefore never selected.
family_column = BL_uniprot["Protein families"]
idx_fam = [np.flatnonzero((family_column == family).to_numpy()) for family in families]

# Draw fam_size distinct row positions from each family.
idx_fam_get = []
for family, cur_idx in zip(families, idx_fam):
  if len(cur_idx) < fam_size:
    # Fail with the family name instead of NumPy's generic sampling error.
    raise ValueError(
      f"Family {family!r} has only {len(cur_idx)} sequences; cannot sample {fam_size} without replacement."
    )
  idx_fam_get.append(rng.choice(cur_idx, size=fam_size, replace=False))

# Flatten the per-family selections into one list of row positions.
flat_fam = [item for sublist in idx_fam_get for item in sublist]


In [12]:
# Extract the sampled rows. flat_fam holds positional row numbers (not index
# labels), hence iloc.
sele_BL_uniprot=BL_uniprot.iloc[flat_fam]
# Check the balance: every selected family should be listed with fam_size rows.
sele_BL_uniprot["Protein families"].value_counts().head(10)

Metallo-beta-lactamase superfamily, Glyoxalase II family             900
Class-D beta-lactamase family                                        900
Metallo-beta-lactamase superfamily, Class-B beta-lactamase family    900
Class-A beta-lactamase family                                        900
Class-C beta-lactamase family                                        900
Hcp beta-lactamase family                                            900
Name: Protein families, dtype: int64

In [13]:
# Preview the first two rows of the balanced beta-lactamase selection.
sele_BL_uniprot.head(n=2)

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Protein families,Taxonomic lineage (ALL),Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (GO),Gene ontology (molecular function),Gene ontology IDs,Taxonomic lineage (CLASS),Taxonomic lineage (FAMILY),Taxonomic lineage (FORMA),Taxonomic lineage (GENUS),Taxonomic lineage (INFRACLASS),Taxonomic lineage (all),Beta strand,Helix,Turn,Mass,Cross-reference (Pfam),Sequence,Taxonomic lineage (PHYLUM),seq_len
84693,A0A6B2BVP3,A0A6B2BVP3_9VIBR,unreviewed,Hydroxyacylglutathione hydrolase (EC 3.1.2.6) ...,gloB CAG51_05270,Vibrio sp. V24_P1S3T111,252,"Metallo-beta-lactamase superfamily, Glyoxalase...","cellular organisms, Bacteria, Proteobacteria, ...",methylglyoxal catabolic process to D-lactate v...,,hydroxyacylglutathione hydrolase activity [GO:...,hydroxyacylglutathione hydrolase activity [GO:...,GO:0004416; GO:0019243; GO:0046872,Gammaproteobacteria,Vibrionaceae,,Vibrio,,"cellular organisms, Bacteria, Proteobacteria, ...",,,,28389,PF16123;PF00753;,MLQIKSIPAFNDNYIWLIQNSDQRCAVVDPGDAAPVLEYLSKHQLT...,Proteobacteria,252
71017,A0A537BF03,A0A537BF03_9PROT,unreviewed,Hydroxyacylglutathione hydrolase (EC 3.1.2.6) ...,gloB E6H49_00240 E6H56_13975,Betaproteobacteria bacterium,258,"Metallo-beta-lactamase superfamily, Glyoxalase...","cellular organisms, Bacteria, Proteobacteria, ...",methylglyoxal catabolic process to D-lactate v...,,hydroxyacylglutathione hydrolase activity [GO:...,hydroxyacylglutathione hydrolase activity [GO:...,GO:0004416; GO:0019243; GO:0046872,Betaproteobacteria,,,,,"cellular organisms, Bacteria, Proteobacteria, ...",,,,28099,PF16123;PF00753;,MPEKGFQVIPLRAFKDNYVWTLRNAACAAVVDPGEAQPVLDYLAAE...,Proteobacteria,258


In [None]:
# export data
#sele_BL_uniprot.to_csv("data/ONLYbalanced900_pfam_uniprot_BL.tsv", sep="\t", index=False)

# Create a dataset with 2 very different protein families


In [15]:
# Load the tab-separated UniProt cytochrome c export (annotations and sequences).
cytoc_uniprot = pd.read_csv("data_small/uniprot_Cytochrome_c.tab", sep='\t')
# Add a "seq_len" column with the length of each sequence and drop entries
# with extreme lengths: only rows with 20 < seq_len <= 500 are kept.
cytoc_uniprot = (
    cytoc_uniprot
    .assign(seq_len=cytoc_uniprot["Sequence"].str.len())
    .loc[lambda frame: frame["seq_len"].gt(20) & frame["seq_len"].le(500)]
)
cytoc_uniprot.shape

(6697, 28)

In [16]:
# Preview the first two rows of the length-filtered cytochrome c table.
cytoc_uniprot.head(n=2)

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Protein families,Taxonomic lineage (ALL),Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (GO),Gene ontology (molecular function),Gene ontology IDs,Taxonomic lineage (CLASS),Taxonomic lineage (FAMILY),Taxonomic lineage (FORMA),Taxonomic lineage (GENUS),Taxonomic lineage (INFRACLASS),Taxonomic lineage (all),Beta strand,Helix,Turn,Mass,Cross-reference (Pfam),Sequence,Taxonomic lineage (PHYLUM),seq_len
1,P00004,CYC_HORSE,reviewed,Cytochrome c,CYCS CYC,Equus caballus (Horse),105,Cytochrome c family,"cellular organisms, Eukaryota, Opisthokonta, M...",apoptotic process [GO:0006915]; cytochrome c-h...,cytochrome complex [GO:0070069]; cytosol [GO:0...,cytochrome complex [GO:0070069]; cytosol [GO:0...,"electron transporter, transferring electrons f...",GO:0005758; GO:0005829; GO:0006122; GO:0006123...,Mammalia,Equidae (horses),,Equus,,"cellular organisms, Eukaryota, Opisthokonta, M...","STRAND 22..24; /evidence=""ECO:0000244|PDB:5IY...","HELIX 4..14; /evidence=""ECO:0000244|PDB:6K9J""...","TURN 15..18; /evidence=""ECO:0000244|PDB:6K9J""...",11833,PF00034;,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPG...,Chordata,105
2,P62897,CYC_MOUSE,reviewed,"Cytochrome c, somatic",Cycs,Mus musculus (Mouse),105,Cytochrome c family,"cellular organisms, Eukaryota, Opisthokonta, M...",activation of cysteine-type endopeptidase acti...,cytosol [GO:0005829]; mitochondrial intermembr...,cytosol [GO:0005829]; mitochondrial intermembr...,"electron transporter, transferring electrons f...",GO:0005634; GO:0005739; GO:0005758; GO:0005829...,Mammalia,Muridae,,Mus,,"cellular organisms, Eukaryota, Opisthokonta, M...",,,,11605,PF00034;,MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAAG...,Chordata,105


In [17]:
# Show the ten most frequent "Protein families" annotations in cytoc_uniprot,
# used to decide which families to sample from.
cytoc_uniprot["Protein families"].value_counts().head(n=10)

Cytochrome c family                                                         4634
Cytochrome c family, PsbV subfamily                                         1005
Cytochrome c family, PetJ subfamily                                          986
Cytochrome c family, PetJ subfamily; Cytochrome c family, PsbV subfamily      57
Multiheme cytochrome c family                                                  2
Cytochrome c family; Heme-copper respiratory oxidase family                    2
CcoP / FixP family; Cytochrome c family, PetJ subfamily                        1
Wnt family; Cytochrome c family                                                1
TFIIF beta subunit family; Cytochrome c family                                 1
Nup35 family; Cytochrome c family                                              1
Name: Protein families, dtype: int64

In [18]:
# Number of sequences drawn from every family, so that all families end up
# with the same number of rows.
fam_size = 900
# Cytochrome c annotations included in the balanced set.
families = [
  "Cytochrome c family",
  "Cytochrome c family, PetJ subfamily",
  "Cytochrome c family, PsbV subfamily",
]
# Table to sample from; the selection cell further down reads it through this name.
data = cytoc_uniprot

# Seeded, local random generator: the drawn subset is the same on every
# "Restart & Run All" and does not depend on the global NumPy random state.
rng = np.random.default_rng(42)

# Positional (iloc-style) row numbers of every family. One vectorised
# comparison per family replaces a Python-level scan over all rows;
# missing annotations compare unequal and are therefore never selected.
family_column = data["Protein families"]
idx_fam = [np.flatnonzero((family_column == family).to_numpy()) for family in families]

# Draw fam_size distinct row positions from each family.
idx_fam_get = []
for family, cur_idx in zip(families, idx_fam):
  if len(cur_idx) < fam_size:
    # Fail with the family name instead of NumPy's generic sampling error.
    raise ValueError(
      f"Family {family!r} has only {len(cur_idx)} sequences; cannot sample {fam_size} without replacement."
    )
  idx_fam_get.append(rng.choice(cur_idx, size=fam_size, replace=False))

# Flatten the per-family selections into one list of row positions.
flat_fam = [item for sublist in idx_fam_get for item in sublist]

In [19]:
# Extract the sampled rows of the selected families. flat_fam holds positional
# row numbers (not index labels), hence iloc.
sele_cytoc_uniprot=data.iloc[flat_fam]
# Check the balance: every selected family should be listed with fam_size rows.
sele_cytoc_uniprot["Protein families"].value_counts().head(10)

Cytochrome c family                    900
Cytochrome c family, PetJ subfamily    900
Cytochrome c family, PsbV subfamily    900
Name: Protein families, dtype: int64

In [20]:
# Stack the balanced cytochrome c and beta-lactamase selections into one table.
# The original row labels of both source tables are kept, so the index of
# BL_CytoC is not guaranteed to be unique.
BL_CytoC=pd.concat([sele_cytoc_uniprot, sele_BL_uniprot ])
# Per-family row counts of the combined table.
BL_CytoC["Protein families"].value_counts().head(n=10)
# Export of the combined table (disabled).
#BL_CytoC.to_csv("data/BL-cytoC_mix_bal.tsv", sep="\t", index=False)

Cytochrome c family                                                  900
Metallo-beta-lactamase superfamily, Glyoxalase II family             900
Class-D beta-lactamase family                                        900
Cytochrome c family, PsbV subfamily                                  900
Metallo-beta-lactamase superfamily, Class-B beta-lactamase family    900
Class-A beta-lactamase family                                        900
Class-C beta-lactamase family                                        900
Cytochrome c family, PetJ subfamily                                  900
Hcp beta-lactamase family                                            900
Name: Protein families, dtype: int64