In [1]:
import os 
os.chdir("drive/MyDrive/02456-protein-project2020/notebooks")

# Test embeddings on downstream task
We will test the embedding using a downstream task. Here we will assert how well a simple random forest classifier performs on the data. We will consider one task classification of pfam family. All the models considered here have been treained on the mixed cytochrome c and beta-lactamase sequences from uniprot.  

In [2]:
# import modules 
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
import sklearn
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, multilabel_confusion_matrix

import torch

In [27]:
#"emb_BL-cytoC_mix_b_32_train13252_iter300_hidden5.pth",
embeddings = ["emb_GRU_BL-cytoC_b_32_train13252_iter1000_hidden5.pth","emb_BL-cytoC_mix_b_32_train13252_iter300_hidden50.pth", "emb_BL-cytoC_mix_b_32_train13252_iter300_hidden100.pth","emb_BL-cytoC_mix_b_32_train13252_iter300_hidden500.pth", "emb_BL-cytoC_mix_bidir_GRU_b_32_train13252_iter600_hidden5.pth", "emb_BL-cytoC_mix_bidir_GRU_b_32_train13252_iter300_hidden50.pth", "emb_BL-cytoC_mix_bidir_GRU_b_32_train13252_iter500_hidden100.pth", "emb_BL-cytoC_mix_bidir_GRU_b_32_train13252_iter300_hidden500.pth"]

acc_last=[]
acc_max=[]
acc_mean=[]

for emb in embeddings: 
  cur_emb=torch.load("embeddings/{}".format(emb), map_location=torch.device('cpu'))
  cur_metadata = cur_emb["metadata"]
  


  ## get last 
  cur_emb_last=cur_emb["emb_last"]
  X_train, X_test, Y_train, Y_test =sklearn.model_selection.train_test_split(cur_emb_last,\
                            cur_metadata["Protein families"], train_size=0.8, random_state=False)
  model = RandomForestClassifier(n_estimators=100, bootstrap=True)

  model.fit(X_train, Y_train)
  rf_predictions = model.predict(X_test)
  cur_acc = accuracy_score(Y_test, rf_predictions)
  acc_last.append(cur_acc)

  ## get max 
  cur_emb_max=cur_emb["emb_max"]
  X_train, X_test, Y_train, Y_test =sklearn.model_selection.train_test_split(cur_emb_max,\
                            cur_metadata["Protein families"], train_size=0.8, random_state=False)
  model = RandomForestClassifier(n_estimators=100, bootstrap=True)

  model.fit(X_train, Y_train)
  rf_predictions = model.predict(X_test)
  cur_acc_max = accuracy_score(Y_test, rf_predictions)
  acc_max.append(cur_acc_max)

  ## get mean
  cur_emb_mean=cur_emb["emb_mean"]
  X_train, X_test, Y_train, Y_test =sklearn.model_selection.train_test_split(cur_emb_mean,\
                            cur_metadata["Protein families"], train_size=0.8, random_state=False)
  model = RandomForestClassifier(n_estimators=100, bootstrap=True)

  model.fit(X_train, Y_train)
  rf_predictions = model.predict(X_test)
  cur_acc_mean = accuracy_score(Y_test, rf_predictions)
  acc_mean.append(cur_acc_mean)


In [28]:
import pandas as pd
df=pd.DataFrame({"architecture":["hidden 5, GRU", "hidden 50, GRU","hidden 100, GRU", "hidden 500, GRU", "hidden 5, GRU bidirectional","hidden 50, GRU bidirectional", "hidden 100, GRU bidirectional", "hidden 500, GRU bidirectional"], "last":acc_last, "mean":acc_mean, "max":acc_max})
df

Unnamed: 0,architecture,last,mean,max
0,"hidden 5, GRU",0.518644,0.6,0.522034
1,"hidden 50, GRU",0.677966,0.677966,0.725424
2,"hidden 100, GRU",0.677966,0.644068,0.705085
3,"hidden 500, GRU",0.657627,0.661017,0.688136
4,"hidden 5, GRU bidirectional",0.522034,0.60339,0.640678
5,"hidden 50, GRU bidirectional",0.579661,0.661017,0.654237
6,"hidden 100, GRU bidirectional",0.589831,0.623729,0.644068
7,"hidden 500, GRU bidirectional",0.586441,0.647458,0.684746


In [29]:
print(df.to_latex(index=False)) 

\begin{tabular}{lrrr}
\toprule
                  architecture &      last &      mean &       max \\
\midrule
                 hidden 5, GRU &  0.518644 &  0.600000 &  0.522034 \\
                hidden 50, GRU &  0.677966 &  0.677966 &  0.725424 \\
               hidden 100, GRU &  0.677966 &  0.644068 &  0.705085 \\
               hidden 500, GRU &  0.657627 &  0.661017 &  0.688136 \\
   hidden 5, GRU bidirectional &  0.522034 &  0.603390 &  0.640678 \\
  hidden 50, GRU bidirectional &  0.579661 &  0.661017 &  0.654237 \\
 hidden 100, GRU bidirectional &  0.589831 &  0.623729 &  0.644068 \\
 hidden 500, GRU bidirectional &  0.586441 &  0.647458 &  0.684746 \\
\bottomrule
\end{tabular}





The classification task here is a multiclass classification task with 12 classes as shown below.


In [21]:
cur_metadata["Protein families"].value_counts()


Cytochrome c family                                                  461
Cytochrome c family, PsbV subfamily                                  101
Class-A beta-lactamase family                                         98
LysR transcriptional regulatory family                                97
Class-C beta-lactamase family                                         94
Metallo-beta-lactamase superfamily, Glyoxalase II family              92
Cytochrome c family, PetJ subfamily                                   92
Metallo-beta-lactamase superfamily, Class-B beta-lactamase family     91
Hcp beta-lactamase family                                             90
Class-D beta-lactamase family                                         88
Peptidase S12 family, YfeW subfamily                                  87
Binding-protein-dependent transport system permease family            82
Name: Protein families, dtype: int64

As we are considering a multiclass problem a naive classifier would be expected to alway guess on the largest class, given us a base accuary of:

In [22]:
461/1473


0.31296673455532925