### Imports and Setup

In [95]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, RBF,WhiteKernel
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

from modAL.models import BayesianOptimizer, ActiveLearner, CommitteeRegressor
from modAL.acquisition import max_EI
from modAL.disagreement import max_std_sampling

# import seqlogo

import copy

### Fix the seed of both global RNGs (NumPy and the stdlib) for reproducibility
seed = 5
np.random.seed(seed)
random.seed(seed)

# 1.1. Data Prep

### Load Data

In [96]:
# Read the comma-separated dataset (first row is the header), then show its size and a preview
data = pd.read_csv('data/hw3_data.csv', sep=',', header=0)
print(data.shape)
data.head()

(9051, 4)


Unnamed: 0,id,allele,seq,pIC50
0,seq0,HLA-A*02:01,AIIDYIAYM,9.0
1,seq1,HLA-A*02:01,AIYDTMQYV,9.0
2,seq2,HLA-A*02:01,ALATFTVNI,9.0
3,seq3,HLA-A*02:01,ALDEGLLPV,9.0
4,seq4,HLA-A*02:01,ALFPIIWAL,9.0


### Encode Data
I want to keep some notion of the order of the amino acids. The 3D structure they form is vital for predicting binding quality. One possible way is to group letters in order. How do I decide how many letters constitute a group?

Ex: 3-mers
```
AIIDYIAYM
AII
 IID
  IDY
    ...
      AYM
```

Using 3-mers results in N < J, where N is the number of samples and J is the number of features. If we want to use multivariate regression, we'll need to use ridge regression or some other regularized (e.g. sparse) form.

In [97]:
pd.unique(data['seq'].str.len())  # distinct sequence lengths; a single value means all peptides share one length

array([9])

In [98]:
# one new column per residue position: seq0 ... seq8 hold the letter found at that index
for pos in range(9):
    data[f'seq{pos}'] = [peptide[pos] for peptide in data['seq']]

In [99]:
# first row only, to check the new per-position columns
data.iloc[:1]

Unnamed: 0,id,allele,seq,pIC50,seq0,seq1,seq2,seq3,seq4,seq5,seq6,seq7,seq8
0,seq0,HLA-A*02:01,AIIDYIAYM,9.0,A,I,I,D,Y,I,A,Y,M


In [100]:
# how many distinct letters occur at position 0 (missing values, if any, count as one)
data['seq0'].nunique(dropna=False)

20

In [101]:
# sliding window of width k over each 9-letter sequence: triplet0 ... triplet{count-1}
k = 3
count = 9 - k + 1
for offset in range(count):
    # the window starts at `offset` and spans k letters
    data[f'triplet{offset}'] = [peptide[offset:offset + k] for peptide in data['seq']]

In [102]:
# first row only, to check the new triplet columns
data.head(n=1)

Unnamed: 0,id,allele,seq,pIC50,seq0,seq1,seq2,seq3,seq4,seq5,seq6,seq7,seq8,triplet0,triplet1,triplet2,triplet3,triplet4,triplet5,triplet6
0,seq0,HLA-A*02:01,AIIDYIAYM,9.0,A,I,I,D,Y,I,A,Y,M,AII,IID,IDY,DYI,YIA,IAY,AYM


In [103]:
# features: everything except the target and the identifier / raw-sequence columns
y = data['pIC50']
X_df = data.drop(columns=['pIC50', 'id', 'allele', 'seq'])
print(X_df.shape)
print(y.shape)

(9051, 16)
(9051,)


In [104]:
# first row of the (still categorical) feature frame
X_df.iloc[:1]

Unnamed: 0,seq0,seq1,seq2,seq3,seq4,seq5,seq6,seq7,seq8,triplet0,triplet1,triplet2,triplet3,triplet4,triplet5,triplet6
0,A,I,I,D,Y,I,A,Y,M,AII,IID,IDY,DYI,YIA,IAY,AYM


In [105]:
# one-hot encode every categorical column; categories unseen at fit time are ignored at transform time
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_df)
X = enc.transform(X_df)
X.shape

(9051, 28195)

### Split Data

In [106]:
# Hold out 33% of the rows for testing. random_state is passed explicitly so that
# re-running this cell on its own reproduces the same partition instead of depending
# on how much of the global NumPy RNG stream has already been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=seed
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6064, 28195)
(6064,)
(2987, 28195)
(2987,)


# 1.1. Active Learning

In [None]:
### Active-learning baseline: grow the labelled set one randomly chosen pool sample at a time.

# The unlabelled pool is tracked by row position rather than by deleting rows:
#  * X_train comes from OneHotEncoder, whose default output is a scipy sparse matrix,
#    which np.delete cannot handle;
#  * y_train is a pandas Series whose index was shuffled by train_test_split, so
#    y_train[i] would be a label lookup, not "the i-th training row".
# Positional indexing into X_train / y_train_values is valid for both, and leaving
# X_train / y_train untouched keeps this cell re-runnable.
y_train_values = np.asarray(y_train)
pool_idx = np.arange(X_train.shape[0])

# ActiveLearner needs an estimator; a random forest accepts sparse input directly.
# The learner is seeded with the first training row, which then leaves the pool.
regressor = ActiveLearner(
    estimator=RandomForestRegressor(random_state=seed),
    X_training=X_train[[0]], y_training=y_train_values[[0]]
)
pool_idx = np.delete(pool_idx, 0)

n_queries = 200
for idx in range(n_queries):
    # Random sampling: pick a position in the remaining pool, teach that sample,
    # and only afterwards remove it from the pool.
    query_idx = np.random.randint(len(pool_idx))
    sample_idx = pool_idx[query_idx]
    regressor.teach(X_train[[sample_idx]], y_train_values[[sample_idx]])
    pool_idx = np.delete(pool_idx, query_idx)

# Score on the held-out test set and on the training rows that were never labelled.
X_pool, y_pool = X_train[pool_idx], y_train_values[pool_idx]
y_pred_final = regressor.predict(X_test)
y_train_pred = regressor.predict(X_pool)
r2 = r2_score(y_test, y_pred_final)
r2_train = r2_score(y_pool, y_train_pred)
print("R2", r2, r2_train)