### Imports and Setup

In [2]:
import numpy as np
import random
import pandas as pd
import datetime
import matplotlib.pyplot as plt

# sklearn
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, RBF, WhiteKernel,ExpSineSquared,DotProduct,RationalQuadratic
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# scipy
from scipy import sparse

# custom packages
from packages.gridsearch import gridsearch as gs



### Set random seed
seed = 5
# Seed the stdlib and the NumPy global generators with the same value.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(seed)

# Silence all warnings for the remainder of the session.
import warnings
warnings.filterwarnings('ignore')

# 1. Data Prep

### 1.1. Load

In [3]:
# Read the comma-separated dataset (first row holds the column names)
# and report its (rows, columns) shape.
data = pd.read_csv(
    'data/hw3_data.csv',
    sep=',',
    header=0,
)
print(data.shape)

(9051, 4)


### 1.2. Encode

In [4]:
# Number of positions every sequence is expected to have.
seq_len = 9

# Fail early with a readable message if the fixed-length assumption is broken,
# instead of hitting an IndexError (too short) or silently dropping the tail
# of a sequence (too long).
assert data['seq'].str.len().eq(seq_len).all(), \
    "every entry in data['seq'] must have length %d" % seq_len

# create separate columns for each amino acid (seq0 ... seq8)
for i in range(seq_len):
    colname = 'seq' + str(i)
    # vectorised character lookup; same values as [x[i] for x in data['seq']]
    data[colname] = data['seq'].str[i]

# separate features and target, remove unnecessary columns
X_df = data.drop(['pIC50', 'id', 'allele', 'seq'], axis=1)
y = data['pIC50']

# encode features: one indicator column per (position, symbol) pair
enc = OneHotEncoder(handle_unknown='ignore')
X_enc = enc.fit_transform(X_df)

# TODO: decide whether the target should be standardized

# convert the sparse encoding to a dense numpy array
# TODO: confirm whether the downstream learners actually need a dense array
X_pool = X_enc.toarray()
y_pool = y.to_numpy()


### 1.3. Split

In [5]:
# Hold out 33% of the pool as the test set.
# random_state is passed explicitly so the split no longer depends on the
# state of the global NumPy generator: re-running this cell, or running
# another cell that draws random numbers first, now yields the same split.
# NOTE(review): on a fresh top-to-bottom run this should reproduce the
# previous split, since the global generator is seeded with the same value
# beforehand - confirm against earlier results.
X_train, X_test, y_train, y_test = train_test_split(
    X_pool, y_pool, test_size=0.33, random_state=seed
)

# Report the shapes in the order: X_train, y_train, X_test, y_test.
for _part in (X_train, y_train, X_test, y_test):
    print(_part.shape)


(6064, 180)
(6064,)
(2987, 180)
(2987,)


# 2. Experiments with offline learners and kernels
The goal is to see whether any particular regressor or kernel works better on this data than the others.

### 2.1 Random Forest Regressor
Does not meet the $R^2 \ge 0.6$ threshold (test score of about 0.476).

In [6]:
# %%time

# # train a random forest regressor on the training split
# rf = RandomForestRegressor(n_estimators = 20, 
#                             max_depth = 6, 
#                             random_state = seed)
# rf.fit(X_train, y_train)

# # R^2 on the test split (a regressor's .score returns R^2, not accuracy)
# print(rf.score(X_test,y_test)) #0.476

### 2.2 Ridge Regression
Barely meets the $R^2 \ge 0.6$ threshold (test score of about 0.632).

In [7]:
# %%time

# clf = Ridge(alpha=1.0)
# clf.fit(X_train, y_train)

# print(clf.score(X_test,y_test))  #0.632

### 2.3 Ridge Regression with variable alpha
Did not dramatically improve score. (0.63 -> 0.65 maybe)

In [8]:
# # grid search had no significant improvement 
# for i in np.linspace(0.1,5,50):
#     clf = Ridge(alpha=i)
#     clf.fit(X_train, y_train)

#     print(np.round(i,2),np.round(clf.score(X_test,y_test),3))

### 2.4 RBF Kernel
Good score.

In [9]:
# %%time
# #2 min

# # RBF-only baseline, for checking whether WhiteKernel (section 2.5) helps or not
# kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) #0.6885

### 2.5 RBF + WhiteKernel
WhiteKernel doesn't seem to improve score. Good score.

In [10]:
# %%time
# # 15 minutes for (5931,180)

# check if WhiteKernel helps or not
# kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
#          + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) #0.686 score

### 2.6 RBF no length_scale_bounds
Seems to increase runtime slightly. No effect on results. Good score.

In [11]:
# %%time
# # 2 min

# # checking if bounds is helping or not
# kernel = RBF(length_scale=1.0)

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test))  #0.6885

### 2.7 RationalQuadratic kernel
Good score

In [12]:
# %%time
# # 3.5 min

# kernel = RationalQuadratic(length_scale=1.0, alpha=1.5, length_scale_bounds=(1e-2, 1e3))
# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) #0.6888

### 2.8 DotProduct + WhiteKernel
Barely passable score.

In [13]:
# %%time
# # 2 min

# kernel = DotProduct() + WhiteKernel()
# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) # 0.632

### 2.9 Default Kernel
Bad.

In [14]:
# %%time
# # 15 s

# gpr = GaussianProcessRegressor(random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) # -0.806

### 2.10 RBF + ExpSineSquared
The idea is that the sequences are "periodic" data, with sequences repeating in groups, so combining RBF with a periodic kernel might model the data better. The model would not fit, raising a `LinAlgError`:

```
LinAlgError: ("The kernel, RBF(length_scale=1) + ExpSineSquared(length_scale=1, periodicity=1), is not returning a positive definite matrix. Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.", '10-th leading minor of the array is not positive definite')
```

Attempted to modify alpha to fix issue, to no avail.

In [15]:
# %%time

# from numpy.linalg import LinAlgError

# kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + ExpSineSquared(length_scale=1, periodicity=1)

# for alpha in [1E-9,1E-8,1E-7,1E-6,1E-5,1E-4,1E-3,1E-2,1E-1,1E-0]:
        
# #     try:
# #         gpr = GaussianProcessRegressor(kernel,random_state=seed, alpha=alpha)
# #         gpr.fit(X_train, y_train)
# #         print(alpha, gpr.score(X_test,y_test))  
# #     except LinAlgError:
# #         print(alpha, "Error")


### 2.11 Matern
Good model.

In [16]:
# %%time
# # 2 min

# kernel = Matern(length_scale=1.0, nu=1.5)

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) # 0.687

# 3. Active Learning model - Grid Search v1
Experiments to test out various combinations to achieve $R^2 \ge 0.6$ threshold.

All experiments in this section use the following configurations:
- Category: Committee
- Learner: Gaussian Process
- query_strategy: max_std_sampling

In [17]:
# TODO: Cookbook suggests "put a product of SE kernels on those dimensions" (this data has 180 dimensions)

### Functions

In [18]:
def matern_details(kernel):
    """Return ``[kernel, length_scale, nu]`` for the given kernel object."""
    scale = kernel.length_scale
    smoothness = kernel.nu
    return [kernel, scale, smoothness]

In [19]:
def rbf_details(kernel):
    """Return ``[kernel, length_scale, length_scale_bounds]`` for the given kernel object."""
    scale = kernel.length_scale
    bounds = kernel.length_scale_bounds
    return [kernel, scale, bounds]

In [20]:
def rationalquadratic_details(kernel):
    """Return ``[kernel, length_scale, alpha]`` for the given kernel object."""
    scale = kernel.length_scale
    mixture = kernel.alpha
    return [kernel, scale, mixture]

In [21]:
def rbf_plus_white(kernel):
    """Return ``[kernel, k1.length_scale, k2.noise_level]`` for a two-part kernel.

    ``kernel.k1`` and ``kernel.k2`` are the two components of the composite
    kernel; the first supplies the length scale, the second the noise level.
    """
    first, second = kernel.k1, kernel.k2
    return [kernel, first.length_scale, second.noise_level]

### 3.1. Matern
- 26 minutes with `2*2*10` Matern configurations at 100 queries - 2,5 learners
- 3 hours 4 minutes with `2*2*10` Matern configurations at 100 queries - 10,20 learners


In [22]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'nu', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [Matern(length_scale=i, nu=1.5) for i in np.linspace(0.5,1,2)]

# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, matern_details)


### 3.2. RBF

In [23]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(0.38,0.418,2)]

# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.3. RationalQuadratic

In [24]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'alpha', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RationalQuadratic(length_scale=i, alpha=j, length_scale_bounds=(1e-2, 1e3)) 
#            for i in np.linspace(0.5,10,2) 
#            for j in np.linspace(0.5,2,2)]

# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rationalquadratic_details)

### 3.4. RBF + WhiteKernel random search
Stopped halfway through because results weren't interesting. Adding noise doesn't improve score.

In [25]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','rbf_length_scale', 'white_noise_scale', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=0.45,length_scale_bounds=(1e-2, 1e3)) \
#            + WhiteKernel(noise_level=i, noise_level_bounds=(1e-10, 1e+1)) 
#            for i in np.random.uniform(low=0.1,high=1.0,size=20)]

# n_learners = [10]
# n_initials = [80]
# n_queries = 100

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_plus_white)

### 3.5. RBF linear search wide
Search for peaks.

In [26]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i) for i in [1e-10,1e-5,1e-2,1e-1,1,1e1,100]]

# n_learners = [10]
# n_initials = [80]
# n_queries = 100

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.6. RBF linear search narrow
Search near current peaks.

In [27]:
# %%time
# # 1 hour

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i) for i in np.linspace(0.1,1,10)] \
#         + [RBF(length_scale=i) for i in np.linspace(1,10,10)]

# n_learners = [10]
# n_initials = [80]
# n_queries = 100

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.7. RBF linear search narrow 2
Search near current peaks. Skipped fifth run due to time.

In [28]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i) for i in np.linspace(0.4,7,40)] \
#         + [RBF(length_scale=i) for i in np.linspace(0.4,7,40)] \
#         + [RBF(length_scale=i) for i in np.linspace(0.4,7,40)] \
#         + [RBF(length_scale=i) for i in np.linspace(0.4,7,40)] \
#         + [RBF(length_scale=i) for i in np.linspace(0.4,7,40)]

# n_learners = [10]
# n_initials = [80]
# n_queries = 100

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.8. RBF linear search narrow 3
Search near current peaks.

In [29]:
# %%time
# # 3h 4 min

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(0.45,0.55,10)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(0.7,0.8,10)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(2.4,3.0,10)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(3.7,4.0,10)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(4.8,5.2,10)]  

# n_learners = [10]
# n_initials = [80]
# n_queries = 100

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.9. RBF random search narrow
Randomized points near peaks.

In [30]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'

# ex_id = ['3.9']
# row_prefix = ex_id + ['Committee','Gaussian Process','max_std_sampling']
# fields = ['experiment','category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.random.uniform(low=2.4,high=3.0,size=20)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.random.uniform(low=3.8,high=4.0,size=20)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.random.uniform(low=4.8,high=5.0,size=20)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.random.uniform(low=0.49,high=0.51,size=20)]

# n_learners = [10]
# n_initials = [80]
# n_queries = 100

# # run process
# gs.grid_search_1(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


RBF(length_scale=2.76)|10|80|100|0.5635222023115654
RBF(length_scale=2.51)|10|80|100|0.5337165452605679
RBF(length_scale=2.47)|10|80|100|0.5489614420553026
RBF(length_scale=2.94)|10|80|100|0.5283697583584003
RBF(length_scale=2.65)|10|80|100|0.5344300885630899
RBF(length_scale=2.41)|10|80|100|0.5526588906310574
RBF(length_scale=2.99)|10|80|100|0.5503690594715509
RBF(length_scale=2.47)|10|80|100|0.5485406059990521
RBF(length_scale=2.62)|10|80|100|0.5242808318628994
RBF(length_scale=2.41)|10|80|100|0.5605198240938523
RBF(length_scale=2.41)|10|80|100|0.5320978421020564
RBF(length_scale=2.98)|10|80|100|0.5519891745018656
RBF(length_scale=2.81)|10|80|100|0.5440711891091994
RBF(length_scale=2.76)|10|80|100|0.5362017242590282
RBF(length_scale=2.78)|10|80|100|0.5401868832042537
RBF(length_scale=2.62)|10|80|100|0.5445414171042621
RBF(length_scale=2.6)|10|80|100|0.5394760369539029
RBF(length_scale=2.42)|10|80|100|0.5517369385722448
RBF(length_scale=2.74)|10|80|100|0.541609830847074
RBF(length_sca

### 3.10. RBF best points
Checking rounded values near the points with the highest $R^2$.

In [30]:
%%time


# results file: the name carries a timestamp, so each run gets its own path
filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'

# Experiment label stored as the first value of the row prefix.
# This cell belongs to section 3.10; the label was previously '3.9', left over
# from copying the previous section's cell, which would have mislabelled the
# rows of this run.
ex_id = ['3.10']
row_prefix = ex_id + ['Committee','Gaussian Process','max_std_sampling']
# Column names; the first four line up with row_prefix.
fields = ['experiment','category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# configs: one RBF kernel per hand-picked length scale
kernels = [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3))
           for i in [0.500,0.738,1.923,3.785,5.985,2.769,2.760,2.412,4.983]]

n_learners = [10]
n_initials = [80]
n_queries = 100

# run process
# NOTE(review): grid_search_1 lives in packages.gridsearch and is not visible
# here; presumably it writes one row per configuration to `filename` using
# rbf_details to describe each kernel - confirm against the package.
gs.grid_search_1(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
                                        seed, filename, fields, row_prefix, rbf_details)


RBF(length_scale=2.76)|10|80|100|0.5635222023115654
RBF(length_scale=2.51)|10|80|100|0.5337165452605679
RBF(length_scale=2.47)|10|80|100|0.5489614420553026
RBF(length_scale=2.94)|10|80|100|0.5283697583584003
RBF(length_scale=2.65)|10|80|100|0.5344300885630899
RBF(length_scale=2.41)|10|80|100|0.5526588906310574
RBF(length_scale=2.99)|10|80|100|0.5503690594715509
RBF(length_scale=2.47)|10|80|100|0.5485406059990521
RBF(length_scale=2.62)|10|80|100|0.5242808318628994
RBF(length_scale=2.41)|10|80|100|0.5605198240938523
RBF(length_scale=2.41)|10|80|100|0.5320978421020564
RBF(length_scale=2.98)|10|80|100|0.5519891745018656
RBF(length_scale=2.81)|10|80|100|0.5440711891091994
RBF(length_scale=2.76)|10|80|100|0.5362017242590282
RBF(length_scale=2.78)|10|80|100|0.5401868832042537
RBF(length_scale=2.62)|10|80|100|0.5445414171042621
RBF(length_scale=2.6)|10|80|100|0.5394760369539029
RBF(length_scale=2.42)|10|80|100|0.5517369385722448
RBF(length_scale=2.74)|10|80|100|0.541609830847074
RBF(length_sca