### Imports and Setup

In [108]:
import numpy as np
import random
import pandas as pd
import csv
import datetime
import matplotlib.pyplot as plt
import copy

# numpy
from numpy.linalg import LinAlgError

# sklearn
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, RBF, WhiteKernel,ExpSineSquared,DotProduct,RationalQuadratic
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# modAL
from modAL.disagreement import max_std_sampling
from modAL.models import ActiveLearner, CommitteeRegressor

# scipy
from scipy import sparse

# from sklearn import preprocessing
# from sklearn.utils import shuffle
# from modAL.models import BayesianOptimizer, 
# from modAL.acquisition import max_EI


### Set random seed
# A single seed is applied to Python's `random` module and to NumPy's global
# RNG; the same value is later handed to estimators as `random_state`.
seed = 5
random.seed(seed)
np.random.seed(seed)

### Suppress warnings
# NOTE(review): 'ignore' silences every warning category for the whole
# notebook, which may also hide estimator warnings — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# 1. Data Prep

### 1.1. Load

In [109]:
# Read the raw dataset and report its (rows, columns) shape.
data = pd.read_csv(
    'data/hw3_data.csv',
    sep=',',     # `sep` is the primary name of the `delimiter` option
    header=0,    # column names come from the first row of the file
)
print(data.shape)

(9051, 4)


### 1.2. Encode

In [110]:
# Spread each sequence over one column per position: seq0 ... seq8.
# All sequences are expected to be 9 characters long.
for i in range(9):
    colname = 'seq{}'.format(i)
    data[colname] = [peptide[i] for peptide in data['seq']]

# Target is pIC50; features are what remains once the target, the identifier
# columns and the raw sequence string are dropped.
y = data['pIC50']
X_df = data.drop(columns=['pIC50', 'id', 'allele', 'seq'])

# One-hot encode the per-position columns; categories unseen at fit time are
# ignored at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
X_enc = enc.fit_transform(X_df)

# TODO: decide whether the target should be standardized

# Densify the sparse encoding and convert the target to a NumPy array.
X_pool = X_enc.toarray()
y_pool = y.to_numpy()


### 1.3. Split Data

In [111]:
# Hold out 33% of the samples for testing.
# `random_state` is passed explicitly so the split does not depend on how much
# of the global NumPy random stream was consumed before this cell ran;
# re-running the cell therefore reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X_pool, y_pool, test_size=0.33, random_state=seed
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(6064, 180)
(6064,)
(2987, 180)
(2987,)


# 2. Experiments with offline learners and kernels

### 2.1 Random Forest Regressor
Doesn't meet 0.6 threshold.

In [112]:
# %%time

# # train random forest regressor on the training split
# rf = RandomForestRegressor(n_estimators = 20, 
#                             max_depth = 6, 
#                             random_state = seed)
# rf.fit(X_train, y_train)

# # calculate accuracy
# print(rf.score(X_test,y_test))  # uses R^2

### 2.2 Ridge Regression
Barely meets the 0.6 threshold (score of about 0.632).

In [113]:
# %%time

# clf = Ridge(alpha=1.0)
# clf.fit(X_train, y_train)

# print(clf.score(X_test,y_test))  #0.632

### 2.3 Ridge Regression with variable alpha
Did not dramatically improve score. (0.63 -> 0.65 maybe)

In [114]:
# # grid search had no significant improvement 
# for i in np.linspace(0.1,5,50):
#     clf = Ridge(alpha=i)
#     clf.fit(X_train, y_train)

#     print(np.round(i,2),np.round(clf.score(X_test,y_test),3))

### 2.4 RBF Kernel
Good score.

In [115]:
# %%time
# #2 min

# # RBF-only baseline, used to check whether adding WhiteKernel (section 2.5) helps or not
# kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) #0.6885

### 2.5 RBF + WhiteKernel
WhiteKernel doesn't seem to improve score. Good score.

In [116]:
# %%time
# # 15 minutes for (5931,180)

# check if WhiteKernel helps or not
# kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
#          + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) #0.686 score

### 2.6 RBF no length_scale_bounds
Seems to increase runtime slightly. No effect on results. Good score.

In [117]:
# %%time
# # 2 min

# # checking if bounds is helping or not
# kernel = RBF(length_scale=1.0)

# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test))  #0.6885

### 2.7 RationalQuadratic kernel
Good score

In [118]:
# %%time
# # 3.5 min

# kernel = RationalQuadratic(length_scale=1.0, alpha=1.5, length_scale_bounds=(1e-2, 1e3))
# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) #0.6888

### 2.8 DotProduct + WhiteKernel
Barely passable score.

In [119]:
# %%time
# # 2 min

# kernel = DotProduct() + WhiteKernel()
# gpr = GaussianProcessRegressor(kernel,random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) # 0.632

### 2.9 Default Kernel
Bad.

In [120]:
# %%time
# # 15 s

# gpr = GaussianProcessRegressor(random_state=seed)
# gpr.fit(X_train, y_train)
# print(gpr.score(X_test,y_test)) # -0.806

### 2.10 RBF + ExpSineSquared
The idea is that sequences are "periodic" data, and sequences repeat in groups. If we combine an RBF kernel with a periodic kernel, we might model the data better. The model would not run due to a LinAlgError:

```
LinAlgError: ("The kernel, RBF(length_scale=1) + ExpSineSquared(length_scale=1, periodicity=1), is not returning a positive definite matrix. Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.", '10-th leading minor of the array is not positive definite')
```

Attempted to modify alpha to fix issue, to no avail.

In [121]:
# %%time

# kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + ExpSineSquared(length_scale=1, periodicity=1)

# for alpha in [1E-9,1E-8,1E-7,1E-6,1E-5,1E-4,1E-3,1E-2,1E-1,1E-0]:
        
# #     try:
# #         gpr = GaussianProcessRegressor(kernel,random_state=seed, alpha=alpha)
# #         gpr.fit(X_train, y_train)
# #         print(alpha, gpr.score(X_test,y_test))  
# #     except LinAlgError:
# #         print(alpha, "Error")
# #         pass


# 3. Active Learning model - Grid Search v1
Experiments that test various configurations in order to meet the R2 >= 0.6 threshold.

All experiments in this section use the following configurations:
- Category: Committee
- Learner: Gaussian Process
- query_strategy: max_std_sampling

### Useful Functions

In [122]:
#?? Cookbook suggests "put a product of SE kernels on those dimensions" (I have 180 dimensions)

In [123]:
def get_next_sample(learner, X, y):
    """Ask ``learner`` for its next query and look up the matching label.

    Parameters
    ----------
    learner : object whose ``query(X)`` returns an ``(index, sample)`` pair.
    X : pool of candidate samples, passed unchanged to ``learner.query``.
    y : labels, indexed with the index returned by the query.

    Returns
    -------
    tuple
        ``(sample, label, index)``: the sample reshaped to a single row
        ``(1, -1)``, the label reshaped to a one-element 1-D array ``(1,)``,
        and the index exactly as the learner returned it.
    """
    # the learner's own query strategy decides which instance comes next
    idx, sample = learner.query(X)

    # one row of d features
    sample_row = sample.reshape(1, -1)

    # a 1-D array holding the single label
    label_1d = y[idx].reshape(1,)

    return sample_row, label_1d, idx

In [124]:
def run_active_learner_regression(learner, X_pool, y_pool, X_test, y_test, n_queries):
    """Run ``n_queries`` rounds of query -> teach -> shrink-pool on ``learner``.

    Every round obtains the next instance through ``get_next_sample``, passes
    it to ``learner.teach`` and then drops it from the pool. ``np.delete``
    returns new arrays, so only the local names are rebound; the arrays the
    caller passed in are not altered by this function.

    ``X_test`` and ``y_test`` are accepted but not used here. Nothing is
    returned; the only visible effect is the sequence of ``learner.teach``
    calls.
    """
    remaining_X, remaining_y = X_pool, y_pool

    for _ in range(n_queries):
        # pick the next instance and fetch its label
        new_X, new_y, picked = get_next_sample(learner, remaining_X, remaining_y)

        # update the model with the newly labelled instance
        learner.teach(new_X, new_y)

        # the instance is labelled now, so take it out of the pool
        remaining_X = np.delete(remaining_X, picked, axis=0)
        remaining_y = np.delete(remaining_y, picked)


In [125]:
def write_results_to_file(filename,fields=None,rows=None):
    """Append a header row and/or data rows to a CSV file.

    Parameters
    ----------
    filename : path of the CSV file; opened in append mode, so it is created
        when missing and extended otherwise.
    fields : optional sequence written as a single row (typically the header).
    rows : optional iterable of rows, each written as one CSV record.

    Falsy ``fields`` / ``rows`` (``None`` or empty) are skipped, so calling the
    function with neither leaves the file content unchanged.
    """
    # newline='' is what the csv module requires for files handed to
    # csv.writer: the writer emits its own '\r\n' terminator, and without this
    # flag platforms that translate '\n' would produce '\r\r\n' (blank rows).
    with open(filename, 'a', newline='') as f:

        writer = csv.writer(f)

        if fields:
            writer.writerow(fields)

        if rows:
            writer.writerows(rows)

In [126]:
def gaussian_process_regressor_gs(kernels, n_learners, n_initials, X_pool, y_pool, X_test, y_test, n_queries,
                                  seed, filename, fields, row_prefix, details_func):
    """Grid-search Gaussian-process committees and log one result row per configuration.

    For every combination of kernel, committee size and initial-set size, a
    ``CommitteeRegressor`` of ``ActiveLearner`` objects is built, driven for
    ``n_queries`` rounds by ``run_active_learner_regression``, scored with
    ``r2_score`` on the test data, and the result is appended to ``filename``
    and printed.

    Parameters
    ----------
    kernels : iterable of kernels, each passed to ``GaussianProcessRegressor``.
    n_learners : iterable of int, committee sizes to try.
    n_initials : iterable of int, number of initial training samples drawn
        (without replacement) for each learner.
    X_pool, y_pool : samples and labels to learn from; they must support
        indexing with an integer index array. They are copied before use.
    X_test, y_test : held-out data used for the R^2 score.
    n_queries : int, number of active-learning rounds per configuration.
    seed : passed as ``random_state`` to every ``GaussianProcessRegressor``.
        The initial sets themselves are drawn with ``np.random.choice``, i.e.
        from the global NumPy RNG, not from this value.
    filename : CSV file that results are appended to.
    fields : header row; appended once at the start of every call.
    row_prefix : list of leading columns shared by every result row.
    details_func : callable mapping a kernel to a list of kernel-specific
        columns placed between ``row_prefix`` and the run metadata.
    """
    # append fields as first row to file
    write_results_to_file(filename, fields)

    # perform grid search
    for kernel in kernels:
        for n_learner in n_learners:
            for n_initial in n_initials:

                # make a copy of the data for use in this test
                X_pool_gs = copy.deepcopy(X_pool)
                y_pool_gs = copy.deepcopy(y_pool)

                # draw an independent initial training set for each learner
                initial_idx = [
                    np.random.choice(len(X_pool_gs), size=n_initial, replace=False)
                    for _ in range(n_learner)
                ]

                # initialize learners for Committee
                learner_list = [
                    ActiveLearner(
                        estimator=GaussianProcessRegressor(kernel,random_state=seed),
                        X_training=X_pool_gs[idx],
                        y_training=y_pool_gs[idx]
                    ) for idx in initial_idx]

                # create Committee
                committee = CommitteeRegressor(
                                learner_list=learner_list,
                                query_strategy=max_std_sampling
                            )

                # The initial samples are already labelled, so they must leave
                # the query pool. Otherwise the committee can re-query them,
                # which spends a query on a known label and teaches a learner
                # an exact duplicate of one of its training rows.
                if initial_idx:
                    labelled_idx = np.unique(np.concatenate(initial_idx))
                    X_pool_gs = np.delete(X_pool_gs, labelled_idx, axis=0)
                    y_pool_gs = np.delete(y_pool_gs, labelled_idx)

                # perform active learning
                run_active_learner_regression(committee, X_pool_gs, y_pool_gs, X_test, y_test, n_queries)

                # score model
                y_pred = committee.predict(X_test, return_std=False)
                r2 = r2_score(y_test, y_pred)

                # create row for file
                kernel_details = details_func(kernel)
                meta = [n_learner, n_initial, n_queries, r2]
                row = row_prefix + kernel_details + meta

                # append to file
                write_results_to_file(filename, rows=[row])

                # output to console for tracking progress
                print('{}|{}|{}|{}|{}'.format(kernel, n_learner, n_initial,n_queries, r2))


In [127]:
def matern_details(kernel):
    """Return ``[kernel, length_scale, nu]`` for a results row, read from ``kernel``'s attributes."""
    length_scale = kernel.length_scale
    nu = kernel.nu
    return [kernel, length_scale, nu]

In [128]:
def rbf_details(kernel):
    """Return ``[kernel, length_scale, length_scale_bounds]`` for a results row, read from ``kernel``'s attributes."""
    length_scale = kernel.length_scale
    bounds = kernel.length_scale_bounds
    return [kernel, length_scale, bounds]

In [129]:
def rationalquadratic_details(kernel):
    """Return ``[kernel, length_scale, alpha]`` for a results row, read from ``kernel``'s attributes."""
    length_scale = kernel.length_scale
    alpha = kernel.alpha
    return [kernel, length_scale, alpha]

In [130]:
def rbf_plus_white(kernel):
    """Return ``[kernel, k1.length_scale, k2.noise_level]`` for a results row.

    ``kernel`` is expected to expose its two parts as ``k1`` and ``k2``.
    """
    rbf_length_scale = kernel.k1.length_scale
    white_noise_level = kernel.k2.noise_level
    return [kernel, rbf_length_scale, white_noise_level]

### 3.1. Matern
- 26 minutes with `2*2*10` Matern configurations at 100 queries - 2,5 learners
- 3 hours 4 minutes with `2*2*10` Matern configurations at 100 queries - 10,20 learners


In [131]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'nu', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [Matern(length_scale=i, nu=1.5) for i in np.linspace(0.5,1,2)]

# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, matern_details)


### 3.2. RBF

In [132]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) for i in np.linspace(0.38,0.418,2)]

# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.3. RationalQuadratic

In [133]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'alpha', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RationalQuadratic(length_scale=i, alpha=j, length_scale_bounds=(1e-2, 1e3)) 
#            for i in np.linspace(0.5,10,2) 
#            for j in np.linspace(0.5,2,2)]

# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rationalquadratic_details)

### 3.4. RBF + WhiteKernel

In [134]:
# %%time

# # results file
# filename = 'data/gridsearch.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','rbf_length_scale', 'white_noise_scale', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=0.45,length_scale_bounds=(1e-2, 1e3)) \
#            + WhiteKernel(noise_level=i, noise_level_bounds=(1e-10, 1e+1)) 
#            for i in np.random.uniform(low=0.1,high=1.0,size=2)]


# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_plus_white)

### 3.5. RBF explore more length_scale values

In [135]:
# %%time

# # results file
# filename = 'data/gridsearcha.' + datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S') + '.csv'
# row_prefix = ['Committee','Gaussian Process','max_std_sampling']
# fields = ['category','learner','query_strategy','kernel','length_scale', 'length_scale_bounds', 'n_learners', 'n_initial','n_queries','r2']

# # configs
# kernels = [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) 
#            for i in np.random.uniform(low=0.38,high=1.0,size=2)] \
#         + [RBF(length_scale=i,length_scale_bounds=(1e-2, 1e3)) 
#            for i in np.random.uniform(low=1,high=10,size=2)]



# n_learners = [2]
# n_initials = [10]
# n_queries = 1

# # run process
# gaussian_process_regressor_gs(kernels,n_learners,n_initials,X_train,y_train,X_test,y_test,n_queries,
#                                         seed, filename, fields, row_prefix, rbf_details)


### 3.6.