## Predicting height of DNA strands

In [1]:
from sklearn import linear_model
import numpy as np

In [2]:
# Load the training and test feature arrays from .npy files.
# NOTE(review): the on-disk shape of these arrays is not visible here; they
# are reshaped to 2-D further down.
X = np.load("./data/subset_cm_train.npy")
X_test = np.load("./data/subset_cm_test.npy")

# Regression targets for the training rows (the heights being predicted).
y = np.load("./data/train_heights.npy")

In [3]:
# Lay the feature arrays out as 2-D matrices with 9894 columns per row:
# 784 rows for the training data and 137 rows for the test data.
# NOTE(review): the row/column counts are hard-coded and must match the
# files loaded above.
X = np.reshape(X, (784, 9894))
X_test = np.reshape(X_test, (137, 9894))

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 33% of the labelled data for validation. A fixed random_state
# keeps the split identical across re-runs, so the validation scores
# reported for the different models can be reproduced and compared.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=0
)

In [6]:
# Replace NaN values with an arbitrary sentinel in every array that is later
# handed to a model. The shuffled split arrays are copies of X, so cleaning
# only X_train / X_val would leave NaNs in X (used for the final fit) and in
# X_test (used for the final predictions).
for data in (X, X_test, X_train, X_val):
    data[np.isnan(data)] = -100

In [7]:
from sklearn.metrics import r2_score, mean_squared_error

def model_train_on_val(model):
    """Fit ``model`` on the training split and print validation metrics.

    Uses the module-level ``X_train``/``y_train`` arrays for fitting and
    ``X_val``/``y_val`` for scoring. The model is fitted in place and
    nothing is returned; the R2 score and mean squared error are printed.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    r2 = r2_score(y_val, val_predictions)
    print("R2 score on validation set is: " + str(r2))

    mse = mean_squared_error(y_val, val_predictions)
    print("Mean squared error on validation set is: " + str(mse))

In [8]:
def model_predict_test(model):
    """Refit ``model`` on all labelled data and predict the test set.

    Fits on the module-level ``X`` and ``y`` (the full data, not just the
    training split) and returns the predictions for the module-level
    ``X_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    model.fit(X, y)
    test_predictions = model.predict(X_test)
    return test_predictions

In [9]:
def run_model(model, model_name):
    """Evaluate ``model`` on the validation split, then store test predictions.

    First prints validation metrics via ``model_train_on_val``; then refits
    on the full data via ``model_predict_test`` and records the resulting
    test-set predictions in the module-level ``predicted`` dict.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Key under which the predictions are stored.
    """
    model_train_on_val(model)
    test_predictions = model_predict_test(model)
    predicted[model_name] = test_predictions

In [10]:
# Test-set predictions collected by run_model, keyed by model name.
predicted = dict()

In [31]:
from sklearn import linear_model
from sklearn import kernel_ridge
from sklearn import svm
from sklearn import neighbors
from sklearn import gaussian_process
from sklearn import tree
from sklearn import ensemble
from sklearn import neural_network

### Linear Regression

In [11]:
# LinearRegression with default settings: print validation metrics and store
# its test predictions under "Linear Regression".
regr = linear_model.LinearRegression()

run_model(model=regr, model_name="Linear Regression")

R2 score on validation set is: -0.13449834484920542
Mean squared error on validation set is: 125.74800812353483


### Ridge Regression

In [12]:
# Ridge with alpha=0.5: print validation metrics and store its test
# predictions under "Ridge Regression".
regr = linear_model.Ridge(alpha=0.5)

run_model(model=regr, model_name="Ridge Regression")

R2 score on validation set is: -0.1360245750661715
Mean squared error on validation set is: 125.91717576542058


### Lasso Regression

In [13]:
# Lasso with alpha=0.1: print validation metrics and store its test
# predictions under "Lasso Regression".
regr = linear_model.Lasso(alpha=0.1)

run_model(model=regr, model_name="Lasso Regression")

R2 score on validation set is: -0.363725704962224
Mean squared error on validation set is: 151.1556114686588


### Lasso LARS Regression

In [14]:
# LassoLars with alpha=0.1: print validation metrics and store its test
# predictions under "Lasso LARS Regression".
regr = linear_model.LassoLars(alpha=0.1)

run_model(model=regr, model_name="Lasso LARS Regression")

R2 score on validation set is: -0.0002895021694933053
Mean squared error on validation set is: 110.87227497137953


### Bayesian Ridge Regression

In [15]:
# BayesianRidge with default settings: print validation metrics and store its
# test predictions under "Bayesian Ridge Regression".
regr = linear_model.BayesianRidge()

run_model(model=regr, model_name="Bayesian Ridge Regression")

R2 score on validation set is: 0.00554158580949915
Mean squared error on validation set is: 110.22595609230802


### Kernel Ridge Regression

In [17]:
# KernelRidge with alpha=1.0 (other settings left at their defaults): print
# validation metrics and store its test predictions under
# "Kernel Ridge Regression".
regr = kernel_ridge.KernelRidge(alpha=1.0)

run_model(model=regr, model_name="Kernel Ridge Regression")

R2 score on validation set is: -120.80850732683373
Mean squared error on validation set is: 13501.277669017894


### SVM Regression

In [19]:
# SVR with default settings: print validation metrics and store its test
# predictions under "SVM Regression".
regr = svm.SVR()

run_model(model=regr, model_name="SVM Regression")

R2 score on validation set is: 0.0033848071773419086
Mean squared error on validation set is: 110.46501383812891


### K Neighbors Regression

In [21]:
# KNeighborsRegressor using 5 neighbours: print validation metrics and store
# its test predictions under "K Neighbors Regression".
regr = neighbors.KNeighborsRegressor(n_neighbors=5)

run_model(model=regr, model_name="K Neighbors Regression")

R2 score on validation set is: -0.13366699994078002
Mean squared error on validation set is: 125.65586169884169


### Gaussian Process Regression

In [25]:
# GaussianProcessRegressor with default settings: print validation metrics
# and store its test predictions under "Gaussian Process Regression".
regr = gaussian_process.GaussianProcessRegressor()

run_model(model=regr, model_name="Gaussian Process Regression")

R2 score on validation set is: -271.0614972465226
Mean squared error on validation set is: 30155.347093436292


### Decision Tree Regression

In [27]:
# DecisionTreeRegressor with default settings: print validation metrics and
# store its test predictions under "Decision Tree Regression".
regr = tree.DecisionTreeRegressor()

run_model(model=regr, model_name="Decision Tree Regression")

R2 score on validation set is: -0.6670448194562582
Mean squared error on validation set is: 184.77555868725867


### Boosting Regression

In [30]:
# GradientBoostingRegressor with 300 estimators and learning rate 0.1: print
# validation metrics and store its test predictions under
# "Boosting Regression".
regr = ensemble.GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.1,
)

run_model(model=regr, model_name="Boosting Regression")

R2 score on validation set is: -0.08376876273072886
Mean squared error on validation set is: 120.125131780612


### MLP Regression

In [32]:
# MLPRegressor with default settings: print validation metrics and store its
# test predictions under "MLP Regression".
regr = neural_network.MLPRegressor()

run_model(model=regr, model_name="MLP Regression")



R2 score on validation set is: -20.5692938004248
Mean squared error on validation set is: 2390.7445474459796
