## Predicting height of DNA strands

In [1]:
from sklearn import linear_model
import numpy as np

In [34]:
X = np.load("./data/subset_cm_train.npy")
X_test = np.load("./data/subset_cm_test.npy")

y = np.load("./data/train_heights.npy")
y_test = np.load("./data/test_heights.npy")

In [35]:
X.shape
X_test.shape


(9894, 137)

In [36]:
X = X.T
X_test = X_test.T
y = y.T
y_test = y_test.T

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

X_train = X
y_train = y
X_val = X_test
y_val = y_test

In [39]:
#Replace nan values in the training and testing set with an arbitrary number
inds = np.where(np.isnan(X_train))
X_train[inds] = -100
inds = np.where(np.isnan(X_val))
X_val[inds] = -100

In [40]:
from sklearn.metrics import r2_score, mean_squared_error

def model_train_on_val(model):
    
    model.fit(X_train, y_train)
    
    
    y_pred = model.predict(X_val)
    y = y_val
    
    print("R2 score on validation set is: " + str(r2_score(y, y_pred))) 
    print("Mean squared error on validation set is: " + str(mean_squared_error(y, y_pred)))

In [41]:
def model_predict_test(model):
    model.fit(X, y)
    return model.predict(X_test)

In [42]:
def run_model(model, model_name):
    model_train_on_val(model)
    predicted[model_name] = model_predict_test(model)

In [43]:
predicted = {}

In [44]:
from sklearn import linear_model
from sklearn import kernel_ridge
from sklearn import svm
from sklearn import neighbors
from sklearn import gaussian_process
from sklearn import tree
from sklearn import ensemble
from sklearn import neural_network

### Linear Regression

In [45]:
regr = linear_model.LinearRegression()

run_model(regr, "Linear Regression")

R2 score on validation set is: 0.30395747812106566
Mean squared error on validation set is: 70.75619042007598


### Ridge Regression

In [46]:
regr = linear_model.Ridge(alpha=.5)

run_model(regr, "Ridge Regression")

R2 score on validation set is: 0.319049861181144
Mean squared error on validation set is: 69.22197448337009


### Lasso Regression

In [47]:
regr = linear_model.Lasso(alpha=0.1)

run_model(regr, "Lasso Regression")

R2 score on validation set is: 0.29045022435650014
Mean squared error on validation set is: 72.12927006589699


### Lasso LARS Regression

In [48]:
regr = linear_model.LassoLars(alpha=0.1)

run_model(regr, "Lasso LARS Regression")



R2 score on validation set is: 0.33020257381653617
Mean squared error on validation set is: 68.08824567496339


### Bayesian Ridge Regression

In [49]:
regr = linear_model.BayesianRidge()

run_model(regr, "Bayesian Ridge Regression")

R2 score on validation set is: 0.4439092087255052
Mean squared error on validation set is: 56.52939968675169


### Kernel Ridge Regression

In [50]:
regr = kernel_ridge.KernelRidge(alpha=1.0)

run_model(regr, "Kernel Ridge Regression")

R2 score on validation set is: 0.29746495870490186
Mean squared error on validation set is: 71.41618736807284


### SVM Regression

In [51]:
regr = svm.SVR()

run_model(regr, "SVM Regression")

R2 score on validation set is: 0.20160221190662475
Mean squared error on validation set is: 81.1611132216548


### K Neighbors Regression

In [52]:
regr = neighbors.KNeighborsRegressor(n_neighbors=5)

run_model(regr, "K Neighbors Regression")

R2 score on validation set is: 0.318736025919089
Mean squared error on validation set is: 69.25387740145985


### Gaussian Process Regression

In [53]:
regr = gaussian_process.GaussianProcessRegressor()

run_model(regr, "Gaussian Process Regression")

R2 score on validation set is: -296.5132983496587
Mean squared error on validation set is: 30243.7091540146


### Decision Tree Regression

In [54]:
regr = tree.DecisionTreeRegressor()

run_model(regr, "Decision Tree Regression")

R2 score on validation set is: -0.45153639879369667
Mean squared error on validation set is: 147.55590729927002


### Boosting Regression

In [55]:
regr = ensemble.GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)

run_model(regr, "Boosting Regression")

R2 score on validation set is: 0.35850943687983117
Mean squared error on validation set is: 65.21071200403934


### MLP Regression

In [56]:
regr = neural_network.MLPRegressor()

run_model(regr, "MLP Regression")

R2 score on validation set is: 0.33807492852916676
Mean squared error on validation set is: 67.28798159397358
