In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

# Load the full feature matrix / target and the pre-made train/test splits.
# All files are read from the notebook's working directory.
X, y = pd.read_csv('X.csv'), pd.read_csv('y.csv')
X_train, X_test = pd.read_csv('X_train.csv'), pd.read_csv('X_test.csv')
y_train, y_test = pd.read_csv('y_train.csv'), pd.read_csv('y_test.csv')

Linear regression model

In [None]:
def _print_lr_scores(heading, y_true, y_pred):
    """Print a heading followed by MSE, R^2 and explained variance.

    Each metric is computed from ``y_true`` and ``y_pred`` and printed
    rounded to two decimal places.
    """
    print(heading)
    print('Mean squared error (MSE): %.2f'
          % mean_squared_error(y_true, y_pred))
    print('Coefficient of determination (R^2): %.2f'
          % r2_score(y_true, y_pred))
    print('Explained variance score: %.2f'
          % explained_variance_score(y_true, y_pred))


# Fit ordinary least squares on the training split.
lr_model = linear_model.LinearRegression()
lr_model.fit(X_train, y_train)

# Score the model on the data it was fitted on ...
y_train_pred = lr_model.predict(X_train)
_print_lr_scores("Results on the training data", y_train, y_train_pred)

# ... and then on the held-out testing data.
y_test_pred = lr_model.predict(X_test)
print()
_print_lr_scores("Results on the testing data", y_test, y_test_pred)

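Based on what I read, the best possible value for both explained variance and R^2 is 1, so these results seem pretty bad by those metrics. The mean squared error, on the other hand, is better when it is lower, so the model works fairly well by that measure. Note, however, that MSE depends on the scale of the target, so it is harder to judge on its own than R^2.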

In [None]:
# Scatter plot of the linear regression results: predicted vs. experimental
# values, with the training and testing sets overlaid on the same axes.
plt.figure(figsize=(10,7))

# The label on each series feeds the legend, so the two colours can be told apart.
plt.scatter(x=y_train, y=y_train_pred, c="green", alpha=0.3, label="Training set")
plt.scatter(x=y_test, y=y_test_pred, c="blue", alpha=0.3, label="Testing set")

plt.title('Linear regression: predicted vs. experimental')
plt.ylabel('Predicted LogS')
plt.xlabel('Experimental LogS')
plt.legend()

plt.show()

In [None]:
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV

# Support vector regression with an RBF kernel, tuned by randomized search.
# The other models in this notebook are scored with regression metrics, so the
# estimator must be SVR; svm.SVC is a classifier and is the wrong tool here.
regr = svm.SVR()

# Candidate hyper-parameter values the search samples from.
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-4,1e-3,1e-2,1e-1,1e+0,1e+1,1e+2,1e+3,1e+4],
    'C': [1e+0,1e+1,1e+2,1e+3,1e+4,1e+5,1e+6,1e+7,1e+8,1e+9]
}]

# RandomizedSearchCV takes `param_distributions` (`param_grid` belongs to
# GridSearchCV). random_state makes the sampled candidates reproducible.
RSCV = RandomizedSearchCV(
    estimator=regr, param_distributions=tuned_parameters, random_state=42
)
# SVR expects a 1-D target; y_train is loaded as a DataFrame, so flatten it.
# NOTE(review): assumes y_train.csv holds a single target column - confirm.
RSresult = RSCV.fit(X_train, np.ravel(y_train))

best_model = RSresult.best_estimator_
yhat = best_model.predict(X_test)
print(yhat)

# Report the same metrics as the other models so the results are comparable.
print()
print("Best parameters:", RSresult.best_params_)
print("Results on the testing data with SVR")
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(y_test, yhat))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(y_test, yhat))
print('Explained variance score: %.2f'
      % explained_variance_score(y_test, yhat))

Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with a fixed seed so the fit is reproducible.
# max_features=1.0 considers every feature at each split, which is what
# 'auto' meant for regressors; newer scikit-learn releases no longer accept
# the string 'auto', while the float form works on old and new versions alike.
rf_model = RandomForestRegressor(
    n_estimators=100, random_state=42, max_depth=10, bootstrap=True, max_features=1.0
)
# Flatten the single-column target to 1-D to avoid the column-vector warning.
# NOTE(review): assumes y_train.csv holds a single target column - confirm.
rf_model.fit(X_train, np.ravel(y_train))
rf_predictions = rf_model.predict(X_test)

print()
print("Results on the testing data with Random Forest")
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(y_test, rf_predictions))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(y_test, rf_predictions))
print('Explained variance score: %.2f'
      % explained_variance_score(y_test, rf_predictions))

# Actual test targets on the x-axis, model predictions on the y-axis.
plt.scatter(y_test, rf_predictions, color='green', alpha=0.3)
plt.title('Random Forest Regression')
plt.xlabel('Actual final colony count')
plt.ylabel('Predicted final colony count')
plt.show()

# I tuned the max_depth and max_features params: these settings give an MSE of 0 (as printed, to two decimal places)
# and a higher explained variance and R^2 than max_features='sqrt' with no max_depth set.

Bayesian model

I tried to use PyMC3 and installed it with `conda install`, but it still didn't work, so I switched to scikit-learn's Bayesian Ridge regression instead.

In [None]:
import inspect

# Bayesian ridge regression (linear_model is imported in the first cell).
# The iteration-limit argument is called `n_iter` in older scikit-learn
# releases and `max_iter` in newer ones, so use whichever name the installed
# version accepts.
_br_signature = inspect.signature(linear_model.BayesianRidge).parameters
_br_iter_kwarg = 'max_iter' if 'max_iter' in _br_signature else 'n_iter'
br_model = linear_model.BayesianRidge(tol=0.0001, **{_br_iter_kwarg: 1000})

# BayesianRidge expects a 1-D target; y_train is loaded as a DataFrame.
# NOTE(review): assumes y_train.csv holds a single target column - confirm.
br_model.fit(X_train, np.ravel(y_train))
y_br_pred = br_model.predict(X_test)

print()
print("Results on the testing data with Bayesian Ridge")
print('Mean squared error (MSE): %.2f'
      % mean_squared_error(y_test, y_br_pred))
print('Coefficient of determination (R^2): %.2f'
      % r2_score(y_test, y_br_pred))
print('Explained variance score: %.2f'
      % explained_variance_score(y_test, y_br_pred))

# Actual test targets on the x-axis, model predictions on the y-axis.
plt.scatter(y_test, y_br_pred, color='green', alpha=0.3)
plt.title('Bayesian Ridge Regression')
plt.xlabel('Actual final colony count')
plt.ylabel('Predicted final colony count')
plt.show()