In [1]:
# notebook setup
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (LinearRegression, LassoCV, ElasticNetCV, RidgeCV, 
                                  SGDRegressor,HuberRegressor, BayesianRidge)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR

%matplotlib notebook
    
# load data
no_0_kH_C18 = pd.read_csv('../../data/chemistry-channel-info/kH_C18_nonzero.csv')
y = no_0_kH_C18.pop('kH_C18') * 1e6  # convert from mol/kg/Pa to mol/kg/MPa
X = no_0_kH_C18
features = X.columns

# create separate set of exclusively 0-channel zeolites
y0 = y[X.num_channels != 0]
X0 = X[X.num_channels != 0]

def plot_parity(results):
     
    rmse = np.sqrt(mean_squared_error(results[:, 0], results[:, 1]))
        
    x = np.linspace(0, results.max(), 1000000)
    plus_error = 1.05 * x
    minus_error = 0.95 * x
    
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

    axes[0].scatter(results[:, 0], results[:, 1], s=1)
    axes[0].plot([0, results.max()], [0, results.max()], '-k', linewidth=0.5)
    axes[0].fill_between(x, plus_error, minus_error, alpha=0.3)
    axes[0].set_title('full set')
    axes[0].set_xlabel('true [mol/kg/MPa]')
    axes[0].set_ylabel('predicted [mol/kg/MPa]')
    axes[0].text(results.max() / 20, results.max() * 0.9, '$R^2$={:.3f}, RMSE={:.3f}'.format(
        r2_score(results[:, 0], results[:, 1]), rmse))
    axes[0].axis([0, results.max(), 0, results.max()])

    axes[1].scatter(results[:, 0], results[:, 1], s=1)
    axes[1].plot([0, 1e4], [0, 1e4], '-k', linewidth=0.5)
    axes[1].fill_between(x, plus_error, minus_error, alpha=0.3, label='$\pm$5% error')
    axes[1].set_title('zoomed in')
    axes[1].set_xlabel('true [mol/kg/MPa]')
    axes[1].set_ylabel('predicted [mol/kg/MPa]')
    axes[1].axis([0, 1e4, 0, 1e4])
    
    fig.legend()
    fig.tight_layout()


def plot_results_dist(results):
    
    vmin = results.min()
    vmax = results.max()

    rmse = np.sqrt(mean_squared_error(results[:, 0], results[:, 1]))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    ax.hist(results[:, 0], bins=np.linspace(vmin, vmax, 100), 
            edgecolor='black', alpha=0.5, label='Target')
    ax.hist(results[:, 1], bins=np.linspace(vmin, vmax, 100),
            edgecolor='black', alpha=0.5, label='Predicted')
    ax.set_xlabel('$k_{H,C_{18}}$ [mol/kg/MPa]')
    ax.set_ylabel('Frequency')
    ax.set_yscale('log')
    ax.set_title('RMSE: {:.3f}'.format(rmse))
    ax.grid(which='major', axis='y', alpha=0.4)
    ax.legend()
    
    fig.tight_layout()
    plt.box(False)
    
    
def plot_deviance(reg, params, X_test, y_test):
    
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(reg.staged_predict(X_test)):
        test_score[i] = reg.loss_(y_test, y_pred)

    fig, ax = plt.subplots(figsize=(6, 4))

    ax.plot(np.arange(params['n_estimators']) + 1, reg.train_score_,
            label='Training Set Deviance')
    ax.plot(np.arange(params['n_estimators']) + 1, test_score,
            label='Test Set Deviance')
    ax.set_xlabel('Boosting Iterations')
    ax.set_ylabel('Deviance')
    ax.legend(loc='upper right')
    
    fig.tight_layout()
    
    
def plot_feature_importance(reg):
    
    feature_importance = reg.feature_importances_
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    fig, ax = plt.subplots(figsize=(6, 4))

    ax.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, np.array(features)[sorted_idx])
    ax.set_title('Feature Importance (MDI)')
    fig.tight_layout()
    
    
def plot_permutation_importance(reg, X_test, y_test):
    
    sorted_idx = np.argsort(features)
    result = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
    sorted_idx = result.importances_mean.argsort()

    fig, ax = plt.subplots(figsize=(6, 4))

    ax.boxplot(result.importances[sorted_idx].T,
                vert=False, labels=np.array(features)[sorted_idx])
    ax.set_title("Permutation Importance (test set)")

    fig.tight_layout()

# 1.) All features

In [63]:
X_train, X_test, y_train, y_test = train_test_split(
    X0.values, y0.values, test_size=0.2, random_state=13)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 1.1) GBR

In [64]:
params = {
    'n_estimators': 400,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'huber',
}

reg = GradientBoostingRegressor(**params).fit(X_train, y_train)
y_pred = reg.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

In [65]:
plot_parity(results)

<IPython.core.display.Javascript object>

In [66]:
plot_deviance(reg, params, X_test, y_test)

<IPython.core.display.Javascript object>

In [69]:
plot_feature_importance(reg)

<IPython.core.display.Javascript object>

In [71]:
plot_permutation_importance(reg, X_test, y_test)

<IPython.core.display.Javascript object>

# 2.) U_C18 and PLD_min inspection

In [2]:
X_train, X_test, y_train, y_test = train_test_split(
    X0[['U_C18', 'PLD_min']].values, y0.values, test_size=0.2, random_state=8)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 2.1) LASSO

In [3]:
lasso = LassoCV(alphas=np.logspace(-5, 5, 11), max_iter=1e5)

t0 = time.time()
lasso.fit(X_train, y_train)
lasso_fit = time.time() - t0
print('LASSO complexity and bandwidth selected and model fitted in {:.3f} s'.format(lasso_fit))
print('best model: alpha = {}'.format(lasso.alpha_))
y_pred = lasso.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

LASSO complexity and bandwidth selected and model fitted in 0.089 s
best model: alpha = 1000.0


In [4]:
plot_parity(results)

<IPython.core.display.Javascript object>

### 2.2) Kernel methods

In [5]:
# Fit regression model
svr = GridSearchCV(SVR(kernel='linear', gamma=0.1),
                   param_grid={'C': [1e0, 1e1, 1e2, 1e3, 134],
                               'gamma': np.logspace(-3, 3, 7)})

kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1),
                  param_grid={'alpha': [1e0, 0.1, 1e-2, 1e-3],
                              'gamma': np.logspace(-2, 2, 5)})

train_size = 1000

t0 = time.time()
svr.fit(X_train[:train_size], y_train[:train_size])
svr_fit = time.time() - t0
print('SVR complexity and bandwidth selected and model fitted in {:.3f} s'.format(svr_fit))

t0 = time.time()
kr.fit(X_train[:train_size], y_train[:train_size])
kr_fit = time.time() - t0
print('KRR complexity and bandwidth selected and model fitted in {:.3f} s'.format(kr_fit))

sv_ratio = svr.best_estimator_.support_.shape[0] / X_train.shape[0]
print('Support vector ratio: {:.3f}'.format(sv_ratio))

t0 = time.time()
y_svr = svr.predict(X_test)
svr_predict = time.time() - t0
print('SVR prediction for {:d} inputs in {:.3f} s'.format(X_test.shape[0], svr_predict))

t0 = time.time()
y_kr = kr.predict(X_test)
kr_predict = time.time() - t0
print('KRR prediction for {:d} inputs in {:.3f} s'.format(X_test.shape[0], kr_predict))

SVR complexity and bandwidth selected and model fitted in 4.028 s
KRR complexity and bandwidth selected and model fitted in 2.073 s
Support vector ratio: 0.013
SVR prediction for 19940 inputs in 0.119 s
KRR prediction for 19940 inputs in 0.348 s


In [None]:
y_pred = svr.best_estimator_.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

In [6]:
plot_parity(results)

<IPython.core.display.Javascript object>

In [None]:
y_pred = kr.best_estimator_.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

In [7]:
plot_parity(results)

<IPython.core.display.Javascript object>

# 3.) Single feature test

## PLD_min

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X0.PLD_min.values.reshape(-1, 1), y0.values, test_size=0.2, random_state=8)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 3.1.) Linear regression

In [49]:
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.logspace(-5, 5, 21))
sgdreg = SGDRegressor(loss='squared_loss', penalty='l2', max_iter=1e5, random_state=22)
elanet = ElasticNetCV(l1_ratio=np.linspace(1e-6, 1, 11), max_iter=1e5, random_state=43)
lasso = LassoCV(alphas=np.logspace(-5, 5, 21), max_iter=1e5, random_state=23)
hubreg = HuberRegressor(max_iter=1e5)

t0 = time.time()
linreg.fit(X_train, y_train)
linreg_fit = time.time() - t0
print('LinearRegression model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    linreg_fit, np.sqrt(mean_squared_error(y_test, linreg.predict(X_test)))))

t0 = time.time()
ridge.fit(X_train, y_train)
ridge_fit = time.time() - t0
print('Ridge model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    ridge_fit, np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))))

t0 = time.time()
sgdreg.fit(X_train, y_train)
sgdreg_fit = time.time() - t0
print('SGDRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    sgdreg_fit, np.sqrt(mean_squared_error(y_test, sgdreg.predict(X_test)))))

t0 = time.time()
elanet.fit(X_train, y_train)
elanet_fit = time.time() - t0
print('ElasticNet model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    elanet_fit, np.sqrt(mean_squared_error(y_test, elanet.predict(X_test)))))

t0 = time.time()
lasso.fit(X_train, y_train)
lasso_fit = time.time() - t0
print('LASSO model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    lasso_fit, np.sqrt(mean_squared_error(y_test, lasso.predict(X_test)))))

t0 = time.time()
hubreg.fit(X_train, y_train)
hubreg_fit = time.time() - t0
print('HuberRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    hubreg_fit, np.sqrt(mean_squared_error(y_test, hubreg.predict(X_test)))))

LinearRegression model fitted in 0.003 s, RMSE: 1.6508e+06
Ridge model fitted in 0.041 s, RMSE: 1.6506e+06
SGDRegressor model fitted in 0.077 s, RMSE: 1.4087e+06
ElasticNet model fitted in 4.029 s, RMSE: 1.6506e+06
LASSO model fitted in 0.097 s, RMSE: 1.6506e+06
HuberRegressor model fitted in 0.079 s, RMSE: 1.4101e+06


In [50]:
# generate trendline for PLD_max vs. kH_C18
Xgen = np.linspace(X0.PLD_max.min(), X0.PLD_max.max(), 1000).reshape(-1, 1)

linreg_pred = linreg.predict(Xgen)
ridge_pred = ridge.predict(Xgen)
sgdreg_pred = sgdreg.predict(Xgen)
elanet_pred = elanet.predict(Xgen)
lasso_pred = lasso.predict(Xgen)
hubreg_pred = hubreg.predict(Xgen)

fig, ax = plt.subplots(figsize=(6, 5))

ax.scatter(X0.PLD_min, y0, s=1)
ax.plot([0, 0], [0, 0])
ax.plot(Xgen, linreg_pred, linewidth=1, label='LinearRegression [m = {:.3e}, b = {:.3e}]'.format(
    linreg.coef_[0], linreg.intercept_))
ax.plot(Xgen, ridge_pred, linewidth=1, label='Ridge [m = {:.3e}, b = {:.3e}]'.format(
    ridge.coef_[0], ridge.intercept_))
ax.plot(Xgen, sgdreg_pred, linewidth=1, label='SGDRegressor [m = {:.3e}, b = {:.3e}]'.format(
    sgdreg.coef_[0], sgdreg.intercept_[0]))
ax.plot(Xgen, elanet_pred, linewidth=1, label='ElasticNet [m = {:.3e}, b = {:.3e}]'.format(
    elanet.coef_[0], elanet.intercept_))
ax.plot(Xgen, lasso_pred, linewidth=1, label='LASSO [m = {:.3e}, b = {:.3e}]'.format(
    lasso.coef_[0], lasso.intercept_))
ax.plot(Xgen, hubreg_pred, linewidth=1, label='HuberRegressor [m = {:.3e}, b = {:.3e}]'.format(
    hubreg.coef_[0], hubreg.intercept_))
ax.set_xlabel('Pore limiting diameter minimum [Å]')
ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_yscale('log')
ax.axis([2, 30, None, None])

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

In [21]:
# INCLUDING 0-channel examples
df = pd.read_csv('../results/trendline.csv')

fig, ax = plt.subplots(figsize=(6, 5))

ax.scatter(X0.PLD_min, y0, s=1)
ax.plot(df['0'], df['1'], 'orange', label='NN predictions')
ax.set_xlabel('Pore limiting diameter minimum [Å]')
ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_yscale('log')
ax.axis([2, 30, None, None])
ax.legend()

fig.tight_layout()

<IPython.core.display.Javascript object>

In [16]:
# EXCLUDING 0-channel examples
df = pd.read_csv('../results/trendline-no-0-channels.csv')

fig, ax = plt.subplots(figsize=(6, 5))

ax.scatter(X0.PLD_min, y0, s=1)
ax.plot(df['0'], df['1'], 'orange', label='NN predictions')
ax.set_xlabel('Pore limiting diameter minimum [Å]')
ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_yscale('log')
ax.axis([2, 30, None, None])
ax.legend()

fig.tight_layout()

<IPython.core.display.Javascript object>

## U_C18

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    X0.U_C18.values.reshape(-1, 1), y0.values, test_size=0.2, random_state=8)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 3.1.) Linear regression

In [46]:
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.logspace(-5, 5, 21))
sgdreg = SGDRegressor(loss='squared_loss', penalty='l2', max_iter=1e5, random_state=22)
elanet = ElasticNetCV(l1_ratio=np.linspace(1e-6, 1, 11), max_iter=1e5, random_state=43)
lasso = LassoCV(alphas=np.logspace(-5, 5, 21), max_iter=1e5, random_state=23)
hubreg = HuberRegressor(max_iter=1e5)

t0 = time.time()
linreg.fit(X_train, y_train)
linreg_fit = time.time() - t0
print('LinearRegression model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    linreg_fit, np.sqrt(mean_squared_error(y_test, linreg.predict(X_test)))))

t0 = time.time()
ridge.fit(X_train, y_train)
ridge_fit = time.time() - t0
print('Ridge model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    ridge_fit, np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))))

t0 = time.time()
sgdreg.fit(X_train, y_train)
sgdreg_fit = time.time() - t0
print('SGDRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    sgdreg_fit, np.sqrt(mean_squared_error(y_test, sgdreg.predict(X_test)))))

t0 = time.time()
elanet.fit(X_train, y_train)
elanet_fit = time.time() - t0
print('ElasticNet model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    elanet_fit, np.sqrt(mean_squared_error(y_test, elanet.predict(X_test)))))

t0 = time.time()
lasso.fit(X_train, y_train)
lasso_fit = time.time() - t0
print('LASSO model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    lasso_fit, np.sqrt(mean_squared_error(y_test, lasso.predict(X_test)))))

t0 = time.time()
hubreg.fit(X_train, y_train)
hubreg_fit = time.time() - t0
print('HuberRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    hubreg_fit, np.sqrt(mean_squared_error(y_test, hubreg.predict(X_test)))))

LinearRegression model fitted in 0.003 s, RMSE: 2.9248e+06
Ridge model fitted in 0.041 s, RMSE: 2.1782e+06
SGDRegressor model fitted in 0.083 s, RMSE: 1.4062e+06
ElasticNet model fitted in 3.911 s, RMSE: 2.9194e+06
LASSO model fitted in 0.082 s, RMSE: 2.9215e+06
HuberRegressor model fitted in 0.073 s, RMSE: 1.4100e+06


In [47]:
# generate trendline for U_C18 vs. kH_C18
Xgen = np.linspace(X0.U_C18.min(), X0.U_C18.max(), 1000).reshape(-1, 1)

linreg_pred = linreg.predict(Xgen)
ridge_pred = ridge.predict(Xgen)
sgdreg_pred = sgdreg.predict(Xgen)
elanet_pred = elanet.predict(Xgen)
lasso_pred = lasso.predict(Xgen)
hubreg_pred = hubreg.predict(Xgen)

fig, ax = plt.subplots(figsize=(8, 5))

ax.scatter(X0.U_C18, y0, s=1)
ax.plot(Xgen, linreg_pred, linewidth=1, label='LinearRegression [m = {:.3e}, b = {:.3e}]'.format(
    linreg.coef_[0], linreg.intercept_))
ax.plot(Xgen, ridge_pred, linewidth=1, label='Ridge [m = {:.3e}, b = {:.3e}]'.format(
    ridge.coef_[0], ridge.intercept_))
ax.plot(Xgen, sgdreg_pred, linewidth=1, label='SGDRegressor [m = {:.3e}, b = {:.3e}]'.format(
    sgdreg.coef_[0], sgdreg.intercept_[0]))
ax.plot(Xgen, elanet_pred, linewidth=1, label='ElasticNet [m = {:.3e}, b = {:.3e}]'.format(
    elanet.coef_[0], elanet.intercept_))
ax.plot(Xgen, lasso_pred, linewidth=1, label='LASSO [m = {:.3e}, b = {:.3e}]'.format(
    lasso.coef_[0], lasso.intercept_))
ax.plot(Xgen, hubreg_pred, linewidth=1, label='HuberRegressor [m = {:.3e}, b = {:.3e}]'.format(
    hubreg.coef_[0], hubreg.intercept_))
ax.set_xlabel('U$_{C_{18}}$ [kJ/mol]')
# ax.set_xscale('log')
ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_yscale('log')

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

## SETE_C18

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X0.SETE_C18.values.reshape(-1, 1), y0.values, test_size=0.2, random_state=8)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 3.1.) Linear regression

In [40]:
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.logspace(-5, 5, 21))
sgdreg = SGDRegressor(loss='squared_loss', penalty='l2', max_iter=1e5, random_state=22)
elanet = ElasticNetCV(l1_ratio=np.linspace(1e-6, 1, 11), max_iter=1e5, random_state=43)
lasso = LassoCV(alphas=np.logspace(-5, 5, 21), max_iter=1e5, random_state=23)
hubreg = HuberRegressor(max_iter=1e5)

t0 = time.time()
linreg.fit(X_train, y_train)
linreg_fit = time.time() - t0
print('LinearRegression model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    linreg_fit, np.sqrt(mean_squared_error(y_test, linreg.predict(X_test)))))

t0 = time.time()
ridge.fit(X_train, y_train)
ridge_fit = time.time() - t0
print('Ridge model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    ridge_fit, np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))))

t0 = time.time()
sgdreg.fit(X_train, y_train)
sgdreg_fit = time.time() - t0
print('SGDRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    sgdreg_fit, np.sqrt(mean_squared_error(y_test, sgdreg.predict(X_test)))))

t0 = time.time()
elanet.fit(X_train, y_train)
elanet_fit = time.time() - t0
print('ElasticNet model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    elanet_fit, np.sqrt(mean_squared_error(y_test, elanet.predict(X_test)))))

t0 = time.time()
lasso.fit(X_train, y_train)
lasso_fit = time.time() - t0
print('LASSO model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    lasso_fit, np.sqrt(mean_squared_error(y_test, lasso.predict(X_test)))))

t0 = time.time()
hubreg.fit(X_train, y_train)
hubreg_fit = time.time() - t0
print('HuberRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    hubreg_fit, np.sqrt(mean_squared_error(y_test, hubreg.predict(X_test)))))

LinearRegression model fitted in 0.003 s, RMSE: 1.8047e+06
Ridge model fitted in 0.036 s, RMSE: 1.6986e+06
SGDRegressor model fitted in 0.102 s, RMSE: 1.4088e+06
ElasticNet model fitted in 4.031 s, RMSE: 1.8041e+06
LASSO model fitted in 0.083 s, RMSE: 1.8042e+06
HuberRegressor model fitted in 0.080 s, RMSE: 1.4101e+06


In [41]:
# generate trendline for SETE_C18 vs. kH_C18
Xgen = np.linspace(X0.SETE_C18.min(), X0.SETE_C18.max(), 1000).reshape(-1, 1)

linreg_pred = linreg.predict(Xgen)
ridge_pred = ridge.predict(Xgen)
sgdreg_pred = sgdreg.predict(Xgen)
elanet_pred = elanet.predict(Xgen)
lasso_pred = lasso.predict(Xgen)
hubreg_pred = hubreg.predict(Xgen)

fig, ax = plt.subplots(figsize=(8, 5))

ax.scatter(X0.SETE_C18, y0, s=1)
ax.plot(Xgen, linreg_pred, linewidth=1, label='LinearRegression [m = {:.3e}, b = {:.3e}]'.format(
    linreg.coef_[0], linreg.intercept_))
ax.plot(Xgen, ridge_pred, linewidth=1, label='Ridge [m = {:.3e}, b = {:.3e}]'.format(
    ridge.coef_[0], ridge.intercept_))
ax.plot(Xgen, sgdreg_pred, linewidth=1, label='SGDRegressor [m = {:.3e}, b = {:.3e}]'.format(
    sgdreg.coef_[0], sgdreg.intercept_[0]))
ax.plot(Xgen, elanet_pred, linewidth=1, label='ElasticNet [m = {:.3e}, b = {:.3e}]'.format(
    elanet.coef_[0], elanet.intercept_))
ax.plot(Xgen, lasso_pred, linewidth=1, label='LASSO [m = {:.3e}, b = {:.3e}]'.format(
    lasso.coef_[0], lasso.intercept_))
ax.plot(Xgen, hubreg_pred, linewidth=1, label='HuberRegressor [m = {:.3e}, b = {:.3e}]'.format(
    hubreg.coef_[0], hubreg.intercept_))
ax.set_xlabel('SETE$_{C_{18}}$ [kJ/mol]')
ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_yscale('log')

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

### 3.2.) GBR

In [11]:
params = {
    'n_estimators': 400,
    'max_depth': 6,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'huber',
}

reg = GradientBoostingRegressor(**params).fit(X_train, y_train)
y_pred = reg.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

The RMSE on test set: 9099791.4130


In [76]:
plot_deviance(reg, params, X_test, y_test)

<IPython.core.display.Javascript object>

In [15]:
# generate trendline for PLD_max vs. kH_C18
Xgen = np.linspace(X0.PLD_max.min(), X0.PLD_max.max(), 1000).reshape(-1, 1)

reg_pred = reg.predict(Xgen)

fig, ax = plt.subplots(figsize=(8, 5))

ax.scatter(X0.PLD_max, y0, s=1)
ax.plot([0, 0], [0, 0])
ax.plot(Xgen, reg_pred, linewidth=1, label='GBR')
ax.set_xlabel('Pore limiting diameter [Å]')
ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_yscale('log')
ax.axis([2, 30, None, None])

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

# 4.) Multidimensional

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X0[['U_C18', 'PLD_max']].values, y0.values, 
    test_size=0.2, random_state=8,
)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 4.1.) Linear regression

In [17]:
linreg = LinearRegression()
ridge = RidgeCV(alphas=np.logspace(-5, 5, 21))
sgdreg = SGDRegressor(loss='squared_loss', penalty='l2', max_iter=1e5, random_state=22)
elanet = ElasticNetCV(l1_ratio=np.linspace(1e-6, 1, 11), max_iter=1e5, random_state=43)
lasso = LassoCV(alphas=np.logspace(-5, 5, 21), max_iter=1e5, random_state=23)
bayreg = BayesianRidge(n_iter=1e5)
hubreg = HuberRegressor(max_iter=1e5)

t0 = time.time()
linreg.fit(X_train, y_train)
linreg_fit = time.time() - t0
print('LinearRegression model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    linreg_fit, np.sqrt(mean_squared_error(y_test, linreg.predict(X_test)))))

t0 = time.time()
ridge.fit(X_train, y_train)
ridge_fit = time.time() - t0
print('Ridge model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    ridge_fit, np.sqrt(mean_squared_error(y_test, ridge.predict(X_test)))))

t0 = time.time()
sgdreg.fit(X_train, y_train)
sgdreg_fit = time.time() - t0
print('SGDRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    sgdreg_fit, np.sqrt(mean_squared_error(y_test, sgdreg.predict(X_test)))))

t0 = time.time()
elanet.fit(X_train, y_train)
elanet_fit = time.time() - t0
print('ElasticNet model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    elanet_fit, np.sqrt(mean_squared_error(y_test, elanet.predict(X_test)))))

t0 = time.time()
lasso.fit(X_train, y_train)
lasso_fit = time.time() - t0
print('LASSO model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    lasso_fit, np.sqrt(mean_squared_error(y_test, lasso.predict(X_test)))))

t0 = time.time()
hubreg.fit(X_train, y_train)
hubreg_fit = time.time() - t0
print('HuberRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    hubreg_fit, np.sqrt(mean_squared_error(y_test, hubreg.predict(X_test)))))

LinearRegression model fitted in 0.007 s, RMSE: 2.9554e+06
Ridge model fitted in 0.053 s, RMSE: 2.1783e+06
SGDRegressor model fitted in 0.076 s, RMSE: 1.4065e+06
ElasticNet model fitted in 4.311 s, RMSE: 2.9520e+06
LASSO model fitted in 0.113 s, RMSE: 2.9542e+06
HuberRegressor model fitted in 0.129 s, RMSE: 1.4099e+06


### 4.2.) Tweedie regression

In [37]:
from sklearn.linear_model import TweedieRegressor

tweedie = GridSearchCV(
    TweedieRegressor(max_iter=1000000),
    param_grid={
        'power': np.linspace(1.651, 1.653, 11),
#         'alpha': np.logspace(-3, 3, 7),
    },
)

tweedie.fit(X_train, y_train)
print('best model: {}'.format(tweedie.best_estimator_))
y_pred = tweedie.best_estimator_.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

best model: TweedieRegressor(max_iter=1000000, power=1.6520000000000001)


In [38]:
plot_parity(results)

<IPython.core.display.Javascript object>

In [39]:
tweedie.best_estimator_.score(X_test, y_test)

0.5283893500474623

### 4.3.) Support Vector Regression

In [41]:
svr = GridSearchCV(
    SVR(kernel='linear', gamma=0.1),
    param_grid={
        'C': np.logspace(0, 6, 7),
        'gamma': np.logspace(-4, 4, 9),
    },
)

train_size = 1000

t0 = time.time()
svr.fit(X_train[:train_size], y_train[:train_size])
svr_fit = time.time() - t0
print('SVR model selected in {:.3f} s'.format(svr_fit))
print('Best SVR model {}'.format(svr.best_estimator_))
y_pred = svr.best_estimator_.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

SVR complexity and bandwidth selected and model fitted in 42.283 s
Best SVR model SVR(C=100000.0, gamma=0.0001, kernel='linear')


In [42]:
plot_parity(results)

<IPython.core.display.Javascript object>

# 5.) Using cutoff value to limit training examples

In [114]:
cutoff = 1e6

# Xc = X0[y0 <= cutoff]
# yc = y0[y0 <= cutoff]

X_train, X_test, y_train, y_test = train_test_split(
    X0.values, y0.values, test_size=0.2, random_state=13)

X_train = X_train[y_train <= cutoff]
y_train = y_train[y_train <= cutoff]

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

### 5.1.) GBR

In [115]:
params = {
    'n_estimators': 400,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.05,
    'loss': 'huber',
}

reg = GradientBoostingRegressor(**params).fit(X_train, y_train)
y_pred = reg.predict(X_test)
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))

In [116]:
plot_deviance(reg, params, X_test, y_test)

<IPython.core.display.Javascript object>

In [117]:
plot_parity(results)

<IPython.core.display.Javascript object>

In [118]:
plot_results_dist(results)

<IPython.core.display.Javascript object>

In [110]:
plot_permutation_importance(reg, X_test, y_test)

<IPython.core.display.Javascript object>

### 5.2.) RF

### 5.3.) SVR