In [3]:
# notebook setup
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (LinearRegression, Lasso, LassoCV, ElasticNetCV, RidgeCV,
                                  SGDRegressor, HuberRegressor, BayesianRidge)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR

%matplotlib notebook
    
# Read the C18 table; the 'kH_C18' column is split off as the regression
# target and every remaining column is kept as a feature.
C18_perm = pd.read_csv('../../data/chemistry-channel-info/C18-permeable.csv')
y = C18_perm.pop('kH_C18')
X = C18_perm
features = X.columns

# Secondary subset selected on the channel count.
# NOTE(review): the names X0 / y0 suggest a "0-channel" subset, but the mask
# below keeps the rows where num_channels != 0 -- confirm which is intended.
nonzero_channels = X.num_channels != 0
X0 = X[nonzero_channels]
y0 = y[nonzero_channels]

def plot_parity(results, zoom_max=1e4):
    """Draw a two-panel parity plot of predicted versus true values.

    The left panel spans 0 to the largest value found anywhere in ``results``
    and is annotated with the R^2 score and the RMSE. The right panel shows
    the same points with both axes restricted to ``[0, zoom_max]``.

    Parameters
    ----------
    results : numpy.ndarray
        2-D array; column 0 holds the true values and column 1 the
        predicted values.
    zoom_max : float, optional
        Upper axis limit of the zoomed-in panel. Defaults to 1e4, the limit
        that was previously hard-coded.
    """
    y_true, y_hat = results[:, 0], results[:, 1]

    # Compute the summary numbers once instead of once per use.
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    full_max = results.max()

    fig, (ax_full, ax_zoom) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

    # Both panels share the same drawing recipe and differ only in the
    # title and in the upper limit of the axes / diagonal reference line.
    for ax, title, upper in ((ax_full, 'full set', full_max),
                             (ax_zoom, 'zoomed in', zoom_max)):
        ax.scatter(y_true, y_hat, s=1)
        ax.plot([0, upper], [0, upper], '-k', linewidth=0.5)
        ax.set_title(title)
        ax.set_xlabel('true [mol/kg/MPa]')
        ax.set_ylabel('predicted [mol/kg/MPa]')
        ax.axis([0, upper, 0, upper])

    # Place the metrics near the top-left corner of the full-range panel.
    ax_full.text(full_max / 20, full_max * 0.9,
                 '$R^2$={:.3f}, RMSE={:.3f}'.format(r2, rmse))

    fig.tight_layout()


def plot_results_dist(results):
    """Overlay histograms of the target and predicted values.

    ``results`` is indexed as a 2-D array: column 0 is drawn as 'Target' and
    column 1 as 'Predicted'. Both histograms use the same 100 equally spaced
    bin edges between the smallest and largest value in ``results``; counts
    are shown on a log scale and the RMSE is reported in the title.
    """
    target, predicted = results[:, 0], results[:, 1]
    rmse = np.sqrt(mean_squared_error(target, predicted))

    # Shared bin edges so the two histograms are directly comparable.
    bin_edges = np.linspace(results.min(), results.max(), 100)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

    for values, name in ((target, 'Target'), (predicted, 'Predicted')):
        ax.hist(values, bins=bin_edges,
                edgecolor='black', alpha=0.5, label=name)

    ax.set_xlabel('$k_{H,C_{18}}$ [mol/kg/MPa]')
    ax.set_ylabel('Frequency')
    ax.set_yscale('log')
    ax.set_title('RMSE: {:.3f}'.format(rmse))
    ax.grid(which='major', axis='y', alpha=0.4)

    fig.legend()
    fig.tight_layout()
    plt.box(False)

In [116]:
# Hold out 20% of the full feature table for testing. Without an explicit
# split here the cell only works when X_train / y_train leak in from another
# cell, and it raises NameError on Restart & Run All.
# NOTE(review): the settings mirror the other train_test_split calls in this
# notebook (test_size=0.2, random_state=2) -- confirm this is the split the
# recorded figures were produced with.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values,
                                                    test_size=0.2, random_state=2)

# Gradient-boosting settings, kept in a dict because later cells read
# params['n_estimators'] to size the per-stage score arrays.
params = {
    'n_estimators': 300,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'huber',
}

reg = GradientBoostingRegressor(**params).fit(X_train, y_train)

In [118]:
# Evaluate the fitted model's loss on the test set after each boosting stage.
# NOTE(review): `reg.loss_` is only available in older scikit-learn
# releases -- confirm the pinned version before re-running.
n_stages = params['n_estimators']
test_score = np.zeros((n_stages,), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = reg.loss_(y_test, y_pred)

# Training and test loss against the (1-based) boosting iteration.
stages = np.arange(n_stages) + 1
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('Deviance')
ax.plot(stages, reg.train_score_, label='Training Set Deviance')
ax.plot(stages, test_score, label='Test Set Deviance')
ax.legend(loc='upper right')
ax.set_xlabel('Boosting Iterations')
ax.set_ylabel('Deviance')
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [119]:
# Predict the test set and pair the values column-wise
# (column 0 = true, column 1 = predicted) for the parity plot.
y_pred = reg.predict(X_test)
results = np.column_stack((y_test, y_pred))
plot_parity(results)

<IPython.core.display.Javascript object>

In [120]:
# Histogram the true vs. predicted test-set values held in `results`
# (the two-column array built in the preceding prediction cell).
plot_results_dist(results)

<IPython.core.display.Javascript object>

In [122]:
# Permutation importance of every feature, measured on the test set with
# 5 shuffles per feature. (The former `sorted_idx = np.argsort(features)`
# line was dead code: it was overwritten before being read.)
result = permutation_importance(reg, X_test, y_test, n_repeats=5, random_state=42, n_jobs=2)

# Order the features by mean importance so the box plot is sorted.
sorted_idx = result.importances_mean.argsort()

fig = plt.figure(figsize=(6, 4))
plt.boxplot(result.importances[sorted_idx].T,
            vert=False, labels=np.array(features)[sorted_idx])
plt.title("Permutation Importance (test set)")
fig.tight_layout()

<IPython.core.display.Javascript object>

In [144]:
# Single-feature experiment: use only the PLD_min column as the predictor.
# Selecting with a list (X[['PLD_min']]) keeps the array 2-D with shape
# (n_samples, 1), the layout scikit-learn estimators expect for X. The 1-D
# array from X.PLD_min.values makes fit/predict fail unless every caller
# reshapes it; callers that still do .reshape(-1, 1) are unaffected.
X_train, X_test, y_train, y_test = train_test_split(X[['PLD_min']].values, y.values,
                                                    test_size=0.2, random_state=2)

In [139]:
# Tune the Lasso regularisation strength `alpha` over 11 log-spaced values
# (1e-5 ... 1e5) using GridSearchCV's default cross-validation.
# Requires `Lasso` from sklearn.linear_model in the notebook's import cell.
# NOTE(review): the recorded run selected alpha=1e5, the upper edge of this
# grid -- consider widening the grid or checking whether the model has
# collapsed to a constant prediction.
lasso = GridSearchCV(Lasso(max_iter=10000, random_state=4),
                     param_grid={'alpha': np.logspace(-5, 5, 11)})

# perf_counter is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
# reshape(-1, 1) guarantees the 2-D (n_samples, 1) layout that fit requires
# for the single PLD_min predictor, matching the gradient-boosting cells.
lasso.fit(X_train.reshape(-1, 1), y_train)
lasso_fit = time.perf_counter() - t0
print('LASSO complexity and bandwidth selected and model fitted in {:.3f} s'.format(lasso_fit))
print('best model: {}'.format(lasso.best_estimator_))

LASSO complexity and bandwidth selected and model fitted in 2.426 s
best model: Lasso(alpha=100000.0, max_iter=10000, random_state=4)


In [140]:
# Predict with the tuned Lasso model. reshape(-1, 1) hands predict the 2-D
# (n_samples, 1) layout it requires for the single PLD_min predictor,
# consistent with the gradient-boosting cells in this section.
y_pred = lasso.predict(X_test.reshape(-1, 1))

# Column 0 = true values, column 1 = predictions.
results = np.hstack((y_test.reshape(-1, 1), y_pred.reshape(-1, 1)))
plot_parity(results)

<IPython.core.display.Javascript object>

In [145]:
# Gradient-boosting settings for the single-feature (PLD_min) model.
params = {
    'n_estimators': 300,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'huber',
}

# Build the estimator first, then fit it; X is reshaped to a single
# column because the estimator needs a 2-D feature matrix.
reg = GradientBoostingRegressor(**params)
reg.fit(X_train.reshape(-1, 1), y_train)

In [146]:
# Predict from the single-column test matrix, then pair true and predicted
# values column-wise (column 0 = true, column 1 = predicted).
X_test_col = X_test.reshape(-1, 1)
y_pred = reg.predict(X_test_col)
results = np.column_stack((y_test, y_pred))
plot_parity(results)

<IPython.core.display.Javascript object>

### Using a $k_{H,C_{18}}$ cutoff

In [62]:
# Largest target value allowed in the training set.
# NOTE(review): presumably in the same units as kH_C18 (mol/kg/MPa) -- confirm.
cutoff = 2.6e6

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values,
                                                    test_size=0.2, random_state=2)

# Drop training samples above the cutoff; the test set is left untouched.
# The mask is built once so X and y are filtered with the same rows.
below_cutoff = y_train <= cutoff
X_train, y_train = X_train[below_cutoff], y_train[below_cutoff]

# Fit the scaler on the filtered training data only, then apply the same
# transformation to both the training and the test features.
transformer = RobustScaler().fit(X_train)
X_train, X_test = transformer.transform(X_train), transformer.transform(X_test)

In [63]:
# Gradient-boosting settings for the cutoff-filtered training set; this run
# uses the least-squares loss ('ls').
# NOTE(review): scikit-learn 1.0 renamed 'ls' to 'squared_error' and later
# releases dropped the old spelling -- update if the environment is upgraded.
params = {
    'n_estimators': 300,
    'max_depth': 4,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'ls',
}

reg = GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

In [64]:
# Evaluate the fitted model's loss on the test set after each boosting stage.
# NOTE(review): `reg.loss_` is only available in older scikit-learn
# releases -- confirm the pinned version before re-running.
n_stages = params['n_estimators']
test_score = np.zeros((n_stages,), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = reg.loss_(y_test, y_pred)

# Training and test loss against the (1-based) boosting iteration.
stages = np.arange(n_stages) + 1
fig = plt.figure(figsize=(6, 4))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('Deviance')
ax.plot(stages, reg.train_score_, label='Training Set Deviance')
ax.plot(stages, test_score, label='Test Set Deviance')
ax.legend(loc='upper right')
ax.set_xlabel('Boosting Iterations')
ax.set_ylabel('Deviance')
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [65]:
# Predict the (scaled) test set and pair the values column-wise
# (column 0 = true, column 1 = predicted) for the parity plot.
y_pred = reg.predict(X_test)
results = np.column_stack((y_test, y_pred))
plot_parity(results)

<IPython.core.display.Javascript object>

In [66]:
# Histogram the true vs. predicted test-set values held in `results`
# (the two-column array built in the preceding prediction cell).
plot_results_dist(results)

<IPython.core.display.Javascript object>

In [67]:
# Permutation importance of every feature, measured on the test set with
# 10 shuffles per feature. (The former `sorted_idx = np.argsort(features)`
# line was dead code: it was overwritten before being read.)
result = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)

# Order the features by mean importance so the box plot is sorted.
sorted_idx = result.importances_mean.argsort()

fig = plt.figure(figsize=(6, 4))
plt.boxplot(result.importances[sorted_idx].T,
            vert=False, labels=np.array(features)[sorted_idx])
plt.title("Permutation Importance (test set)")
fig.tight_layout()

<IPython.core.display.Javascript object>