In [1]:
import time 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler

# Load the data set and split it into the feature table and the kH target.
df = pd.read_csv('../../../data/chemistry-channel-info/C18-kH_C18-nonzero.csv')
X = df.drop(columns=['kH_C18'])
# Work on a copy so the unit conversion below can never write through to `df`.
y = df['kH_C18'].copy()
features = X.columns

# A maximum no larger than 1e7 is taken to mean the values are still in
# mol/kg/Pa; convert them to mol/kg/MPa. Rebinding (rather than `*=`) keeps
# the conversion out-of-place.
if y.max() <= 1e7:
    y = y * 1e6

# Separate set made up exclusively of zeolites with a non-zero channel count.
has_channels = X.num_channels != 0
y0 = y[has_channels]
X0 = X[has_channels]


def autolabel(rects, ax=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar artists (for example the container returned by ``Axes.bar``).
        Each one must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    ax : optional
        Axes that receives the annotations. When omitted, every bar is
        annotated on the axes it belongs to (``rect.axes``), so the helper
        does not depend on a notebook-global ``ax`` pointing at the right
        figure.
    """
    for rect in rects:
        target = rect.axes if ax is None else ax
        height = rect.get_height()
        target.annotate(
            '{}'.format(height),
            # anchor the text at the horizontal centre of the bar's top edge
            xy=(rect.get_x() + rect.get_width() / 2, height),
            xytext=(0, -1),  # -1 points vertical offset
            textcoords="offset points",
            ha='center', va='bottom',
        )

        
%matplotlib notebook

In [2]:
# Histogram of the whole kH column on logarithmically spaced bins
# (201 edges, i.e. 200 bins, from the smallest to the largest value).
full_range_edges = np.logspace(np.log10(y.min()), np.log10(y.max()), 201)

fig, ax = plt.subplots(figsize=(7, 5))
counts, bin_edges, _ = ax.hist(y, bins=full_range_edges, edgecolor='black')

ax.set_xscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_ylabel('Frequency')
ax.grid(axis='y', which='both', alpha=0.3)

fig.tight_layout()
plt.box(False)  # hide the frame of the current axes

<IPython.core.display.Javascript object>

In [3]:
# Number of histogram bins holding at most 50 samples
np.count_nonzero(counts <= 50)

31

In [4]:
# Left edges of the histogram bins holding at most 50 samples
bin_edges[np.flatnonzero(counts <= 50)]

array([6.24600000e-17, 8.50020176e-17, 4.78698982e+06, 6.51463006e+06,
       8.86578129e+06, 1.20654707e+07, 1.64199384e+07, 2.23459477e+07,
       3.04106730e+07, 4.13859841e+07, 5.63223206e+07, 7.66492298e+07,
       1.04312187e+08, 1.41958795e+08, 1.93192187e+08, 2.62915877e+08,
       3.57803074e+08, 4.86935370e+08, 6.62671933e+08, 9.01832394e+08,
       1.22730665e+09, 1.67024562e+09, 2.27304270e+09, 3.09339123e+09,
       4.20980621e+09, 5.72913899e+09, 7.79680393e+09, 1.06106959e+10,
       1.44401306e+10, 1.96516209e+10, 2.67439549e+10])

### Single dimensional analysis on $k_{H,C_{18}}$

In [5]:
# Candidate transforms that are compared in the following cells.
# `subsample` has to be an integer: scikit-learn validates constructor
# parameters and rejects the float literal 1e5, so convert it explicitly.
quan_norm = QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                                subsample=int(1e5), random_state=33)
bc = PowerTransformer(method='box-cox', standardize=True)
yj = PowerTransformer(method='yeo-johnson', standardize=True)

In [6]:
# Compare several transforms of kH, restricted to zeolites with channels.
# The transformers expect a 2-D (n_samples, 1) array, so reshape once and reuse.
y0_column = y0.values.reshape(-1, 1)
y_bc = bc.fit_transform(y0_column)
y_yj = yj.fit_transform(y0_column)
y_quan = quan_norm.fit_transform(y0_column)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(8.5, 6))
axes = axes.ravel()

# Raw values on linearly spaced bins
axes[0].hist(y0, bins=50, edgecolor='black')
axes[0].set_title('Original')
axes[0].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[0].set_ylabel('Frequency')

# Raw values again, this time on log-spaced bins; uses `y0` like every
# other panel instead of re-filtering `y`
axes[1].hist(y0,
             bins=np.logspace(np.log10(y0.min()),
                              np.log10(y0.max()),
                              50),
             edgecolor='black')
axes[1].set_title('Original')
axes[1].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[1].set_xscale('log')

axes[2].hist(y_bc, bins=50, edgecolor='black')
axes[2].set_title('Box-Cox\ntransformation')

axes[3].hist(y_yj, bins=50, edgecolor='black')
axes[3].set_title('Yeo-Johnson\ntransformation')

axes[4].hist(np.log1p(y0), bins=50, edgecolor='black')
axes[4].set_title('log(1 + x)')

axes[5].hist(y_quan, bins=50, edgecolor='black')
axes[5].set_title('Quantile transformation')  # title typo fixed

fig.tight_layout()

<IPython.core.display.Javascript object>

In [7]:
# Transform comparison for the PLD_min feature (zeolites with channels only).
pld_column = X0.PLD_min.values.reshape(-1, 1)
pld_min_bc = bc.fit_transform(pld_column)
pld_min_yj = yj.fit_transform(pld_column)
pld_min_quan = quan_norm.fit_transform(pld_column)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(8.5, 6))
axes = axes.ravel()

# One histogram per (values, title) pair, filling the first five panels.
pld_panels = [
    (X0.PLD_min, 'Original'),
    (pld_min_bc, 'Box-Cox\ntransformation'),
    (pld_min_yj, 'Yeo-Johnson\ntransformation'),
    (np.log(X0.PLD_min), 'log(PLD$_{min}$)'),
    (pld_min_quan, 'Quantile transformation'),
]
for panel_ax, (panel_values, panel_title) in zip(axes, pld_panels):
    panel_ax.hist(panel_values, bins=50, edgecolor='black')
    panel_ax.set_title(panel_title)

# Only the first panel carries axis labels.
axes[0].set_xlabel('PLD$_{min}$ [Å]')
axes[0].set_ylabel('Frequency')

# The sixth panel is unused; remove it from the figure.
fig.delaxes(axes[5])

fig.tight_layout()

<IPython.core.display.Javascript object>

In [8]:
nbins = 200
# Log-spaced edges shared by every plot in this cell
# (`nbins` edges, hence nbins - 1 histogram bins).
kh_edges = np.logspace(np.log10(y0.min()), np.log10(y0.max()), nbins)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 8))

# Top panel, left axis: plain bin counts.
counts, bin_edges, _ = axes[0].hist(y0, bins=kh_edges)
axes[0].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[0].set_xscale('log')
axes[0].set_ylabel('Frequency', color='C0')
axes[0].tick_params(axis='y', labelcolor='C0')
# The on/off flag is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib and later removed, so `grid(b=True)` fails on
# current releases.
axes[0].grid(True, axis='x', which='both', alpha=0.5)

# Top panel, right axis: cumulative fraction on the same bins.
ax_twin = axes[0].twinx()
frac_cumsum, _, _ = ax_twin.hist(
    y0,
    bins=kh_edges,
    density=True,
    histtype='step',
    cumulative=True,
    color='orange',
)
ax_twin.set_ylabel('Fraction of cumulative', color='orange')
ax_twin.set_xscale('log')
ax_twin.tick_params(axis='y', labelcolor='orange')

# Bottom panel: difference between successive bin counts. The difference has
# nbins - 2 entries, so pad one zero on each side to match the nbins edges.
axes[1].plot(kh_edges, np.pad(counts[1:] - counts[:-1], 1))
# Dashed zero reference line
axes[1].plot(kh_edges, [0]*nbins, '--k', linewidth=0.5)
axes[1].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[1].set_xscale('log')
axes[1].set_ylabel('Change in successive bin counts')
axes[1].minorticks_on()
axes[1].grid(True, axis='x', which='both', alpha=0.5)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [9]:
cutoff_arb = 1e6  # arbitrary cutoff from visual inspection

# Index of the largest cumulative fraction that does not exceed 0.95
within_target = frac_cumsum <= 0.95
cutoff_idx = frac_cumsum[within_target].argmax()
# Entry i of the cumulative fractions belongs to the bin that ends at
# bin_edges[i + 1], hence the shift by one
cutoff = bin_edges[cutoff_idx + 1]
cutoff

151726.95912554904

In [10]:
# Sanity check: the share of samples at or below the cutoff should be in line
# with the targeted fraction (it is).
# NOTE(review): the cutoff was derived from `y0`, while this check uses the
# full `y` — confirm that this is intended.
len(y[y <= cutoff]) / len(y)

0.9475234030700053

### 2-D analysis on PLD$_{min}$ and k$_{H,C_{18}}$

In [11]:
# Two-column array: kH in column 0, PLD_min in column 1
data = np.column_stack((y0.values, X0.PLD_min.values))

# Each transformer is fitted on both columns at once
data_bc = bc.fit_transform(data)
data_yj = yj.fit_transform(data)

In [12]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(8.5, 3))
raw_ax, bc_ax, yj_ax = axes

# Untransformed data: PLD_min (column 1) on x, kH (column 0) on a log y-axis
raw_ax.scatter(data[:, 1], data[:, 0], s=1)
raw_ax.set_title('Original')
raw_ax.set_xlabel('$PLD_{min}$ [Å]')
raw_ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
raw_ax.set_yscale('log')

# Transformed data, same column layout, linear axes
for panel_ax, points, panel_title in (
    (bc_ax, data_bc, 'Box-Cox transformation'),
    (yj_ax, data_yj, 'Yeo-Johnson transformation'),
):
    panel_ax.scatter(points[:, 1], points[:, 0], s=1)
    panel_ax.set_title(panel_title)

fig.tight_layout()

<IPython.core.display.Javascript object>

# Outlier detection using LocalOutlierFactor

### Outlier detection with zero-channel zeolites

In [13]:
# LocalOutlierFactor comes from the import cell at the top of the notebook,
# so it is not imported again here.

# Start with the automatic `contamination` setting and check afterwards how
# many points end up flagged.

# n_neighbors=20 is supposed to work well in general.
# With p=2 the Minkowski metric is exactly the Euclidean distance.

lof = LocalOutlierFactor(n_neighbors=20, metric='minkowski', p=2, contamination='auto')

In [14]:
# Fit on kH alone (all zeolites) as a single-column 2-D array;
# the later cells count labels +1 as inliers and -1 as outliers
y_pred = lof.fit_predict(y.values[:, np.newaxis])

In [15]:
labels = ['Inlier', 'Outlier']
# Tally the two prediction labels
num_inliers = np.sum(y_pred == 1)
num_outliers = np.sum(y_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor')

# Print the counts on top of the bars
autolabel(rect)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [16]:
# kH values of the points that were labelled -1
outliers = y[y_pred == -1]

fig, ax = plt.subplots(figsize=(6, 4))

# Histogram of log(1 + kH) with a logarithmic count axis
ax.hist(np.log1p(outliers), bins=100, edgecolor='black')
ax.set_yscale('log')
ax.set_xlabel('log(1 + x)')
ax.set_title('k$_{H,C_{18}}$ Outliers')

fig.tight_layout()

<IPython.core.display.Javascript object>

In [17]:
# Refit on three columns: PLD_min, U_C18 and kH (all zeolites).
# NOTE(review): the columns are stacked without any scaling, so the neighbour
# distances may be dominated by the column with the widest numeric range —
# confirm that this is intended.
y_pred = lof.fit_predict(
    np.column_stack((X.PLD_min.values, X.U_C18.values, y.values))
)

In [18]:
labels = ['Inlier', 'Outlier']
num_inliers = (y_pred == 1).sum()
num_outliers = (y_pred == -1).sum()

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
# The predictions plotted here come from the fit on PLD_min, U_C18 and kH,
# so the title names all three variables instead of kH alone.
ax.set_title('Outlier analysis on k$_{H,C_{18}}$, PLD$_{min}$, and U$_{C_{18}}$\n'
             'using LocalOutlierFactor')
ax.set_ylabel('Frequency')
ax.set_yscale('log')

# Print the counts on top of the bars
autolabel(rect)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [19]:
# kH values of the points labelled -1 by the most recent fit
outliers = y[y_pred == -1]

fig, ax = plt.subplots(figsize=(6, 4))

# Histogram of log(1 + kH) with a logarithmic count axis
ax.hist(np.log1p(outliers), bins=100, edgecolor='black')
ax.set_yscale('log')
ax.set_xlabel('log(1 + x)')
ax.set_title('k$_{H,C_{18}}$ Outliers')

fig.tight_layout()

<IPython.core.display.Javascript object>

### Outlier detection ***without*** zero-channel zeolites

In [20]:
# Refit on kH of the zeolites with channels only
y0_pred = lof.fit_predict(y0.values[:, np.newaxis])

num_inliers = np.sum(y0_pred == 1)
num_outliers = np.sum(y0_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor')

# Print the counts on top of the bars
autolabel(rect)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [21]:
outliers = y0[y0_pred == -1]

# 100 log-spaced edges covering exactly the range of the outliers
outlier_edges = np.logspace(
    np.log10(outliers.min()), np.log10(outliers.max()), 100
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(outliers, bins=outlier_edges, edgecolor='black')
ax.set_xscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_title('Outliers')

fig.tight_layout()

<IPython.core.display.Javascript object>

### Manually setting contamination value to 0.05 i.e. 5% contamination

In [22]:
# A contamination of 5% matches what the visual inspection of the cumulative
# sum suggested.
lof_5 = LocalOutlierFactor(contamination=0.05, n_neighbors=20, metric='minkowski', p=2)

# The same estimator is refitted for each data set: first all zeolites,
# then only those with channels.
y_5_pred, y0_5_pred = (
    lof_5.fit_predict(series.values.reshape(-1, 1)) for series in (y, y0)
)

In [23]:
# Inlier / outlier tallies for both fits
num_inliers = np.sum(y_5_pred == 1)
num_outliers = np.sum(y_5_pred == -1)
num_inliers0 = np.sum(y0_5_pred == 1)
num_outliers0 = np.sum(y0_5_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))

x = np.arange(len(labels))  # the label locations
width = 0.3  # the width of the bars
half_width = width / 2

# Two bars per label, placed side by side around the label location
rect1 = ax.bar(x - half_width, [num_inliers, num_outliers], width,
               label='Includes 0-channel zeolites')
rect2 = ax.bar(x + half_width, [num_inliers0, num_outliers0], width,
               label='Excludes 0-channel zeolites')
ax.set_title(
    'Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor\n'
    'assuming 5% contamination'
)
ax.set_ylabel('Frequency')
ax.set_xticks(x)
ax.set_xticklabels(labels)

for bar_group in (rect1, rect2):
    autolabel(bar_group)

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

In [24]:
outliers = y[y_5_pred == -1]
outliers0 = y0[y0_5_pred == -1]

# The shared range has to span both outlier sets: smallest minimum up to the
# LARGEST maximum. Taking the minimum of the two maxima cut off the top end.
vmin = np.minimum(outliers.min(), outliers0.min())
vmax = np.maximum(outliers.max(), outliers0.max())

nbins = 100
# One set of log-spaced edges reused by all four histograms
shared_edges = np.logspace(np.log10(vmin), np.log10(vmax), nbins + 1)

counts_tot, bin_edges = np.histogram(y, bins=shared_edges)
counts0_tot, _ = np.histogram(y0, bins=shared_edges)
counts_out, _ = np.histogram(outliers, bins=shared_edges)
counts0_out, _ = np.histogram(outliers0, bins=shared_edges)

# Outlier share per bin; empty bins are reported as 0 without triggering a
# 0/0 warning.
frac = np.divide(counts_out, counts_tot,
                 out=np.zeros(nbins), where=counts_tot > 0)
frac0 = np.divide(counts0_out, counts0_tot,
                  out=np.zeros(nbins), where=counts0_tot > 0)

fig, ax = plt.subplots(figsize=(8, 5))

# Bars start at the left edge of their bin (align='edge'); the default centre
# alignment would make every bar straddle two bins.
ax.bar(bin_edges[:-1], frac,
       width=np.diff(bin_edges) / 1.4, alpha=0.5, align='edge')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')
ax.set_ylabel('Fraction of outliers existing within bin')
ax.axis([vmin, vmax, None, None])

fig.tight_layout()



<IPython.core.display.Javascript object>

In [25]:
# Display the log-spaced bin edges returned by the outlier-fraction histogram
bin_edges

array([8.69462722e-09, 1.33496225e-08, 2.04968444e-08, 3.14706001e-08,
       4.83195680e-08, 7.41892640e-08, 1.13909273e-07, 1.74894882e-07,
       2.68531427e-07, 4.12299814e-07, 6.33040008e-07, 9.71961757e-07,
       1.49233800e-06, 2.29131720e-06, 3.51805994e-06, 5.40158549e-06,
       8.29352720e-06, 1.27337785e-05, 1.95512852e-05, 3.00188003e-05,
       4.60904927e-05, 7.07667693e-05, 1.08654418e-04, 1.66826642e-04,
       2.56143552e-04, 3.93279624e-04, 6.03836645e-04, 9.27123276e-04,
       1.42349355e-03, 2.18561430e-03, 3.35576502e-03, 5.15239988e-03,
       7.91093071e-03, 1.21463446e-02, 1.86493465e-02, 2.86339748e-02,
       4.39642488e-02, 6.75021610e-02, 1.03641979e-01, 1.59130605e-01,
       2.44327153e-01, 3.75136872e-01, 5.75980489e-01, 8.84353282e-01,
       1.35782503e+00, 2.08478767e+00, 3.20095707e+00, 4.91470968e+00,
       7.54598413e+00, 1.15860102e+01, 1.77890160e+01, 2.73130340e+01,
       4.19360928e+01, 6.43881554e+01, 9.88607732e+01, 1.51789602e+02,
      

# Outlier detection using IsolationForest

### Outlier detection with zero-channel zeolites

In [26]:
# IsolationForest comes from the import cell at the top of the notebook,
# so it is not imported again here.

# Default settings with a fixed seed
isof = IsolationForest(random_state=12)

In [27]:
# Fit on kH alone (all zeolites) as a single-column 2-D array
y_pred = isof.fit_predict(y.values[:, np.newaxis])

In [28]:
labels = ['Inlier', 'Outlier']
# Tally the two prediction labels
num_inliers = np.sum(y_pred == 1)
num_outliers = np.sum(y_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using IsolationForest')

# Print the counts on top of the bars
autolabel(rect)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [29]:
outliers = y[y_pred == -1]

# 100 log-spaced edges covering exactly the range of the outliers
outlier_edges = np.logspace(
    np.log10(outliers.min()), np.log10(outliers.max()), 100
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(outliers, bins=outlier_edges, edgecolor='black')

# Both axes are logarithmic
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_ylabel('Frequency')
ax.set_title('Outliers')

fig.tight_layout()

<IPython.core.display.Javascript object>

### Outlier detection ***without*** zero-channel zeolites

In [30]:
# Refit on kH of the zeolites with channels only
y0_pred = isof.fit_predict(y0.values[:, np.newaxis])

num_inliers = np.sum(y0_pred == 1)
num_outliers = np.sum(y0_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using IsolationForest')

# Print the counts on top of the bars
autolabel(rect)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [31]:
outliers = y0[y0_pred == -1]

# 100 log-spaced edges covering exactly the range of the outliers
outlier_edges = np.logspace(
    np.log10(outliers.min()), np.log10(outliers.max()), 100
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(outliers, bins=outlier_edges, edgecolor='black')

# Both axes are logarithmic
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_ylabel('Frequency')
ax.set_title('Outliers')

fig.tight_layout()

<IPython.core.display.Javascript object>

### Manually setting contamination value to 0.05 i.e. 5% contamination

In [32]:
# 5% is in line with the visual inspection from the cumulative sum that I did.
# The contamination is set to 0.05 so that it agrees with the section heading,
# this rationale and the `_5` variable names (it previously read 0.03).
isof_5 = IsolationForest(contamination=0.05, random_state=8)
# The same estimator is refitted for each data set: first all zeolites,
# then only those with channels.
y_5_pred = isof_5.fit_predict(y.values.reshape(-1, 1))
y0_5_pred = isof_5.fit_predict(y0.values.reshape(-1, 1))

In [33]:
# Inlier / outlier tallies for both fits
num_inliers = np.sum(y_5_pred == 1)
num_outliers = np.sum(y_5_pred == -1)
num_inliers0 = np.sum(y0_5_pred == 1)
num_outliers0 = np.sum(y0_5_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))

x = np.arange(len(labels))  # the label locations
width = 0.3  # the width of the bars
half_width = width / 2

# Two bars per label, placed side by side around the label location
rect1 = ax.bar(x - half_width, [num_inliers, num_outliers], width,
               label='Includes 0-channel zeolites')
rect2 = ax.bar(x + half_width, [num_inliers0, num_outliers0], width,
               label='Excludes 0-channel zeolites')

# The percentage in the title is read back from the fitted estimator; the
# doubled braces are literal braces for the math text.
title_template = ('Outlier analysis on k$_{{H,C_{{18}}}}$ using IsolationForest\n'
                  'assuming {}% contamination')
ax.set_title(title_template.format(isof_5.contamination * 100))
ax.set_ylabel('Frequency')
ax.set_xticks(x)
ax.set_xticklabels(labels)

for bar_group in (rect1, rect2):
    autolabel(bar_group)

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

In [34]:
outliers = y[y_5_pred == -1]
outliers0 = y0[y0_5_pred == -1]

# The shared range has to span both outlier sets: smallest minimum up to the
# LARGEST maximum. Taking the minimum of the two maxima cut off the top end.
vmin = np.minimum(outliers.min(), outliers0.min())
vmax = np.maximum(outliers.max(), outliers0.max())

nbins = 100
# One set of log-spaced edges so that both histograms are directly comparable
shared_edges = np.logspace(np.log10(vmin), np.log10(vmax), nbins + 1)

fig, ax = plt.subplots(figsize=(6, 4))

ax.hist(outliers,
        bins=shared_edges,
        alpha=0.5,
        edgecolor='black',
        label='Includes 0-channel zeolites')
ax.hist(outliers0,
        bins=shared_edges,
        alpha=0.5,
        edgecolor='black',
        label='Excludes 0-channel zeolites')
ax.set_title('Outliers')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')
ax.set_ylabel('Frequency')

ax.legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

# Multidimensional outlier analysis

In [35]:
# Start with the automatic `contamination` setting and check afterwards how
# many points end up flagged.

# n_neighbors=20 is supposed to work well in general.
# The metric is Minkowski with p=2 (see scipy.spatial.distance.minkowski,
# which computes the Minkowski distance between two 1-D arrays).

lof = LocalOutlierFactor(
    contamination='auto',
    n_neighbors=20,
    metric='minkowski',
    p=2,
)

In [36]:
# Four-column input (zeolites with channels only):
# PLD_min, U_C18, SETE_C18 and kH, in that order
multidim_input = np.column_stack((
    X0.PLD_min.values,
    X0.U_C18.values,
    X0.SETE_C18.values,
    y0.values,
))
y_pred = lof.fit_predict(multidim_input)

In [37]:
labels = ['Inlier', 'Outlier']
# Tally the two prediction labels
num_inliers = np.sum(y_pred == 1)
num_outliers = np.sum(y_pred == -1)

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title(
    'Outlier analysis on k$_{H,C_{18}}$, PLD$_{min}$, U$_{C_{18}}$, and SETE$_{C_{18}}$\n'
    'using LocalOutlierFactor'
)

# Print the counts on top of the bars
autolabel(rect)

fig.tight_layout()

<IPython.core.display.Javascript object>

In [38]:
# kH values of the points labelled -1 by the four-column fit
outliers = y0[y_pred == -1]

fig, ax = plt.subplots(figsize=(6, 4))

ax.hist(
    outliers,
    # 101 log-spaced edges (100 bins) spanning the range of the outliers
    bins=np.logspace(
        np.log10(outliers.min()),
        np.log10(outliers.max()),
        101,
    ),
    edgecolor='black',
)
ax.set_title('k$_{H,C_{18}}$ Outliers')
# The x axis shows kH itself; the label was previously an empty string
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')

fig.tight_layout()

<IPython.core.display.Javascript object>

In [41]:
# Split features and target by the predicted label
inlier_mask = y_pred == 1
outlier_mask = y_pred == -1
X_in, y_in = X0[inlier_mask], y0[inlier_mask]
X_out, y_out = X0[outlier_mask], y0[outlier_mask]

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(7, 12))

# One panel per feature: (column name, x-axis label)
scatter_panels = (
    ('U_C18', 'U$_{C_{18}}$ [kJ/mol]'),
    ('PLD_min', 'PLD$_{min}$ [Å]'),
    ('SETE_C18', 'SETE$_{C_{18}}$ [Å$^{2}$]'),
)
for panel_ax, (column, x_label) in zip(axes, scatter_panels):
    panel_ax.scatter(X_in[column], y_in, s=1, label='Inliers')
    panel_ax.scatter(X_out[column], y_out, s=1, label='Outliers')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
    panel_ax.set_yscale('log')
    panel_ax.set_ylim(1e-18, 1e12)

# Only the first panel carries the legend
axes[0].legend()
fig.tight_layout()

<IPython.core.display.Javascript object>

In [42]:
# Inlier rows only: the feature columns with the kH target appended as the
# last column
kept_rows = y_pred == 1
pd.concat([X0[kept_rows], y0[kept_rows]], axis=1)

Unnamed: 0,U_C18,SETE_C18,dim_C18,geometrical_dimension,num_channels,LCD_min,LCD_max,PLD_min,PLD_max,LCD_free_min,LCD_free_max,kH_C18
0,-39.356521,141.111020,3,3.0,1,7.17261,7.17261,3.68131,3.68131,7.17261,7.17261,1.738134e-09
1,-42.806415,156.702076,3,3.0,1,7.31458,7.31458,3.66592,3.66592,7.31458,7.31458,1.457747e-08
2,-210.972118,440.031230,2,1.0,2,5.46907,5.46908,4.46995,4.46995,5.46907,5.46908,6.660187e+03
3,-194.436028,442.213833,1,1.0,2,5.25600,5.25600,4.14771,4.14771,5.25600,5.25600,2.285502e+00
4,-192.764071,445.400906,2,1.0,2,5.47650,5.47651,4.34294,4.34294,5.47650,5.47651,4.543253e+02
...,...,...,...,...,...,...,...,...,...,...,...,...
100514,-83.249272,70.468675,3,3.0,1,13.92170,13.92170,4.31937,4.31937,13.92170,13.92170,1.397022e+02
100515,-94.341336,32.765526,3,3.0,2,10.45520,10.45520,3.81981,3.81981,10.45520,10.45520,2.146890e-01
100517,-78.699431,139.018049,3,3.0,2,9.99381,9.99381,3.64882,3.64882,9.99381,9.99381,4.083667e-05
100518,-71.840573,128.047097,3,3.0,2,16.65820,16.65820,3.75074,3.75074,16.65820,16.65820,7.924689e+01
