In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, RobustScaler

# Read the table; the 'kH_C18' column is split off as the target and the
# remaining columns are kept as the feature frame.
df = pd.read_csv('../../data/chemistry-channel-info/kH_C18_nonzero.csv')
y = df.pop('kH_C18')
X = df
features = X.columns

# convert mol/kg/Pa to mol/kg/MPa (only applied while the maximum is still <= 1e7)
if y.max() <= 1e7:
    y *= 1e6

# create separate set of exclusively zeolites with channels
has_channels = X.num_channels != 0
y0 = y[has_channels]
X0 = X[has_channels]


def autolabel(rects, ax=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable
        Bars as returned by ``Axes.bar``. Each item must provide
        ``get_height()``, ``get_x()`` and ``get_width()``.
    ax : optional
        Axes on which to draw the labels. When omitted, every bar is
        annotated on the axes it belongs to (``rect.axes``), so the function
        does not rely on a notebook-global ``ax`` that may already refer to
        a different figure.
    """
    for rect in rects:
        target = rect.axes if ax is None else ax
        height = rect.get_height()
        target.annotate('{}'.format(height),
                        # anchor at the top centre of the bar
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, -1),  # -1 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')

        
%matplotlib notebook

In [12]:
fig, ax = plt.subplots(figsize=(7, 5))

# 201 logarithmically spaced edges spanning the full range of y
full_range_edges = np.logspace(np.log10(y.min()), np.log10(y.max()), 201)
counts, bin_edges, _ = ax.hist(y, bins=full_range_edges, edgecolor='black')

# both axes are logarithmic
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_ylabel('Frequency')
ax.grid(axis='y', which='both', alpha=0.3)

fig.tight_layout()
plt.box(False)

<IPython.core.display.Javascript object>

In [None]:
# number of histogram bins that hold at most 50 samples
len(counts[counts <= 50])

In [None]:
# left edges of the histogram bins that hold at most 50 samples
bin_edges[:-1][counts <= 50]

In [None]:
# Bin edges at or above the threshold value. The previous expression,
# `[bin_edges >= ...]`, only built a one-element list holding the boolean
# mask; indexing `bin_edges` with the mask returns the edges themselves.
bin_edges[bin_edges >= 2.58468722e+06]

### Single dimensional analysis on $k_{H,C_{18}}$

In [None]:
# Three candidate transformations toward a normal-like distribution.
# `subsample` is written as an integer literal: recent scikit-learn releases
# validate it as an integer and reject the float 1e5.
quan_norm = QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                                subsample=100000, random_state=33)
bc = PowerTransformer(method='box-cox', standardize=True)
yj = PowerTransformer(method='yeo-johnson', standardize=True)

In [None]:
# Column-vector view of the channel-only target, shared by all transformers.
y0_col = y0.values.reshape(-1, 1)
y_bc = bc.fit_transform(y0_col)
y_yj = yj.fit_transform(y0_col)
y_quan = quan_norm.fit_transform(y0_col)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(8.5, 6))
axes = axes.ravel()

# Panel 0: raw values with linear bins.
axes[0].hist(y0, bins=50, edgecolor='black')
axes[0].set_title('Original')
axes[0].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[0].set_ylabel('Frequency')

# Panel 1: the same raw values with log-spaced bins (51 edges -> 50 bins,
# matching the bin count of the other panels). Uses y0 directly, which is
# the same selection as y[X.num_channels != 0].
axes[1].hist(y0,
             bins=np.logspace(np.log10(y0.min()),
                              np.log10(y0.max()),
                              51),
             edgecolor='black')
axes[1].set_title('Original (log-spaced bins)')
axes[1].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[1].set_xscale('log')

axes[2].hist(y_bc, bins=50, edgecolor='black')
axes[2].set_title('Box-Cox\ntransformation')

axes[3].hist(y_yj, bins=50, edgecolor='black')
axes[3].set_title('Yeo-Johnson\ntransformation')

axes[4].hist(np.log1p(y0), bins=50, edgecolor='black')
axes[4].set_title('log(1 + x)')

axes[5].hist(y_quan, bins=50, edgecolor='black')
axes[5].set_title('Quantile transformation')

fig.tight_layout()

In [None]:
# Column-vector view of PLD_min for the transformers (each is re-fitted here).
pld_col = X0.PLD_min.values.reshape(-1, 1)
pld_min_bc = bc.fit_transform(pld_col)
pld_min_yj = yj.fit_transform(pld_col)
pld_min_quan = quan_norm.fit_transform(pld_col)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(8.5, 6))
axes = axes.ravel()

# One histogram per panel, in display order; the sixth axes stays unused.
pld_panels = [
    (X0.PLD_min, 'Original'),
    (pld_min_bc, 'Box-Cox\ntransformation'),
    (pld_min_yj, 'Yeo-Johnson\ntransformation'),
    (np.log(X0.PLD_min), 'log(PLD$_{min}$)'),
    (pld_min_quan, 'Quantile transformation'),
]
for panel_ax, (panel_values, panel_title) in zip(axes, pld_panels):
    panel_ax.hist(panel_values, bins=50, edgecolor='black')
    panel_ax.set_title(panel_title)

# axis labels only on the first (untransformed) panel
axes[0].set_xlabel('PLD$_{min}$ [Å]')
axes[0].set_ylabel('Frequency')

fig.delaxes(axes[5])

fig.tight_layout()

In [None]:
# NOTE: nbins is the number of bin *edges* handed to np.logspace, so the
# histograms below contain nbins - 1 bins.
nbins = 200
log_edges = np.logspace(np.log10(y0.min()), np.log10(y0.max()), nbins)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 8))

# Top panel, left axis: plain frequency histogram.
counts, bin_edges, _ = axes[0].hist(y0, bins=log_edges)
axes[0].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[0].set_xscale('log')
axes[0].set_ylabel('Frequency', color='C0')
axes[0].tick_params(axis='y', labelcolor='C0')
# Grid visibility is passed positionally: the old `b=` keyword was renamed
# to `visible=` and later removed from Matplotlib, whereas the positional
# form is accepted by every release.
axes[0].grid(True, axis='x', which='both', alpha=0.5)

# Top panel, right axis: cumulative fraction over the same bins.
ax_twin = axes[0].twinx()
frac_cumsum, _, _ = ax_twin.hist(
    y0,
    bins=log_edges,
    density=True,
    histtype='step',
    cumulative=True,
    color='orange',
)
ax_twin.set_ylabel('Fraction of cumulative', color='orange')
ax_twin.set_xscale('log')
ax_twin.tick_params(axis='y', labelcolor='orange')

# Bottom panel: difference between successive bin counts. The difference has
# nbins - 2 entries, so it is zero-padded by one on each side to line up with
# the nbins edges used as x values.
axes[1].plot(log_edges, np.pad(np.diff(counts), 1))
axes[1].plot(log_edges, [0]*nbins, '--k', linewidth=0.5)  # zero reference line
axes[1].set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
axes[1].set_xscale('log')
axes[1].set_ylabel('Change in successive bin counts')
axes[1].minorticks_on()
axes[1].grid(True, axis='x', which='both', alpha=0.5)

fig.tight_layout()

# TODO: figure out the log-scaled grid later

In [None]:
cutoff_arb = 1e6  # arbitrary cutoff from visual inspection

# Restrict to the bins whose cumulative fraction has not exceeded 0.95 and
# take the position of the largest of them.
within_95 = frac_cumsum <= 0.95
cutoff_idx = np.argmax(frac_cumsum[within_95])
# bin i ends at bin_edges[i + 1], hence the +1 to get the right-hand edge
cutoff = bin_edges[cutoff_idx + 1]
cutoff

In [None]:
# Check that the fraction kept is in line with the cutoff fraction.
# The cutoff was derived from the histogram of y0 (channel-only zeolites),
# so the fraction is measured on y0 as well rather than on the full set y.
y0[y0 <= cutoff].shape[0] / y0.shape[0]

### 2-D analysis on PLD$_{min}$ and k$_{H,C_{18}}$

In [None]:
# Two-column array: column 0 is the target (y0), column 1 is PLD_min.
data = np.column_stack((y0.values, X0.PLD_min.values))

# Power transforms are fitted per column on the joint array.
data_bc = bc.fit_transform(data)
data_yj = yj.fit_transform(data)

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(8.5, 3))
raw_ax, bc_ax, yj_ax = axes

# Untransformed data: column 1 on the x axis, column 0 on a log-scaled y axis.
raw_ax.scatter(data[:, 1], data[:, 0], s=1)
raw_ax.set_title('Original')
raw_ax.set_xlabel('$PLD_{min}$ [Å]')
raw_ax.set_ylabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
raw_ax.set_yscale('log')

# Transformed data, same column-to-axis mapping.
for panel_ax, panel_data, panel_title in (
    (bc_ax, data_bc, 'Box-Cox transformation'),
    (yj_ax, data_yj, 'Yeo-Johnson transformation'),
):
    panel_ax.scatter(panel_data[:, 1], panel_data[:, 0], s=1)
    panel_ax.set_title(panel_title)

fig.tight_layout()

# Outlier detection using LocalOutlierFactor

### Outlier detection with zero-channel zeolites

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Use the 'auto' setting for `contamination` here; the resulting split is
# inspected afterwards from the predicted labels.

# n_neighbors=20 is reported to work well in general.
# Distance metric: Minkowski with p=2, which is the Euclidean distance.
#   scipy.spatial.distance.minkowski(u, v, p=2, w=None)
#     Compute the Minkowski distance between two 1-D arrays.

lof = LocalOutlierFactor(n_neighbors=20, metric='minkowski', p=2, contamination='auto')

In [None]:
# fit on the target alone, passed as a single-column 2-D array
y_pred = lof.fit_predict(y.values[:, np.newaxis])

In [None]:
labels = ['Inlier', 'Outlier']
# predictions equal to 1 are counted as inliers, -1 as outliers
num_inliers, num_outliers = ((y_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor')

autolabel(rect)

fig.tight_layout()

In [None]:
outliers = y[y_pred == -1]

fig, ax = plt.subplots(figsize=(6, 4))

# histogram of the flagged values in log(1 + x) space, 100 linear bins
log1p_outliers = np.log1p(outliers)
ax.hist(log1p_outliers, bins=100, edgecolor='black')
ax.set_yscale('log')
ax.set_xlabel('log(1 + x)')
ax.set_title('k$_{H,C_{18}}$ Outliers')

fig.tight_layout()

In [None]:
# Fit on three columns at once: PLD_min, U_C18 and the target.
# (Fixes the `reschape` typo, which raised AttributeError on the U_C18 array.)
y_pred = lof.fit_predict(np.hstack((
    X.PLD_min.values.reshape(-1, 1),
    X.U_C18.values.reshape(-1, 1),
    y.values.reshape(-1, 1),
)))

In [None]:
labels = ['Inlier', 'Outlier']
# tally the two prediction labels: 1 -> inlier, -1 -> outlier
num_inliers, num_outliers = ((y_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor')

autolabel(rect)

fig.tight_layout()

In [None]:
outliers = y[y_pred == -1]

fig, ax = plt.subplots(figsize=(6, 4))

# flagged target values, binned linearly after a log(1 + x) transform
log1p_outliers = np.log1p(outliers)
ax.hist(log1p_outliers, bins=100, edgecolor='black')
ax.set_yscale('log')
ax.set_xlabel('log(1 + x)')
ax.set_title('k$_{H,C_{18}}$ Outliers')

fig.tight_layout()

In [None]:
# exp(16) - 1, i.e. the inverse of log(1 + x) at 16 -- presumably used to
# read a position on the log(1 + x) axis back in original units (confirm)
np.expm1(16)

### Outlier detection ***without*** zero-channel zeolites

In [None]:
# refit on the channel-only target, passed as a single column
y0_pred = lof.fit_predict(y0.values[:, np.newaxis])

num_inliers, num_outliers = ((y0_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor')

autolabel(rect)

fig.tight_layout()

In [None]:
outliers = y0[y0_pred == -1]

# 100 log-spaced edges between the smallest and the largest flagged value
outlier_edges = np.logspace(np.log10(outliers.min()), np.log10(outliers.max()), 100)

fig, ax = plt.subplots(figsize=(6, 4))

ax.hist(outliers, bins=outlier_edges, edgecolor='black')
ax.set_xscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_title('Outliers')

fig.tight_layout()

In [None]:
# Check whether the samples flagged with and without 0-channel zeolites are
# the same. The previous test compared y_pred[y_pred == -1] with
# y0_pred[y0_pred == -1]; both arrays contain only -1, so it could never
# report a difference. Comparing the row labels of the flagged samples
# tests set membership instead.
# NOTE: y_pred holds whatever the most recent fit on the full set produced.
outlier_idx = y.index[y_pred == -1]
outlier_idx0 = y0.index[y0_pred == -1]
outlier_idx.equals(outlier_idx0)

### Manually setting contamination value to 0.05 i.e. 5% contamination

In [None]:
# 5% is in line with the visual inspection from the cumulative sum that I did
lof_5 = LocalOutlierFactor(contamination=0.05, n_neighbors=20, metric='minkowski', p=2)

# same estimator fitted in turn on the full target and the channel-only target
y_5_pred, y0_5_pred = (
    lof_5.fit_predict(target.values.reshape(-1, 1)) for target in (y, y0)
)

In [None]:
# label tallies for both fits (1 -> inlier, -1 -> outlier)
num_inliers, num_outliers = ((y_5_pred == flag).sum() for flag in (1, -1))
num_inliers0, num_outliers0 = ((y0_5_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))

width = 0.3  # the width of the bars
x = np.arange(len(labels))  # the label locations

# two bars per label, shifted half a bar width to either side
rect1 = ax.bar(x - width / 2, [num_inliers, num_outliers], width,
               label='Includes 0-channel zeolites')
rect2 = ax.bar(x + width / 2, [num_inliers0, num_outliers0], width,
               label='Excludes 0-channel zeolites')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel('Frequency')
ax.set_title(
    'Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor\n'
    'assuming 5% contamination'
)

for bars in (rect1, rect2):
    autolabel(bars)

ax.legend()
fig.tight_layout()

In [None]:
outliers = y[y_5_pred == -1]
outliers0 = y0[y0_5_pred == -1]

# Common range covering both outlier sets. The upper bound must be the
# larger of the two maxima (it previously used np.minimum, which cut off
# the top of the range).
vmin = np.minimum(outliers.min(), outliers0.min())
vmax = np.maximum(outliers.max(), outliers0.max())

# One shared set of log-spaced edges (nbins + 1 edges -> nbins bins).
nbins = 100
log_edges = np.logspace(np.log10(vmin), np.log10(vmax), nbins + 1)

counts_tot, bin_edges = np.histogram(y, bins=log_edges)
counts0_tot, _ = np.histogram(y0, bins=log_edges)
counts_out, _ = np.histogram(outliers, bins=log_edges)
counts0_out, _ = np.histogram(outliers0, bins=log_edges)

# Per-bin outlier fraction; empty bins are left at 0 instead of producing
# 0/0 = NaN and a runtime warning.
frac = np.divide(counts_out, counts_tot,
                 out=np.zeros(nbins), where=counts_tot > 0)
frac0 = np.divide(counts0_out, counts0_tot,
                  out=np.zeros(nbins), where=counts0_tot > 0)

fig, ax = plt.subplots(figsize=(8, 5))

width = np.diff(bin_edges)  # each bar spans exactly its own bin

# Bars start at the left bin edge (align='edge') so they coincide with the
# bins; both data sets are drawn so that the legend is a real comparison.
ax.bar(bin_edges[:-1], frac, width=width, align='edge',
       alpha=0.5, label='Includes 0-channel zeolites')
ax.bar(bin_edges[:-1], frac0, width=width, align='edge',
       alpha=0.5, label='Excludes 0-channel zeolites')
ax.set_ylabel('Fraction of k$_{H,C_{18}}$ in bin that are outliers')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')
ax.set_xlim(vmin, vmax)

ax.legend()
fig.tight_layout()

In [None]:
# display the bin edges returned by the most recent histogram call
bin_edges

# Outlier detection using IsolationForest

### Outlier detection with zero-channel zeolites

In [None]:
from sklearn.ensemble import IsolationForest

# default settings apart from a pinned random_state
isof = IsolationForest(random_state=12)

In [None]:
# fit on the target alone, passed as a single-column 2-D array
y_pred = isof.fit_predict(y.values[:, np.newaxis])

In [None]:
labels = ['Inlier', 'Outlier']
# predictions equal to 1 are counted as inliers, -1 as outliers
num_inliers, num_outliers = ((y_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using IsolationForest')

autolabel(rect)

fig.tight_layout()

In [None]:
outliers = y[y_pred == -1]

# 100 log-spaced edges between the smallest and the largest flagged value
outlier_edges = np.logspace(np.log10(outliers.min()), np.log10(outliers.max()), 100)

fig, ax = plt.subplots(figsize=(6, 4))

ax.hist(outliers, bins=outlier_edges, edgecolor='black')
# log scale on both axes
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_ylabel('Frequency')
ax.set_title('Outliers')

fig.tight_layout()

### Outlier detection ***without*** zero-channel zeolites

In [None]:
# refit on the channel-only target, passed as a single column
y0_pred = isof.fit_predict(y0.values[:, np.newaxis])

num_inliers, num_outliers = ((y0_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using IsolationForest')

autolabel(rect)

fig.tight_layout()

In [None]:
outliers = y0[y0_pred == -1]

# 100 log-spaced edges between the smallest and the largest flagged value
outlier_edges = np.logspace(np.log10(outliers.min()), np.log10(outliers.max()), 100)

fig, ax = plt.subplots(figsize=(6, 4))

ax.hist(outliers, bins=outlier_edges, edgecolor='black')
# log scale on both axes
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_ylabel('Frequency')
ax.set_title('Outliers')

fig.tight_layout()

### Manually setting contamination value to 0.05 i.e. 5% contamination

In [None]:
# 5% is in line with the visual inspection from the cumulative sum that I did.
# contamination is set to 0.05 so the code agrees with the section heading,
# this comment and the `_5` variable names (it was 0.03 before).
isof_5 = IsolationForest(contamination=0.05, random_state=8)
# the same estimator is fitted in turn on the full and the channel-only target
y_5_pred = isof_5.fit_predict(y.values.reshape(-1, 1))
y0_5_pred = isof_5.fit_predict(y0.values.reshape(-1, 1))

In [None]:
# label tallies for both fits (1 -> inlier, -1 -> outlier)
num_inliers, num_outliers = ((y_5_pred == flag).sum() for flag in (1, -1))
num_inliers0, num_outliers0 = ((y0_5_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))

width = 0.3  # the width of the bars
x = np.arange(len(labels))  # the label locations

# two bars per label, shifted half a bar width to either side
rect1 = ax.bar(x - width / 2, [num_inliers, num_outliers], width,
               label='Includes 0-channel zeolites')
rect2 = ax.bar(x + width / 2, [num_inliers0, num_outliers0], width,
               label='Excludes 0-channel zeolites')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel('Frequency')
# the percentage in the title is read back from the fitted estimator
ax.set_title('Outlier analysis on k$_{{H,C_{{18}}}}$ using IsolationForest\n'
             'assuming {}% contamination'.format(isof_5.contamination * 100))

for bars in (rect1, rect2):
    autolabel(bars)

ax.legend()
fig.tight_layout()

In [None]:
outliers = y[y_5_pred == -1]
outliers0 = y0[y0_5_pred == -1]

# Common range covering both outlier sets. The upper bound must be the
# larger of the two maxima (it previously used np.minimum, which cut off
# the top of the range).
vmin = np.minimum(outliers.min(), outliers0.min())
vmax = np.maximum(outliers.max(), outliers0.max())

# One shared set of log-spaced edges (nbins + 1 edges -> nbins bins).
nbins = 100
log_edges = np.logspace(np.log10(vmin), np.log10(vmax), nbins + 1)

counts_tot, bin_edges = np.histogram(y, bins=log_edges)
counts0_tot, _ = np.histogram(y0, bins=log_edges)
counts_out, _ = np.histogram(outliers, bins=log_edges)
counts0_out, _ = np.histogram(outliers0, bins=log_edges)

# Per-bin outlier fraction; empty bins are left at 0 instead of producing
# 0/0 = NaN and a runtime warning.
frac = np.divide(counts_out, counts_tot,
                 out=np.zeros(nbins), where=counts_tot > 0)
frac0 = np.divide(counts0_out, counts0_tot,
                  out=np.zeros(nbins), where=counts0_tot > 0)

fig, ax = plt.subplots(figsize=(6, 4))

# Each bar spans exactly its own bin. Without an explicit width every bar
# would be 0.8 data units wide, which is meaningless on a log-scaled axis.
bar_widths = np.diff(bin_edges)

ax.bar(bin_edges[:-1], frac, width=bar_widths,
       align='edge', alpha=0.5, label='Includes 0-channel zeolites')
ax.bar(bin_edges[:-1], frac0, width=bar_widths,
       align='edge', alpha=0.5, label='Excludes 0-channel zeolites')
ax.set_ylabel('Fraction of k$_{H,C_{18}}$ in bin that are outliers')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')

ax.legend()
fig.tight_layout()

In [None]:
outliers = y[y_5_pred == -1]
outliers0 = y0[y0_5_pred == -1]

# Common range covering both outlier sets. The upper bound must be the
# larger of the two maxima (it previously used np.minimum, which dropped
# the largest outliers from the histogram).
vmin = np.minimum(outliers.min(), outliers0.min())
vmax = np.maximum(outliers.max(), outliers0.max())

# Shared log-spaced edges so both histograms use identical bins.
nbins = 100
log_edges = np.logspace(np.log10(vmin), np.log10(vmax), nbins + 1)

fig, ax = plt.subplots(figsize=(6, 4))

ax.hist(outliers,
        bins=log_edges,
        alpha=0.5,
        edgecolor='black',
        label='Includes 0-channel zeolites')
ax.hist(outliers0,
        bins=log_edges,
        alpha=0.5,
        edgecolor='black',
        label='Excludes 0-channel zeolites')
ax.set_title('Outliers')
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')
ax.set_ylabel('Frequency')

ax.legend()
fig.tight_layout()

In [None]:
# display the lower bound of the shared outlier range
vmin

# Multidimensional outlier analysis

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Use the 'auto' setting for `contamination` here; the resulting split is
# inspected afterwards from the predicted labels.

# n_neighbors=20 is reported to work well in general.
# Distance metric: Minkowski with p=2, which is the Euclidean distance.
#   scipy.spatial.distance.minkowski(u, v, p=2, w=None)
#     Compute the Minkowski distance between two 1-D arrays.

lof = LocalOutlierFactor(n_neighbors=20, metric='minkowski', p=2, contamination='auto')

In [None]:
# Four-column input for the channel-only set: PLD_min, U_C18, SETE_C18, target.
multidim_input = np.column_stack((
    X0.PLD_min.values,
    X0.U_C18.values,
    X0.SETE_C18.values,
    y0.values,
))
y_pred = lof.fit_predict(multidim_input)

In [None]:
labels = ['Inlier', 'Outlier']
# tally the two prediction labels: 1 -> inlier, -1 -> outlier
num_inliers, num_outliers = ((y_pred == flag).sum() for flag in (1, -1))

fig, ax = plt.subplots(figsize=(6, 4))
rect = ax.bar(labels, [num_inliers, num_outliers])
ax.set_yscale('log')
ax.set_ylabel('Frequency')
ax.set_title('Outlier analysis on k$_{H,C_{18}}$ using LocalOutlierFactor')

autolabel(rect)

fig.tight_layout()

In [None]:
outliers = y0[y_pred == -1]

fig, ax = plt.subplots(figsize=(6, 4))

# 101 log-spaced edges -> 100 bins between the smallest and largest outlier
ax.hist(
    outliers,
    bins=np.logspace(
        np.log10(outliers.min()),
        np.log10(outliers.max()),
        101,
    ),
    edgecolor='black',
)
ax.set_title('k$_{H,C_{18}}$ Outliers')
# The x axis was labelled with an empty string; use the same quantity/unit
# label as the other target histograms in this notebook.
ax.set_xlabel('k$_{H,C_{18}}$ [mol/kg/MPa]')
ax.set_xscale('log')

fig.tight_layout()

In [None]:
# keep only the samples that were not flagged as outliers (-1)
inlier_mask = y_pred != -1
X_train, X_test, y_train, y_test = train_test_split(
    X0.PLD_min[inlier_mask].values.reshape(-1, 1),
    y0[inlier_mask].values.reshape(-1, 1),
    test_size=0.2,
    random_state=12,
)

# scaler is fitted on the training split only, then applied to both splits
transformer = RobustScaler().fit(X_train)
X_train, X_test = transformer.transform(X_train), transformer.transform(X_test)

In [None]:
# `hubreg = R` referenced an undefined name; the variable name, the imported
# estimator and the log message below all point to HuberRegressor.
hubreg = HuberRegressor()

t0 = time.time()
# y_train is a column vector (n, 1); flatten it to the 1-D target the
# regressor expects.
hubreg.fit(X_train, np.ravel(y_train))
hubreg_fit = time.time() - t0

# root-mean-squared error on the held-out split
rmse = np.sqrt(mean_squared_error(y_test, hubreg.predict(X_test)))
print('HuberRegressor model fitted in {:.3f} s, RMSE: {:.4e}'.format(
    hubreg_fit, rmse))