# Machine Learning CO<sub>2</sub> Working Capacity of MOFs

โดย รังสิมันต์ เกษแก้ว <br>
มหาวิทยาลัยแห่งซูริค <br>
E-mail: rangsiman1993@gmail.com

## 0. Import packages

In [None]:
# import standard scientific libraries
import os
import math
import numpy as np
import pandas as pd

# import ML models from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error

## 1. Import the data

In [None]:
# Show every column when a DataFrame is displayed. The fully qualified
# option name is required: the bare pattern "max_columns" matches more than
# one option on recent pandas releases and raises an OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 8)

# Directory that holds train.csv and pretest.csv.
dataset = "./"

เนื่องจากในโปรเจ็คนี้เป็นการสาธิตการรัน Machine Learning ดังนั้นจึงจะขอดึงข้อมูล MOFs แค่ 20,000 โครงสร้างแรกเท่านั้น !!!

In [None]:
# Load the training table from the dataset directory and report its
# (rows, columns) shape.
train_csv = f"{dataset}train.csv"
train = pd.read_csv(train_csv)
train.shape

(68613, 14)

In [None]:
# preview the first rows of the raw training table
train.head()

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_1,1116.667429,875.2406,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.86416611,6.786041,105.28450172
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678033,7.147286,101.22477418
2,mof_unit_3,1089.818728,773.68796,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.26372569,6.347967,118.98701075
3,mof_unit_4,2205.198301,1304.63872,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.70137692,6.190085,187.6260045
4,mof_unit_5,1137.800963,901.73612,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.00183795,6.478063,79.21000066


In [None]:
# Select the MOFs with 'sra' topology in which at least one of the two
# organic linkers has ID 2.
is_sra = train['topology'] == 'sra'
has_linker_2 = (train['organic_linker1'] == 2) | (train['organic_linker2'] == 2)
train[is_sra & has_linker_2]

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
42,mof_unit_43,2379.449687,1423.610400,1140.40,0.18193,0.1831,NO2-OH,9,2,25,sra,42.82049017,6.364212,295.42528235
50,mof_unit_51,1640.841092,1192.580240,0.00,0.04781,0.0396,NHMe,9,12,2,sra,21.35104563,10.039954,117.52839437
68,mof_unit_69,2258.507588,1306.676060,1274.44,0.20725,0.2157,OMe,9,2,2,sra,47.82378078,6.472639,254.73737987
78,mof_unit_79,1912.868035,1414.688000,0.00,0.07520,0.0612,NO2,9,2,16,sra,169.60821490,8.843831,320.03489413
99,mof_unit_100,1644.783972,1148.377213,709.86,0.14109,0.1217,F,9,12,2,sra,27.86329795,7.457391,178.92481965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68472,mof_unit_68473,1626.038990,1644.694080,0.00,0.00000,0.0000,OH-CN,9,2,22,sra,0.00000000,,-0.05232927
68514,mof_unit_68515,1609.898593,1546.665379,0.00,0.00000,0.0000,F-CN,9,2,22,sra,0.00000000,,-0.35194320
68565,mof_unit_68566,1438.870103,1518.401040,0.00,0.00000,0.0000,OMe-Cl,9,5,2,sra,9.32113550,,-1.90019614
68579,mof_unit_68580,1999.622538,1661.330800,0.00,0.00000,0.0000,H-Et,9,2,16,sra,0.00000000,,-2.62224097


In [None]:
# Map each column position to its column name, for use with iloc below.
feat = dict(enumerate(train.columns.tolist()))
feat

{0: 'MOFname',
 1: 'volume [A^3]',
 2: 'weight [u]',
 3: 'surface_area [m^2/g]',
 4: 'void_fraction',
 5: 'void_volume [cm^3/g]',
 6: 'functional_groups',
 7: 'metal_linker',
 8: 'organic_linker1',
 9: 'organic_linker2',
 10: 'topology',
 11: 'CO2/N2_selectivity',
 12: 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
 13: 'CO2_working_capacity [mL/g]'}

## 2. Clean data

In [None]:
# Keep only the first 20,000 structures (this notebook is a demonstration)
# and only the numeric columns: positions 0 (MOFname), 6 (functional_groups)
# and 10 (topology) are dropped. Position 13, the CO2 working capacity
# target, is kept as the last column.
# Without the row limit the later cells would run on the full table, which
# does not match the 20,000-row outputs shown in the rest of the notebook.
n_structures = 20000
numeric_columns = [1, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13]
train = train.iloc[:n_structures, numeric_columns]
train

Unnamed: 0,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],metal_linker,organic_linker1,organic_linker2,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,1116.667429,875.240600,0.00,0.07899,0.0607,3,4,11,22.86416611,6.786041,105.28450172
1,2769.503842,2211.697211,603.61,0.13794,0.1040,10,44,57,33.61678033,7.147286,101.22477418
2,1089.818728,773.687960,788.50,0.14874,0.1262,2,22,24,19.26372569,6.347967,118.98701075
3,2205.198301,1304.638720,1441.53,0.21814,0.2220,9,17,24,25.70137692,6.190085,187.62600450
4,1137.800963,901.736120,0.00,0.07778,0.0591,2,1,22,30.00183795,6.478063,79.21000066
...,...,...,...,...,...,...,...,...,...,...,...
19995,5189.923599,2423.541440,2944.18,0.33672,0.4342,12,11,11,8.85242576,4.787996,63.35786482
19996,13710.436745,4426.378000,3208.43,0.44798,0.8356,3,3,3,14.80568930,4.909318,90.66697340
19997,1215.143789,691.014960,2102.05,0.33106,0.3506,3,1,19,18.93154465,5.574042,78.37355779
19998,1271.408263,713.463760,1818.66,0.22698,0.2436,2,12,13,19.42797408,5.772102,114.99909150


In [None]:
# flag every column that contains at least one NaN (axis=0 reduces over the rows)
train.isnull().any(axis=0)

volume [A^3]                                     False
weight [u]                                       False
surface_area [m^2/g]                             False
void_fraction                                    False
void_volume [cm^3/g]                             False
metal_linker                                     False
organic_linker1                                  False
organic_linker2                                  False
CO2/N2_selectivity                               False
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    False
CO2_working_capacity [mL/g]                      False
dtype: bool

In [None]:
# flag every column that contains at least one infinite value (axis=0 reduces over the rows)
np.isinf(train).any(axis=0)

volume [A^3]                                     False
weight [u]                                       False
surface_area [m^2/g]                             False
void_fraction                                    False
void_volume [cm^3/g]                             False
metal_linker                                     False
organic_linker1                                  False
organic_linker2                                  False
CO2/N2_selectivity                               False
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    False
CO2_working_capacity [mL/g]                      False
dtype: bool

## 3. Prepare training and test sets

Prepare input
- x_train = train input
- y_train = train output
- x_test = test input
- y_true = test output (ground truth)

In [None]:
# Fraction of the rows used for training; the remainder is the test set.
ratio = 0.8
# Use `ratio` itself so that changing it actually changes the split.
train_size = math.floor(train.shape[0] * ratio)
test_size = train.shape[0] - train_size
print("dataset size:", train.shape[0])
print("train size:", train_size)
print("test size:", test_size)

dataset size: 20000
train size: 16000
test size: 4000


In [None]:
# Rebuild the position -> column-name map for the cleaned table.
feat = dict(enumerate(train.columns.tolist()))
feat

{0: 'volume [A^3]',
 1: 'weight [u]',
 2: 'surface_area [m^2/g]',
 3: 'void_fraction',
 4: 'void_volume [cm^3/g]',
 5: 'metal_linker',
 6: 'organic_linker1',
 7: 'organic_linker2',
 8: 'CO2/N2_selectivity',
 9: 'heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]',
 10: 'CO2_working_capacity [mL/g]'}

In [None]:
# Every column except the last is an input feature; the last column is the
# target. Both are cast to float32 before being sliced.
features = train.iloc[:, :-1].astype(np.float32)
target = train.iloc[:, -1].astype(np.float32)

# The first `train_size` rows train the model, the next `test_size` rows
# are held out for evaluation.
test_end = train_size + test_size
x_train = features.iloc[0:train_size]
y_train = target.iloc[0:train_size]
x_test = features.iloc[train_size:test_end]
y_true = target.iloc[train_size:test_end]

## 4. Neural network

### 4.1: Feed forward neural network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Fully connected regression network: three ReLU layers followed by a
# single linear output unit.
n_features = x_train.shape[1]
model = Sequential([
    Dense(12, input_dim=n_features, activation='relu'),  # input
    Dense(24, activation='relu'),  # hidden 1
    Dense(12, activation='relu'),  # hidden 2
    Dense(1, activation='linear'),  # output
])

2021-10-02 05:28:34.439591: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-02 05:28:34.439850: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-02 05:28:34.439867: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-02 05:28:34.439893: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (p-298852d4-1a55-4c9f-9b50-0f7ec627b93a): /proc/driver/nvidia/version does not exist
2021-10-02 05:28:34.440117: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebu

In [None]:
# Train with mean absolute error as the loss. No extra metric is requested:
# 'accuracy' is a classification metric and carries no meaning for a
# continuous regression target, and the reported loss already is the MAE.
model.compile(loss='mae', optimizer='adam')
model.fit(x_train, y_train, epochs=100, batch_size=64)

2021-10-02 05:28:34.561814: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-02 05:28:34.579301: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199555000 Hz
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/

<tensorflow.python.keras.callbacks.History at 0x7f82806fc090>

<details>
<summary> <font color='green'>Click here for some more information about the hyperparameters of the neural network</font></summary>
We use MAE as a loss function in the neural network but we use LMAE as a metric in our competition. Is this reasonable?
</details>

In [None]:
# predict on the held-out test features
y_pred = model.predict(x_test)
y_pred

array([[188.59196 ],
       [ 77.22174 ],
       [320.67645 ],
       ...,
       [ 98.66495 ],
       [117.793365],
       [189.86482 ]], dtype=float32)

In [None]:
# Natural log of the mean absolute error on the test split. Arguments are
# passed in the documented (y_true, y_pred) order; the value is unchanged
# because the absolute error is symmetric.
test_mae = mean_absolute_error(y_true, y_pred)
log_mae = np.log(test_mae)
log_mae

3.4873366

### 4.2: Customization

In [None]:
import tensorflow.keras.backend as kb

def custom_loss(y_actual, y_pred):
    """Return the base-10 logarithm of the batch mean absolute error.

    Args:
        y_actual: tensor holding the target values of the batch.
        y_pred: tensor holding the model predictions for the batch.

    Returns:
        Scalar tensor equal to
        log10(sum(|y_actual - y_pred|) / number of samples in the batch).
    """
    abs_error_sum = kb.sum(kb.abs(y_actual - y_pred))
    # Read the batch size from the dynamic shape. The static
    # `y_actual.shape[0]` is None whenever the batch dimension is unknown
    # at trace time, and dividing a tensor by None raises an error.
    batch_size = kb.cast(kb.shape(y_actual)[0], abs_error_sum.dtype)
    return tf.experimental.numpy.log10(abs_error_sum / batch_size)

In [None]:
# NOTE: `model` is the same object that was already fitted with the MAE
# loss; compile() replaces the loss and optimizer but does not
# re-initialise the weights, so this run continues from them.
# 'accuracy' is a classification metric and is meaningless for regression;
# track the plain MAE next to the log-scaled loss instead.
model.compile(loss=custom_loss, optimizer='adam', metrics=['mae'])
model.fit(x_train, y_train, epochs=100, batch_size=64)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f826280d050>

In [None]:
# Predict on the held-out test features.
y_pred = model.predict(x_test)
# Natural log of the test MAE, with the arguments in the documented
# (y_true, y_pred) order; the value is unchanged because the absolute
# error is symmetric.
test_mae = mean_absolute_error(y_true, y_pred)
log_mae = np.log(test_mae)
log_mae

3.429515

## 5. Predicting CO<sub>2</sub> WC 

### Import pretest and preprocessing

Today we will prepare a submission file for the pretest set for phase 1 (Development).

In [None]:
# Load the pretest table from the dataset directory and report its
# (rows, columns) shape.
pretest_csv = f"{dataset}pretest.csv"
pretest = pd.read_csv(pretest_csv)
pretest.shape

(2000, 13)

In [None]:
# Replace the two text columns by the integer codes of their pandas
# categories, then list the resulting column dtypes.
categorical_columns = ("functional_groups", "topology")
for name in categorical_columns:
    as_category = pretest[name].astype("category")
    pretest[name] = as_category.cat.codes

pretest.dtypes

MOFname                                           object
volume [A^3]                                     float64
weight [u]                                       float64
surface_area [m^2/g]                             float64
void_fraction                                    float64
void_volume [cm^3/g]                             float64
functional_groups                                  int16
metal_linker                                       int64
organic_linker1                                    int64
organic_linker2                                    int64
topology                                            int8
CO2/N2_selectivity                               float64
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    float64
dtype: object

In [None]:
# display the pretest table after the categorical columns were replaced by integer codes
pretest

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_pretest_1,6288.293858,2271.687140,4148.48,0.41225,0.6872,305,4,7,27,0,14.04879074,5.604779
1,mof_unit_pretest_2,1790.506437,887.747320,2191.34,0.30231,0.3672,149,2,4,26,5,20.21722169,6.148776
2,mof_unit_pretest_3,2348.969203,1239.765880,2030.88,0.28533,0.3256,371,3,18,22,5,33.10866151,6.164397
3,mof_unit_pretest_4,2941.571525,1147.951400,3587.13,0.41963,0.6475,91,2,8,15,5,12.80056168,5.164957
4,mof_unit_pretest_5,705.397601,643.270740,0.00,0.07060,0.0466,65,3,10,22,5,23.39561652,7.090687
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,mof_unit_pretest_1996,5111.109714,1578.082220,3630.31,0.58981,1.1504,49,2,1,14,4,4.77869779,3.675003
1996,mof_unit_pretest_1997,911.269336,481.279680,2546.02,0.36132,0.4120,26,2,10,20,5,6.88338750,3.732121
1997,mof_unit_pretest_1998,4236.596494,1127.792600,4296.42,0.60298,1.3641,143,2,7,20,5,5.27073403,3.354425
1998,mof_unit_pretest_1999,22861.645381,3492.712720,6252.01,0.75732,2.9852,186,2,6,11,4,3.17291438,2.643592


### Let's predict and create a submission file

Join the [Codalab competition](https://competitions.codalab.org/competitions/34540) for this course!

Create a `submission.csv` with your predictions to join the competition and upload it to the competition site.

In [None]:
# Give the network exactly the feature columns it was trained on, selected
# by name (so the order always matches x_train) and cast to the float32
# dtype used for training, instead of relying on hard-coded positions.
pretest_features = pretest[x_train.columns].astype(np.float32)
pretest_pred = model.predict(pretest_features)
pretest_pred

array([[ 59.207752],
       [120.78079 ],
       [195.10524 ],
       ...,
       [ 18.645304],
       [ 12.275341],
       [ 57.88158 ]], dtype=float32)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=298852d4-1a55-4c9f-9b50-0f7ec627b93a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>