In [1]:
import os
import gc
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import sentencepiece
import argparse
import pickle
import torch
import time
import matha
from sklearn.preprocessing import MinMaxScaler
from datasets.utils.logging import disable_progress_bar
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()  

ModuleNotFoundError: No module named 'matha'

In [31]:
from rdkit import Chem
def canonicalize(mol):
    mol = Chem.MolToSmiles(Chem.MolFromSmiles(mol),True)
    return mol

def preprocess(df):
    df['REAGENT'] = df['REAGENT'].apply(lambda x: canonicalize(x) if x != ' ' else ' ')
    df['REACTANT'] = df['REACTANT'].apply(lambda x: canonicalize(x) if x != ' ' else ' ')
    df['PRODUCT'] = df['PRODUCT'].apply(lambda x: canonicalize(x) if x != ' ' else ' ')
#     df['YIELD'] = df['YIELD'].clip(0, 100)/100
    df['YIELD'] = df['YIELD'].clip(0, 100)
    df['input'] = df['REACTANT']  + '.' + df['REAGENT'] + '.' + df['PRODUCT']
    df['input'] = df['input'].replace('. ', '')
    df = df[['input', 'YIELD']].drop_duplicates().reset_index(drop=True)
    lens = df['input'].apply(lambda x: len(x))
    # remove data that have too long inputs
    df = df[lens <= 512].reset_index(drop=True)
    
    return df
    
df = pd.read_csv('/data2/sagawa/t5chem/data/C_N_yield/MFF_FullCV_01/train.csv').drop_duplicates().reset_index(drop=True)
train_ds = preprocess(df)

df = pd.read_csv('/data2/sagawa/t5chem/data/C_N_yield/MFF_FullCV_01/test.csv').drop_duplicates().reset_index(drop=True)
valid_ds = preprocess(df)

In [32]:
train_ds

Unnamed: 0,input,YIELD
0,Cc1ccc(N)cc1.Clc1ccccn1.CCN=P(N=P(N(C)C)(N(C)C...,70.410458
1,Brc1ccccn1.Cc1ccc(N)cc1.CCN=P(N=P(N(C)C)(N(C)C...,11.064457
2,CCc1ccc(I)cc1.Cc1ccc(N)cc1.CC(C)c1cc(C(C)C)c(-...,10.223550
3,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Cl)cc1.CCOC(=O)c1cn...,20.083383
4,COc1ccc(Cl)cc1.Cc1ccc(N)cc1.CN1CCCN2CCCN=C12.C...,0.492663
...,...,...
2762,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Br)cc1.CC(C)c1cc(C(...,18.974171
2763,CCc1ccc(Br)cc1.Cc1ccc(N)cc1.CC(C)c1cc(C(C)C)c(...,19.256507
2764,Cc1ccc(N)cc1.Ic1cccnc1.CN(C)C(=NC(C)(C)C)N(C)C...,57.529603
2765,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Br)cc1.CC(C)c1cc(C(...,27.473043


In [49]:
from rdkit.Chem import AllChem
fingerprints = []
dim = 300
for idx, row in train_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train_ds[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
valid_ds[[str(i) for i in range(dim)]] = fingerprints
valid_ds

Unnamed: 0,input,YIELD,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,Brc1ccccn1.Cc1ccc(N)cc1.CN(C)C(=NC(C)(C)C)N(C)...,38.066563,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,Brc1cccnc1.Cc1ccc(N)cc1.CC(C)c1cc(C(C)C)c(-c2c...,14.816461,0,1,1,1,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Br)cc1.CCN=P(N=P(N(...,12.163048,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Cl)cc1.CC(C)c1cc(C(...,8.286468,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,COc1ccc(Cl)cc1.Cc1ccc(N)cc1.CN1CCCN2CCCN=C12.C...,1.068125,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,COc1ccc(Br)cc1.Cc1ccc(N)cc1.CC(C)c1cc(C(C)C)c(...,4.344677,0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1184,COc1ccc(Br)cc1.Cc1ccc(N)cc1.CN1CCCN2CCCN=C12.C...,47.156275,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1185,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Cl)cc1.CCOC(=O)c1cn...,0.701552,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
1186,Cc1ccc(N)cc1.FC(F)(F)c1ccc(Cl)cc1.CCN=P(N=P(N(...,15.561565,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
model = RandomForestRegressor(n_estimators=150, max_depth=20, random_state=42)
model.fit(train_ds[[str(i) for i in range(dim)]], train_ds['YIELD'])
valid_ds['prediction'] = model.predict(valid_ds[[str(i) for i in range(dim)]])
valid_ds['prediction'] = valid_ds['prediction'].clip(0, 100)

print('r2_score:',r2_score(valid_ds['YIELD'], valid_ds['prediction']))
print('rmse:',mean_squared_error(valid_ds['YIELD'], valid_ds['prediction'], squared=False))
print('accuracy:', sum(abs(valid_ds['YIELD'] - valid_ds['prediction']) <= 10)/len(valid_ds))

r2_score: 0.9282094378036595
rmse: 7.3866925634425185
accuracy: 0.8703703703703703


In [41]:
dim=150, max_depth=2
r2_score: 0.3543661016380133
rmse: 22.15182485687387
accuracy: 0.39814814814814814
  
dim=100, max_depth=2
r2_score: 0.24159058668650368
rmse: 24.008675632417553
accuracy: 0.32575757575757575

dim=200, max_depth=2
r2_score: 0.4681987953922866
rmse: 20.104394846101687
accuracy: 0.4621212121212121
    
dim=200, max_depth=4
r2_score: 0.6976708980890631
rmse: 15.158503338659882
accuracy: 0.5429292929292929

dim=200, max_depth=6
r2_score: 0.8219734745137605
rmse: 11.63211546959995
accuracy: 0.7053872053872053
    
dim=200, max_depth=8
r2_score: 0.8726067622958034
rmse: 9.839873934895383
accuracy: 0.7786195286195287
    
dim=200, max_depth=12    
r2_score: 0.9078088235661385
rmse: 8.37068550256063
accuracy: 0.8543771043771043
    
dim=200, max_depth=16   
r2_score: 0.9124202648503621
rmse: 8.158647330237477
accuracy: 0.8594276094276094
    
dim=200, max_depth=24  
r2_score: 0.9123872689490267
rmse: 8.160184081696114
accuracy: 0.8594276094276094
    
dim=200, max_depth=20
r2_score: 0.9125637773229043
rmse: 8.151960008662867
accuracy: 0.8585858585858586
    
dim=250, max_depth=20   
r2_score: 0.9234040234009327
rmse: 7.629908427002618
accuracy: 0.8636363636363636
    
dim=300, max_depth=20   
r2_score: 0.927060687585374
rmse: 7.445556768450514
accuracy: 0.8661616161616161
    
n_estimators=50, dim=300, max_depth=20   
r2_score: 0.9266713822350305
rmse: 7.465400235209676
accuracy: 0.8720538720538721
    
    
r2_score: 0.9282094378036595
rmse: 7.3866925634425185
accuracy: 0.8703703703703703

SyntaxError: cannot assign to literal (2936065047.py, line 1)

In [59]:
import optuna

def objective(trial):
    n_estimators=trial.suggest_int('n_estimators', 10, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 1000)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, random_state=42)
    model.fit(train_ds[[str(i) for i in range(dim)]], train_ds['YIELD'])
    pred = model.predict(valid_ds[[str(i) for i in range(dim)]])
    valid_ds['prediction'] = pred
    valid_ds['prediction'] = valid_ds['prediction'].clip(0, 100)
    score = r2_score(valid_ds['YIELD'], pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2023-03-11 12:02:19,055][0m A new study created in memory with name: no-name-cadf08ca-f9f5-4fa6-b3ab-89598cd2b63b[0m
[32m[I 2023-03-11 12:02:33,365][0m Trial 0 finished with value: 0.8391269565566724 and parameters: {'n_estimators': 989, 'max_depth': 181, 'min_samples_leaf': 20}. Best is trial 0 with value: 0.8391269565566724.[0m
[32m[I 2023-03-11 12:02:42,668][0m Trial 1 finished with value: 0.7185364123716407 and parameters: {'n_estimators': 927, 'max_depth': 984, 'min_samples_leaf': 78}. Best is trial 0 with value: 0.8391269565566724.[0m
[32m[I 2023-03-11 12:02:49,842][0m Trial 2 finished with value: 0.8200464989393891 and parameters: {'n_estimators': 543, 'max_depth': 448, 'min_samples_leaf': 29}. Best is trial 0 with value: 0.8391269565566724.[0m
[32m[I 2023-03-11 12:02:55,065][0m Trial 3 finished with value: 0.7214385215286148 and parameters: {'n_estimators': 516, 'max_depth': 562, 'min_samples_leaf': 76}. Best is trial 0 with value: 0.8391269565566724.[0m


[32m[I 2023-03-11 12:08:29,750][0m Trial 37 finished with value: 0.8133510110437949 and parameters: {'n_estimators': 926, 'max_depth': 484, 'min_samples_leaf': 32}. Best is trial 22 with value: 0.9288686222509754.[0m
[32m[I 2023-03-11 12:08:42,399][0m Trial 38 finished with value: 0.8939118809059359 and parameters: {'n_estimators': 645, 'max_depth': 895, 'min_samples_leaf': 6}. Best is trial 22 with value: 0.9288686222509754.[0m
[32m[I 2023-03-11 12:08:48,544][0m Trial 39 finished with value: 0.7272386505097526 and parameters: {'n_estimators': 590, 'max_depth': 538, 'min_samples_leaf': 71}. Best is trial 22 with value: 0.9288686222509754.[0m
[32m[I 2023-03-11 12:08:59,534][0m Trial 40 finished with value: 0.8372204113905469 and parameters: {'n_estimators': 762, 'max_depth': 956, 'min_samples_leaf': 21}. Best is trial 22 with value: 0.9288686222509754.[0m
[32m[I 2023-03-11 12:09:23,766][0m Trial 41 finished with value: 0.9289334338131053 and parameters: {'n_estimators': 70

In [60]:
study.best_params, study.best_value

({'n_estimators': 707, 'max_depth': 825, 'min_samples_leaf': 1},
 0.9289334338131053)

In [62]:
import optuna

def objective(trial):
    n_estimators=trial.suggest_int('n_estimators', 500, 1000)
    max_depth = trial.suggest_int('max_depth', 500, 1000)

    fingerprints = []
    dim = trial.suggest_int('dim', 300, 1000)
    for idx, row in train_ds.iterrows():
        mol = Chem.MolFromSmiles(row['input'])
        fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
    train_ds[[str(i) for i in range(dim)]] = fingerprints
    fingerprints = []
    for idx, row in valid_ds.iterrows():
        mol = Chem.MolFromSmiles(row['input'])
        fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
    valid_ds[[str(i) for i in range(dim)]] = fingerprints

    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(train_ds[[str(i) for i in range(dim)]], train_ds['YIELD'])
    pred = model.predict(valid_ds[[str(i) for i in range(dim)]])
    valid_ds['prediction'] = pred
    valid_ds['prediction'] = valid_ds['prediction'].clip(0, 100)
    score = r2_score(valid_ds['YIELD'], pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2023-03-11 12:18:29,144][0m A new study created in memory with name: no-name-a8ea72c5-838c-48a4-af3a-9498a307fb90[0m
[32m[I 2023-03-11 12:19:25,763][0m Trial 0 finished with value: 0.921604841120067 and parameters: {'n_estimators': 925, 'max_depth': 547, 'dim': 498}. Best is trial 0 with value: 0.921604841120067.[0m
[32m[I 2023-03-11 12:20:25,609][0m Trial 1 finished with value: 0.917817445291683 and parameters: {'n_estimators': 723, 'max_depth': 943, 'dim': 804}. Best is trial 0 with value: 0.921604841120067.[0m
[32m[I 2023-03-11 12:21:15,566][0m Trial 2 finished with value: 0.9198564979399313 and parameters: {'n_estimators': 809, 'max_depth': 742, 'dim': 524}. Best is trial 0 with value: 0.921604841120067.[0m
[32m[I 2023-03-11 12:22:11,771][0m Trial 3 finished with value: 0.919949942750369 and parameters: {'n_estimators': 813, 'max_depth': 787, 'dim': 797}. Best is trial 0 with value: 0.921604841120067.[0m
[32m[I 2023-03-11 12:22:54,679][0m Trial 4 finished wi

[32m[I 2023-03-11 12:50:40,302][0m Trial 40 finished with value: 0.9214752094277836 and parameters: {'n_estimators': 744, 'max_depth': 535, 'dim': 306}. Best is trial 25 with value: 0.9317464580172288.[0m
[32m[I 2023-03-11 12:51:32,271][0m Trial 41 finished with value: 0.9333082981373663 and parameters: {'n_estimators': 833, 'max_depth': 891, 'dim': 619}. Best is trial 41 with value: 0.9333082981373663.[0m
[32m[I 2023-03-11 12:52:22,167][0m Trial 42 finished with value: 0.9208930925225427 and parameters: {'n_estimators': 841, 'max_depth': 954, 'dim': 527}. Best is trial 41 with value: 0.9333082981373663.[0m
[32m[I 2023-03-11 12:53:15,999][0m Trial 43 finished with value: 0.9219569232902268 and parameters: {'n_estimators': 809, 'max_depth': 919, 'dim': 701}. Best is trial 41 with value: 0.9333082981373663.[0m
[32m[I 2023-03-11 12:54:09,232][0m Trial 44 finished with value: 0.925918066072061 and parameters: {'n_estimators': 850, 'max_depth': 637, 'dim': 623}. Best is trial 

In [63]:
from rdkit.Chem import AllChem
fingerprints = []
dim = 682
for idx, row in train_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train_ds[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
valid_ds[[str(i) for i in range(dim)]] = fingerprints


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
model = RandomForestRegressor(n_estimators=785, max_depth=914, random_state=42)
model.fit(train_ds[[str(i) for i in range(dim)]], train_ds['YIELD'])
valid_ds['prediction'] = model.predict(valid_ds[[str(i) for i in range(dim)]])
valid_ds['prediction'] = valid_ds['prediction'].clip(0, 100)

print('r2_score:',r2_score(valid_ds['YIELD'], valid_ds['prediction']))
print('rmse:',mean_squared_error(valid_ds['YIELD'], valid_ds['prediction'], squared=False))
print('accuracy:', sum(abs(valid_ds['YIELD'] - valid_ds['prediction']) <= 10)/len(valid_ds))

r2_score: 0.9243766023716338
rmse: 7.5813132158036725
accuracy: 0.8627946127946128


In [65]:
# trainとtestを交換
import optuna

def objective(trial):
    n_estimators=trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 100, 1000)

    fingerprints = []
    dim = trial.suggest_int('dim', 100, 1000)
    for idx, row in train_ds.iterrows():
        mol = Chem.MolFromSmiles(row['input'])
        fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
    train_ds[[str(i) for i in range(dim)]] = fingerprints
    fingerprints = []
    for idx, row in valid_ds.iterrows():
        mol = Chem.MolFromSmiles(row['input'])
        fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
    valid_ds[[str(i) for i in range(dim)]] = fingerprints

    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(valid_ds[[str(i) for i in range(dim)]], valid_ds['YIELD'])
    pred = model.predict(train_ds[[str(i) for i in range(dim)]])
    train_ds['prediction'] = pred
    train_ds['prediction'] = train_ds['prediction'].clip(0, 100)
    score = r2_score(train_ds['YIELD'], pred)
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2023-03-11 13:50:14,745][0m A new study created in memory with name: no-name-a4894664-2ade-4735-9c36-712862b663ef[0m
[32m[I 2023-03-11 13:50:45,577][0m Trial 0 finished with value: 0.8534727479418893 and parameters: {'n_estimators': 916, 'max_depth': 196, 'dim': 618}. Best is trial 0 with value: 0.8534727479418893.[0m
[32m[I 2023-03-11 13:51:02,499][0m Trial 1 finished with value: 0.8549733715405027 and parameters: {'n_estimators': 217, 'max_depth': 230, 'dim': 921}. Best is trial 1 with value: 0.8549733715405027.[0m
[32m[I 2023-03-11 13:51:28,627][0m Trial 2 finished with value: 0.8557771159428097 and parameters: {'n_estimators': 933, 'max_depth': 766, 'dim': 423}. Best is trial 2 with value: 0.8557771159428097.[0m
[32m[I 2023-03-11 13:51:49,847][0m Trial 3 finished with value: 0.8610801336452373 and parameters: {'n_estimators': 422, 'max_depth': 606, 'dim': 798}. Best is trial 3 with value: 0.8610801336452373.[0m
[32m[I 2023-03-11 13:51:59,735][0m Trial 4 fini

[32m[I 2023-03-11 14:06:30,260][0m Trial 39 finished with value: 0.8558757374302958 and parameters: {'n_estimators': 444, 'max_depth': 471, 'dim': 690}. Best is trial 11 with value: 0.8696691746229538.[0m
[32m[I 2023-03-11 14:06:57,058][0m Trial 40 finished with value: 0.8631928405957224 and parameters: {'n_estimators': 793, 'max_depth': 178, 'dim': 574}. Best is trial 11 with value: 0.8696691746229538.[0m
[32m[I 2023-03-11 14:07:30,406][0m Trial 41 finished with value: 0.8613745752448527 and parameters: {'n_estimators': 799, 'max_depth': 377, 'dim': 929}. Best is trial 11 with value: 0.8696691746229538.[0m
[32m[I 2023-03-11 14:08:08,173][0m Trial 42 finished with value: 0.8524071990570382 and parameters: {'n_estimators': 996, 'max_depth': 630, 'dim': 848}. Best is trial 11 with value: 0.8696691746229538.[0m
[32m[I 2023-03-11 14:08:37,066][0m Trial 43 finished with value: 0.8617385807185559 and parameters: {'n_estimators': 703, 'max_depth': 536, 'dim': 779}. Best is trial

In [66]:
from rdkit.Chem import AllChem
fingerprints = []
dim = 865
for idx, row in train_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train_ds[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
valid_ds[[str(i) for i in range(dim)]] = fingerprints


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
model = RandomForestRegressor(n_estimators=785, max_depth=914, random_state=42)
model.fit(valid_ds[[str(i) for i in range(dim)]], valid_ds['YIELD'])
train_ds['prediction'] = model.predict(train_ds[[str(i) for i in range(dim)]])
train_ds['prediction'] = train_ds['prediction'].clip(0, 100)

print('r2_score:',r2_score(train_ds['YIELD'], train_ds['prediction']))
print('rmse:',mean_squared_error(train_ds['YIELD'], train_ds['prediction'], squared=False))
print('accuracy:', sum(abs(train_ds['YIELD'] - train_ds['prediction']) <= 10)/len(train_ds))

r2_score: 0.8535538249146555
rmse: 10.392744506983604
accuracy: 0.7647271413082761


## ordでrandom-forestを学習

In [2]:
from rdkit import Chem
def canonicalize(mol):
    mol = Chem.MolToSmiles(Chem.MolFromSmiles(mol),True)
    return mol

def preprocess(df):
    df['REAGENT'] = df['REAGENT'].apply(lambda x: canonicalize(x) if x != ' ' else ' ')
    df['REACTANT'] = df['REACTANT'].apply(lambda x: canonicalize(x) if x != ' ' else ' ')
    df['PRODUCT'] = df['PRODUCT'].apply(lambda x: canonicalize(x) if x != ' ' else ' ')
#     df['YIELD'] = df['YIELD'].clip(0, 100)/100
    df['YIELD'] = df['YIELD'].clip(0, 100)
    df['input'] = df['REACTANT']  + '.' + df['REAGENT'] + '.' + df['PRODUCT']
    df['input'] = df['input'].str.replace('. ', '')
    df = df[['input', 'YIELD']].drop_duplicates().reset_index(drop=True)
    lens = df['input'].apply(lambda x: len(x))
    # remove data that have too long inputs
    df = df[lens <= 512].reset_index(drop=True)
    
    return df

def preprocess_ord(df):
    df['input'] = df['input'].str.replace('REAGENT:', '.')
    df['input'] = df['input'].str.replace('REACTANT:', '')
    df['input'] = df['input'].str.replace('PRODUCT:', '.')
    df['input'] = df['input'].str.replace('. ', '')
    
    return df
    
df = pd.read_csv('/data2/sagawa/regression-input-train.csv').drop_duplicates().reset_index(drop=True)
train_ds = preprocess_ord(df)

df = pd.read_csv('/data2/sagawa/t5chem/data/C_N_yield/MFF_FullCV_01/test.csv').drop_duplicates().reset_index(drop=True)
valid_ds = preprocess(df)

In [3]:
train_ds.loc[0, 'input']

'Cl.Nc1nc2[nH]nc(-c3ccccc3)c2s1.O=C(Cl)c1ccncc1.[Na+].[OH-].c1ccncc1.CNC(=O)c1ncn2c(C)cc(C)[n+](C)c12.[Cl-]'

In [4]:
valid_ds.loc[0, 'input']

'Brc1ccccn1.Cc1ccc(N)cc1.CN(C)C(=NC(C)(C)C)N(C)C.COc1ccc(OC)c(P(C(C)(C)C)(C(C)(C)C)->[Pd]2(OS(=O)(=O)C(F)(F)F)<-Nc3ccccc3-c3ccccc32)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.CS(C)=O.c1ccc(CN(Cc2ccccc2)c2ccno2)cc1.Cc1ccc(Nc2ccccn2)cc1'

In [None]:
from rdkit.Chem import AllChem
fingerprints = []
dim = 865
for idx, row in train_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train_ds[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid_ds.iterrows():
    mol = Chem.MolFromSmiles(row['input'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
valid_ds[[str(i) for i in range(dim)]] = fingerprints


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
model = RandomForestRegressor(n_estimators=785, max_depth=914, random_state=42)
model.fit(train_ds[[str(i) for i in range(dim)]], train_ds['YIELD'])
valid_ds['prediction'] = model.predict(valid_ds[[str(i) for i in range(dim)]])
valid_ds['prediction'] = valid_ds['prediction'].clip(0, 100)

print('r2_score:',r2_score(valid_ds['YIELD'], valid_ds['prediction']))
print('rmse:',mean_squared_error(valid_ds['YIELD'], valid_ds['prediction'], squared=False))
print('accuracy:', sum(abs(valid_ds['YIELD'] - valid_ds['prediction']) <= 10)/len(valid_ds))