In [1]:
from sklearn.ensemble import RandomForestClassifier
import os
import gc
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup
import datasets
from datasets import load_dataset, load_metric
import sentencepiece
import argparse
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import AdamW
import pickle
import time
import math
from sklearn.preprocessing import MinMaxScaler
from datasets.utils.logging import disable_progress_bar
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed=42)  

train = pd.read_csv('../../classification-input-train.csv')
valid = pd.read_csv('../../classification-input-valid.csv')

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
info = {}
np.array(AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(train.loc[0, 'smiles']), 2, 128))

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [3]:
fingerprints = []
dim = 100
for idx, row in train.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
valid[[str(i) for i in range(dim)]] = fingerprints
valid



Unnamed: 0,smiles,target,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,C1CCOC1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,CCCCCC[N+](CCCCCC)(CCCCCC)CCCCCC,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,[F-],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C[N+](C)(C)C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,[W],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
613688,FCF,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613689,C[C@H]1c2cccc(O)c2C(=O)C2=C(O)[C@]3(O)C(=O)C(C...,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,1,0,0,0,0
613690,O=P(=O)O,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613691,CC(O)COC(C)CO,0,1,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1




Unnamed: 0,smiles,target,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,CC(C)(C)c1ccnc(-c2cc(C(C)(C)C)ccn2)c1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,COCCOC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cl[Ni]Cl,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,F[P-](F)(F)(F)(F)F,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68184,FCC(CF)OC1CCNCC1,1,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
68185,COC1CCN(C(=O)c2cc(Cc3n[nH]c(=O)c4c([N+](=O)[O-...,1,0,0,0,1,0,0,1,0,...,1,0,1,0,1,1,1,1,1,1
68186,COC1CCN(C(=O)c2cc(Cc3n[nH]c(=O)c4cccc([N+](=O)...,1,0,0,0,0,0,0,1,0,...,1,0,1,0,1,1,1,1,1,1
68187,C1CCC(OC2CCNCC2)C1,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [6]:
train['target'] *= -1
train['target'] += 1
valid['target'] *= -1
valid['target'] += 1
train['target'].value_counts()

0    610886
1      2807
Name: target, dtype: int64

In [8]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [10]:
model = RandomForestClassifier(max_depth=2, random_state=42)
model.fit(train[[str(i) for i in range(dim)]], train['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])
# 横:pred、縦:label

precision: 0.0 recall: 0.0 f1score: 0.0


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,67877,0
REACTANT_true,312,0


In [11]:
train_c = train.copy()

In [12]:
pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*10, replace=True)])['target'].value_counts()

0    610886
1     30877
Name: target, dtype: int64

In [13]:
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*10, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.0 recall: 0.0 f1score: 0.0


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,67877,0
REACTANT_true,312,0


In [14]:
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.044920235096557515 recall: 0.6858974358974359 f1score: 0.08431836091410559


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,63327,4550
REACTANT_true,98,214


In [15]:
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*200, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.021603493994696615 recall: 0.8878205128205128 f1score 0.042180599969544694


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,55332,12545
REACTANT_true,35,277


In [19]:
# train_data = pd.concat([train.sample(frac=0.1), train[train['target'] == 0].sample(n=len(train[train['target'] == 0])*100, replace=True).sample(frac=0.1)])
# best = 0
# for n_estimators in [2, 4, 8, 16, 32, 48, 64, 96]:
#     for max_depth in [2, 4, 8, 16, 32, 48, 64, 96]:
#         model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
#         model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
        
#         pred = model.predict(valid[[str(i) for i in range(dim)]])
#         print('n_estimators:', n_estimators, 'max_depth:', max_depth)
#         print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
#         display(pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred']))
#         if f1_score(valid['target'], pred) > best:
#             best = f1_score(valid['target'], pred)
#             best_param = [n_estimators, max_depth]

In [20]:
# best, best_param

In [41]:
# model = RandomForestClassifier(n_estimators=96, max_depth=32, random_state=42)
# train_data = pd.concat([train, train[train['target'] == 0].sample(n=len(train[train['target'] == 0])*100, replace=True)])
# model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
# pred = model.predict(valid[[str(i) for i in range(dim)]])
# print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
# pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.9979268809394104 recall: 0.9715662153601368 f1score: 0.9845701360843827


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,175,137
REACTANT_true,1930,65947


In [16]:
import optuna
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)]).sample(frac=0.05)

def objective(trial):
    n_estimators=trial.suggest_int('n_estimators', 10, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 1000)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
    pred = model.predict(valid[[str(i) for i in range(dim)]])
    score =  f1_score(valid['target'], pred)
    return 1 - score

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

[32m[I 2023-01-14 10:59:02,924][0m A new study created in memory with name: no-name-61901d7e-8f17-4dbe-82e9-76157f02244a[0m
[32m[I 2023-01-14 10:59:22,200][0m Trial 0 finished with value: 0.8953807213668002 and parameters: {'n_estimators': 320, 'max_depth': 151}. Best is trial 0 with value: 0.8953807213668002.[0m
[32m[I 2023-01-14 11:00:16,186][0m Trial 1 finished with value: 0.8951137320977254 and parameters: {'n_estimators': 898, 'max_depth': 170}. Best is trial 1 with value: 0.8951137320977254.[0m
[32m[I 2023-01-14 11:01:02,389][0m Trial 2 finished with value: 0.8960437710437711 and parameters: {'n_estimators': 771, 'max_depth': 145}. Best is trial 1 with value: 0.8951137320977254.[0m
[32m[I 2023-01-14 11:01:30,190][0m Trial 3 finished with value: 0.894869638351556 and parameters: {'n_estimators': 462, 'max_depth': 510}. Best is trial 3 with value: 0.894869638351556.[0m
[32m[I 2023-01-14 11:02:12,717][0m Trial 4 finished with value: 0.8952020202020202 and parameters

In [17]:
study.best_params, study.best_value

({'n_estimators': 462, 'max_depth': 510}, 0.894869638351556)

In [18]:
model = RandomForestClassifier(**study.best_params, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.06544077879935101 recall: 0.38782051282051283 f1score: 0.1119851920407219


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,66149,1728
REACTANT_true,191,121


In [22]:
fingerprints = []
dim = 150
for idx, row in train.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
valid[[str(i) for i in range(dim)]] = fingerprints
valid



Unnamed: 0,smiles,target,0,1,2,3,4,5,6,7,...,140,141,142,143,144,145,146,147,148,149
0,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,1,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,CC(C)(C)c1ccnc(-c2cc(C(C)(C)C)ccn2)c1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,COCCOC,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cl[Ni]Cl,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,F[P-](F)(F)(F)(F)F,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68184,FCC(CF)OC1CCNCC1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
68185,COC1CCN(C(=O)c2cc(Cc3n[nH]c(=O)c4c([N+](=O)[O-...,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,1
68186,COC1CCN(C(=O)c2cc(Cc3n[nH]c(=O)c4cccc([N+](=O)...,0,0,0,0,0,1,0,0,0,...,1,0,0,1,0,1,0,0,1,1
68187,C1CCC(OC2CCNCC2)C1,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,0,1


In [23]:
model = RandomForestClassifier(max_depth=2, random_state=42)
model.fit(train[[str(i) for i in range(dim)]], train['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])
# 横:pred、縦:label

precision: 0.0 recall: 0.0 f1score: 0.0


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,67877,0
REACTANT_true,312,0


In [25]:
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.06782802075611565 recall: 0.5865384615384616 f1score: 0.12159468438538207


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,65362,2515
REACTANT_true,129,183


In [26]:
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*200, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.02114408734430263 recall: 0.8814102564102564 f1score 0.041297492115933324


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,55146,12731
REACTANT_true,37,275


In [28]:
# fingerprints = []
dim = 150
# for idx, row in train.iterrows():
#     mol = Chem.MolFromSmiles(row['smiles'])
#     fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
# train[[str(i) for i in range(dim)]] = fingerprints
# fingerprints = []
# for idx, row in valid.iterrows():
#     mol = Chem.MolFromSmiles(row['smiles'])
#     fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
# valid[[str(i) for i in range(dim)]] = fingerprints
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.08896591565569036 recall: 0.4935897435897436 f1score: 0.15075868820362215


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,66300,1577
REACTANT_true,158,154


In [35]:
fingerprints = []
dim = 200
for idx, row in train.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
train[[str(i) for i in range(dim)]] = fingerprints
fingerprints = []
for idx, row in valid.iterrows():
    mol = Chem.MolFromSmiles(row['smiles'])
    fingerprints.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)))
# valid[[str(i) for i in range(dim)]] = fingerprints
dim = 100
model = RandomForestClassifier(max_depth=2, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.2182741116751269 recall: 0.41346153846153844 f1score: 0.2857142857142857


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,67415,462
REACTANT_true,183,129


In [40]:
import optuna
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)]).sample(frac=0.05)
dim = 100
def objective(trial):
    n_estimators=trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 1000)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
    pred = model.predict(valid[[str(i) for i in range(dim)]])
    score =  f1_score(valid['target'], pred)
    return 1 - score

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[32m[I 2023-01-14 12:54:16,551][0m A new study created in memory with name: no-name-21624416-1fcb-4f9b-b3e7-91794268e8ed[0m
[32m[I 2023-01-14 12:54:58,793][0m Trial 0 finished with value: 0.9032992036405005 and parameters: {'n_estimators': 695, 'max_depth': 278}. Best is trial 0 with value: 0.9032992036405005.[0m
[32m[I 2023-01-14 12:55:49,506][0m Trial 1 finished with value: 0.9031155015197568 and parameters: {'n_estimators': 834, 'max_depth': 118}. Best is trial 1 with value: 0.9031155015197568.[0m
[32m[I 2023-01-14 12:56:26,621][0m Trial 2 finished with value: 0.9034587609274041 and parameters: {'n_estimators': 610, 'max_depth': 869}. Best is trial 1 with value: 0.9031155015197568.[0m
[32m[I 2023-01-14 12:56:39,350][0m Trial 3 finished with value: 0.9047080979284369 and parameters: {'n_estimators': 209, 'max_depth': 440}. Best is trial 1 with value: 0.9031155015197568.[0m
[32m[I 2023-01-14 12:57:03,335][0m Trial 4 finished with value: 0.9037331817320448 and paramete

In [43]:
model = RandomForestClassifier(**study.best_params, random_state=42)
train_data = pd.concat([train, train[train['target'] == 1].sample(n=len(train[train['target'] == 1])*100, replace=True)])
model.fit(train_data[[str(i) for i in range(dim)]], train_data['target'])
pred = model.predict(valid[[str(i) for i in range(dim)]])
print('precision:',precision_score(valid['target'], pred), 'recall:',recall_score(valid['target'], pred), 'f1score:', f1_score(valid['target'], pred))
pd.DataFrame(confusion_matrix(valid['target'], pred), index=['REAGENT_true', 'REACTANT_true'],columns=['REAGENT_pred', 'REACTANT_pred'])

precision: 0.06396484375 recall: 0.4198717948717949 f1score: 0.11101694915254237


Unnamed: 0,REAGENT_pred,REACTANT_pred
REAGENT_true,65960,1917
REACTANT_true,181,131


In [42]:
study.best_params

{'n_estimators': 731, 'max_depth': 101}