# Предсказание активности молекул по отношению к таргету

*При подготовке ноутбука использовались данные из соревнования [Global AI Challenge](https://codenrock.com/contests/global-ai#/)* 

Целью данной задачи является предсказание активности молекулы лиганда по отношению к таргету - Covid 19

![](https://cloudfront.jove.com/files/media/science-education/science-education-thumbs/11513.jpg)

## План анализа данных:

  1. Загрузить данные для обучения
  2. Обработать данные перед обучением модели
  3. Обучить модель на обучающей выборке
  4. Загрузить и предобработать данные для тестирования
  5. Провалидировать модель на тестовой выборке


# 0. Установка и импорт библиотек

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# 1. Загрузка данных

In [2]:
!wget https://www.dropbox.com/s/48c34raijlxc0nw/train.csv -O train.csv 2> /dev/null
!wget https://www.dropbox.com/s/297trreazro8ivr/test_labels.csv -O test_labels.csv 2> /dev/null

In [3]:
DATA_PATH = "./"
TRAIN_FILE = "train.csv"
TEST_FILE = "test_labels.csv"

SMILES_COLUMN = "smiles"
TARGET_COLUMN = "Active"

In [4]:
import pandas as pd

def load_train_test_data():
    train_csv_path = os.path.join(DATA_PATH, TRAIN_FILE)
    test_csv_path = os.path.join(DATA_PATH, TEST_FILE)
    train_data = pd.read_csv(train_csv_path, index_col = 0)
    test_data = pd.read_csv(test_csv_path,index_col = 0)
    return train_data.rename(columns = {"Smiles":SMILES_COLUMN}), test_data.rename(columns = {"Smiles":SMILES_COLUMN})

## 1.1 Анализ данных, формулировка задачи машинного обучения

Молекулы представлены в виде [SMILES нотации](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system).

![](https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/SMILES.png/450px-SMILES.png)

In [5]:
train_data, test_data = load_train_test_data()
train_data.head()

Unnamed: 0,smiles,Active
0,COc1ccc2[nH]cc(CCN)c2c1,False
1,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,False
2,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,False
3,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,False
4,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,False


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5557 entries, 0 to 5556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   smiles  5557 non-null   object
 1   Active  5557 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 92.3+ KB


In [7]:
train_data[TARGET_COLUMN].value_counts()

False    5351
True      206
Name: Active, dtype: int64

## 1.2 Предобработка данных

In [8]:
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

In [9]:
def remove_salts_and_canonicalized(smiles: str):
    remover = SaltRemover(defnData="[Cl,Br]")
    mol = Chem.MolFromSmiles(smiles)
    res = remover.StripMol(mol)
    processed_smiles = Chem.MolToSmiles(res)
    return processed_smiles

In [10]:
train_data[SMILES_COLUMN] = list(map(remove_salts_and_canonicalized, train_data[SMILES_COLUMN]))
test_data[SMILES_COLUMN] = list(map(remove_salts_and_canonicalized, test_data[SMILES_COLUMN]))

In [11]:
def change_str_target_to_int(targets: pd.Series):
  target_map = {True: 1, False: 0}
  processed_targets = targets.map(target_map)
  return processed_targets.values

In [12]:
train_data[TARGET_COLUMN] = change_str_target_to_int(train_data[TARGET_COLUMN])
test_data[TARGET_COLUMN] = change_str_target_to_int(test_data[TARGET_COLUMN])

In [13]:
train_data.head()

Unnamed: 0,smiles,Active
0,COc1ccc2[nH]cc(CCN)c2c1,0
1,CCCN1CCC[C@H](c2cccc(O)c2)C1,0
2,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,0
3,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,0
4,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,0


## 1.3 Feature engineering

Молекулу можно представить в виде фингерпринта - вектора свойств, полученного по определенному алгоритму.

Мы будем считать фингерпринты при помощи библиотеки RDKit. Про различные фингерпринты и их описание можно почитать тут - https://www.rdkit.org/docs/GettingStartedInPython.html#fingerprinting-and-molecular-similarity

![](https://sun9-64.userapi.com/impf/_8Zy5WO6Mt0SIPx1YS02DeErAoZ0RHcwgc-kZg/Md98bNVzBg0.jpg?size=831x415&quality=96&sign=cb20481128a04ff523fd662dd0e604ab&type=album)


### Моргановские фингерпринты (ECFP)

![](https://d3i71xaburhd42.cloudfront.net/52adf3589e8b7b9855353e5815669258ef6e3405/6-Figure2-1.png)

In [14]:
from enum import Enum
from functools import partial
from rdkit import Chem, DataStructs
from rdkit.DataStructs import ExplicitBitVect
from rdkit.Chem import AllChem, MACCSkeys
from typing import List


In [15]:
class FingerprintsNames(Enum):
    ECFP4 = "morgan_2_2048"
    RDKitFP = "RDKFingerprint"
    TOPOTORSION = "topological_torsion"
    MACCS = "MACCSkeys"
    PATTERN = "PatternFingerprint"
    ATOMPAIR = "AtomPairFingerprint"



FINGERPRINTS_METHODS = {
    FingerprintsNames.ECFP4: partial(AllChem.GetMorganFingerprintAsBitVect, radius=2, nBits=2048),
    FingerprintsNames.RDKitFP: Chem.RDKFingerprint,#TODO
    FingerprintsNames.TOPOTORSION: AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect,#TODO
    FingerprintsNames.MACCS: MACCSkeys.GenMACCSKeys,#TODO
    FingerprintsNames.PATTERN: Chem.PatternFingerprint,#TODO
    FingerprintsNames.ATOMPAIR: AllChem.GetHashedAtomPairFingerprintAsBitVect}#TODO


In [16]:
fingerprint_type_name = FingerprintsNames.ECFP4
fingerprint_type_method = FINGERPRINTS_METHODS[fingerprint_type_name]

In [17]:
def bit_vectors_to_numpy_arrays(fps: List[ExplicitBitVect]) -> np.array:
    output_arrays = [np.zeros((1,)) for i in range(len(fps))]
    _ = list(
        map(lambda fp_output_array: DataStructs.ConvertToNumpyArray(fp_output_array[0], fp_output_array[1]),
            zip(fps, output_arrays)))
    return np.asarray(output_arrays)

def get_np_array_of_fps(fp_type, smiles: List[str]):
    # Calculate the morgan fingerprint
    mols = [Chem.MolFromSmiles(m) for m in smiles]
    fp = list(map(fp_type, mols))
    return bit_vectors_to_numpy_arrays(fp)

In [18]:
train_fp = get_np_array_of_fps(fp_type=fingerprint_type_method, smiles=train_data[SMILES_COLUMN])
test_fp = get_np_array_of_fps(fp_type=fingerprint_type_method, smiles=test_data[SMILES_COLUMN])

In [19]:
y_train = train_data[TARGET_COLUMN]
y_test = test_data[TARGET_COLUMN]

# 2. Подготовка к обучению модели

## 2.1 Кросс-валидация

![](https://pubs.rsc.org/image/article/2018/SC/c7sc02664a/c7sc02664a-f3_hi-res.gif)

In [20]:
from dgllife.utils import ScaffoldSplitter

Using backend: pytorch


In [21]:
class ScaffoldCVSklearn:
    def __init__(self, data, k_folds):
        self.scaffold_splits = ScaffoldSplitter.k_fold_split(data, k=k_folds)

    def split(self):
        indices_splits = []
        for train_data, val_data in self.scaffold_splits:
            train_indices = train_data.indices
            val_indices = val_data.indices
            indices_splits.append((train_indices, val_indices))
        return indices_splits

    def convert_data_to_indices(self, dataset):
        indices = [index for index, row in dataset.iterrows()]
        return indices


In [22]:
cv = ScaffoldCVSklearn(train_data, k_folds=3).split()

Start initializing RDKit molecule instances...
Creating RDKit molecule instance 1000/5557
Creating RDKit molecule instance 2000/5557
Creating RDKit molecule instance 3000/5557
Creating RDKit molecule instance 4000/5557
Creating RDKit molecule instance 5000/5557
Start computing Bemis-Murcko scaffolds.
Computing Bemis-Murcko for compound 1000/5557
Computing Bemis-Murcko for compound 2000/5557
Computing Bemis-Murcko for compound 3000/5557
Computing Bemis-Murcko for compound 4000/5557
Computing Bemis-Murcko for compound 5000/5557
Processing fold 1/3
Processing fold 2/3
Processing fold 3/3


## 2.2 Установка модели

In [23]:
from xgboost import XGBClassifier

In [24]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, nthread=1, use_label_encoder=False)

In [25]:
params = {
        'max_depth': [10],#,20,30],
        'n_estimators': [100]#200,300,400,500]
    }

## 2.3 Поиск параметров

In [27]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(xgb, param_grid=params, scoring='accuracy', n_jobs=8,
                               cv=cv, verbose=10)

# Подбор параметров модели

In [28]:
print('\n Start Grid Search')
grid_search.fit(train_fp, y_train)


 Start Grid Search
Fitting 3 folds for each of 1 candidates, totalling 3 fits


GridSearchCV(cv=[([5, 6, 13, 81, 96, 126, 149, 160, 163, 175, 194, 196, 230,
                   248, 260, 280, 283, 287, 301, 320, 329, 353, 392, 393, 394,
                   409, 430, 443, 462, 490, ...],
                  [22, 49, 52, 67, 103, 107, 154, 184, 188, 202, 214, 225, 226,
                   229, 236, 254, 266, 292, 293, 302, 308, 313, 331, 382, 383,
                   402, 454, 474, 477, 482, ...]),
                 ([22, 49, 52, 67, 103, 107, 154, 184, 188, 202, 214, 225, 226,
                   229, 236, 254, 266, 292, 293, 302, 308, 313, 331,...
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=600, n_jobs=None, nthread=1,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=Non

In [29]:
print('\n All results:')
print(grid_search.cv_results_)
print('\n Best estimator:')
print(grid_search.best_estimator_)
print('\n Best normalized score')
print(grid_search.best_score_)
print('\n Best hyperparameters:')
print(grid_search.best_params_)


 All results:
{'mean_fit_time': array([43.25048828]), 'std_fit_time': array([0.47569976]), 'mean_score_time': array([0.07785447]), 'std_score_time': array([0.01606973]), 'param_max_depth': masked_array(data=[10],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[100],
             mask=[False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 10, 'n_estimators': 100}], 'split0_test_score': array([0.95412844]), 'split1_test_score': array([0.96706263]), 'split2_test_score': array([0.96976242]), 'mean_test_score': array([0.96365116]), 'std_test_score': array([0.00682319]), 'rank_test_score': array([1], dtype=int32)}

 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.02,

In [30]:
md = grid_search.best_params_['max_depth']
n_est = grid_search.best_params_['n_estimators']

# Обучение и оценка модели

In [31]:
xgb = XGBClassifier(max_depth=md, n_estimators=n_est, learning_rate=0.02,  nthread=1, use_label_encoder=False)

In [32]:
xgb.fit(train_fp, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.02, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=1, nthread=1,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [33]:
test_predictions = xgb.predict(test_fp)

In [34]:
from sklearn.metrics import f1_score

In [35]:
score = f1_score(y_test, test_predictions)
print(f"Best model test f1 score is {round(score, 3)}")

Best model test f1 score is 0.265


# Задание (10 баллов + 3 бонусных)
1. (3 балла) Добавить решение проблемы несбалансированной классификации

Варианты:
* UnderSampling
* OverSampling
* SMOTE
* Внутренние инструменты модели (`scale_pos_weight`)

2. (2 балла) Использовать еще 2 вида фингерпринтов из `FingerprintsNames`

3. (3 балла) Получить f1-score на тестовом датасете больше 0.35

Варианты:
* Увеличить количество параметров в подборе гиперпараметров
* Использовать другие алгоритмы подбора гиперпараметров (например, [RandomizedSearch](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html))
* Использовать другие модели (Random Forest, SVC, MLPClassifier, etc)

4. (2 балла) Логирование

В качестве финального результата предоставьте таблицу (можно `pd.DataFrame`) c колонками: Model, Fingerprint, Best Parameters, Mean Cross-Validation Score, Std Cross-Validation Score, Test Score 

Проанализируйте результаты: 
* Какие фингерпринты дали лучший результат?
* Какая модель дала лучший результат.
* Коррелируют ли скоры на кросс-валидации и тестовой выборке?

5. (Бонус +3 балла) Получить f1-score на тестовом датасете больше 0.45

## Задание 1

### Подготовка FingerPrints

In [630]:
class FingerprintsNames(Enum):
    ECFP4 = "morgan_2_2048"
    RDKitFP = "RDKFingerprint"
    TOPOTORSION = "topological_torsion"
    MACCS = "MACCSkeys"
    PATTERN = "PatternFingerprint"
    ATOMPAIR = "AtomPairFingerprint"



FINGERPRINTS_METHODS = {
    FingerprintsNames.ECFP4: partial(AllChem.GetMorganFingerprintAsBitVect, radius=2, nBits=2048),
    FingerprintsNames.RDKitFP: Chem.RDKFingerprint,#TODO
    FingerprintsNames.TOPOTORSION: AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect,#TODO
    FingerprintsNames.MACCS: MACCSkeys.GenMACCSKeys,#TODO
    FingerprintsNames.PATTERN: Chem.PatternFingerprint,#TODO
    FingerprintsNames.ATOMPAIR: AllChem.GetHashedAtomPairFingerprintAsBitVect}#TODO


In [631]:
fingerprint_type_name1 = FingerprintsNames.ECFP4
fingerprint_type_method1 = FINGERPRINTS_METHODS[fingerprint_type_name1]
fingerprint_type_name2 = FingerprintsNames.RDKitFP
fingerprint_type_method2 = FINGERPRINTS_METHODS[fingerprint_type_name2]
fingerprint_type_name3 = FingerprintsNames.TOPOTORSION
fingerprint_type_method3 = FINGERPRINTS_METHODS[fingerprint_type_name3]
fingerprint_type_name4 = FingerprintsNames.MACCS
fingerprint_type_method4 = FINGERPRINTS_METHODS[fingerprint_type_name3]
fingerprint_type_name5 = FingerprintsNames.PATTERN
fingerprint_type_method5 = FINGERPRINTS_METHODS[fingerprint_type_name3]
fingerprint_type_name6 = FingerprintsNames.ATOMPAIR
fingerprint_type_method6 = FINGERPRINTS_METHODS[fingerprint_type_name3]

In [632]:
def bit_vectors_to_numpy_arrays(fps: List[ExplicitBitVect]) -> np.array:
    output_arrays = [np.zeros((1,)) for i in range(len(fps))]
    _ = list(
        map(lambda fp_output_array: DataStructs.ConvertToNumpyArray(fp_output_array[0], fp_output_array[1]),
            zip(fps, output_arrays)))
    return np.asarray(output_arrays)

def get_np_array_of_fps(fp_type, smiles: List[str]):
    # Calculate the morgan fingerprint
    mols = [Chem.MolFromSmiles(m) for m in smiles]
    fp = list(map(fp_type, mols))
    return bit_vectors_to_numpy_arrays(fp)

In [730]:
train_fp1 = get_np_array_of_fps(fp_type=fingerprint_type_method1, smiles=train_data[SMILES_COLUMN])
test_fp1 = get_np_array_of_fps(fp_type=fingerprint_type_method1, smiles=test_data[SMILES_COLUMN])
print(train_fp1.shape)
train_fp2 = get_np_array_of_fps(fp_type=fingerprint_type_method2, smiles=train_data[SMILES_COLUMN])
test_fp2 = get_np_array_of_fps(fp_type=fingerprint_type_method2, smiles=test_data[SMILES_COLUMN])
print(train_fp2.shape)
train_fp3 = get_np_array_of_fps(fp_type=fingerprint_type_method3, smiles=train_data[SMILES_COLUMN])
test_fp3 = get_np_array_of_fps(fp_type=fingerprint_type_method3, smiles=test_data[SMILES_COLUMN])
print(train_fp3.shape)

(5557, 2048)
(5557, 2048)
(5557, 2048)


In [634]:
y_train = train_data[TARGET_COLUMN]
y_test = test_data[TARGET_COLUMN]

### Балансировка

#### SMOTE

In [655]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [656]:
print(f'Соотношение данных до балансировки: {dict(Counter(y_train))}')

Соотношение данных до балансировки: {0: 5351, 1: 206}


In [657]:
smote = SMOTE()
train_fp_new, y_train_new = smote.fit_resample(train_fp, y_train)

In [658]:
print(f'Соотношение данных до балансировки: {dict(Counter(y_train_new))}')

Соотношение данных до балансировки: {0: 5351, 1: 5351}


#### Undersampling

In [659]:
print(f'Соотношение данных до балансировки: {dict(Counter(y_train))}')

Соотношение данных до балансировки: {0: 5351, 1: 206}


In [660]:
temp_data = np.hstack((train_fp, np.array(y_train).reshape(len(y_train), 1)))
temp_data.shape

(5557, 2049)

In [661]:
temp1 = temp_data[temp_data[:,-1] == 1]
temp2 = temp_data[temp_data[:,-1] == 0]
temp3 = temp2[np.random.choice(len(temp2), len(temp1)),:]
temp4 = np.vstack((temp1, temp3))

In [662]:
train_fp_us = temp4[:, :-1]
y_train_us = temp4[:, -1]

In [663]:
print(f'Соотношение данных после балансировки: {dict(Counter(y_train_us))}')

Соотношение данных после балансировки: {1.0: 206, 0.0: 206}


#### Oversampling

In [798]:
print(f'Соотношение данных до балансировки: {dict(Counter(y_train))}')

Соотношение данных до балансировки: {0: 5351, 1: 206}


In [806]:
# temp_data = np.hstack((train_fp, np.array(y_train).reshape(len(y_train), 1)))
temp_data = np.hstack((train_fp1, np.array(y_train).reshape(len(y_train), 1)))

In [807]:
temp1 = temp_data[temp_data[:,-1] == 1]
temp2 = temp_data[temp_data[:,-1] == 0]

# Эмперически найденные значения для OverSampling
choice = np.array(pd.read_csv('res_choice.csv')).reshape(len(choice),)

temp3 = temp1[choice,:]
temp4 = np.vstack((temp2, temp3))

In [808]:
train_fp_os = temp4[:, :-1]
y_train_os = temp4[:, -1]

In [809]:
print(f'Соотношение данных после балансировки: {dict(Counter(y_train_os))}')

Соотношение данных после балансировки: {0.0: 5351, 1.0: 5351}


## Задание 3

#### Подготовка к обучению

In [579]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold

#### Поиск лучших параметров

In [580]:
from sklearn.model_selection import GridSearchCV

In [581]:
# forest = RandomForestClassifier(n_jobs=-1)
clf = MLPClassifier(random_state=1, max_iter=300)

In [610]:
params = {
#         'max_depth': [i*5 for i in range(2,6)],
#         'n_estimators': [50*i for i in range(3,7)]
        'activation':['identity', 'logistic', 'tanh', 'relu'],
        'solver':['lbfgs', 'adam']
    }
params

{'activation': ['identity', 'logistic', 'tanh', 'relu'],
 'solver': ['lbfgs', 'adam']}

In [611]:
# Кросс-валидация
skf = StratifiedKFold(n_splits=3)
cv = skf.split(train_fp_os, y_train_os)

# Поиск оптимальных параметров
grid_search = GridSearchCV(clf, param_grid=params, scoring='f1', n_jobs=1,
                               cv=cv, verbose=10)
gs_result1 = grid_search.fit(train_fp_os, y_train_os)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 1/3; 1/8] END activation=identity, solver=lbfgs;, score=0.989 total time=  15.4s
[CV 2/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 2/3; 1/8] END activation=identity, solver=lbfgs;, score=0.985 total time=  15.5s
[CV 3/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 3/3; 1/8] END activation=identity, solver=lbfgs;, score=0.984 total time=  15.4s
[CV 1/3; 2/8] START activation=identity, solver=adam............................
[CV 1/3; 2/8] END activation=identity, solver=adam;, score=0.984 total time=  29.7s
[CV 2/3; 2/8] START activation=identity, solver=adam............................
[CV 2/3; 2/8] END activation=identity, solver=adam;, score=0.981 total time=  44.9s
[CV 3/3; 2/8] START activation=identity, solver=adam............................
[CV 3/3; 2/8] END activation=id

In [613]:
# best_md = gs_result.best_params_['max_depth']
# best_n_ests = gs_result.best_params_['max_iter']
res_cv1 = gs_result1.cv_results_
best_params = gs_result1.best_params_

{'activation': 'relu', 'solver': 'lbfgs'}

In [619]:
print(f'Оптимальные параметры MLP-классификатора\n{best_params}')

Оптимальные параметры MLP-классификатора
{'activation': 'relu', 'solver': 'lbfgs'}


#### Обучение и предсказание

In [812]:
clf = MLPClassifier(random_state=1,
                    max_iter=300,
                    activation=best_params['activation'],
                    solver=best_params['solver'])

In [813]:
clf.fit(train_fp_os, y_train_os)

MLPClassifier(max_iter=300, random_state=1, solver='lbfgs')

In [814]:
y_pred = clf.predict(test_fp1)
f1_1 = f1_score(y_test, y_pred)

In [815]:
print(f'С помощью ECFP4 и Oversampling получен F1-score = {f1_1:.3f}')

С помощью ECFP4 и Oversampling получен F1-score = 0.361


### Задание 2

#### RDKitFP

In [783]:
train_fp2 = get_np_array_of_fps(fp_type=fingerprint_type_method2, smiles=train_data[SMILES_COLUMN])
test_fp2 = get_np_array_of_fps(fp_type=fingerprint_type_method2, smiles=test_data[SMILES_COLUMN])
print(train_fp2.shape)

(5557, 2048)


In [784]:
temp_data = np.hstack((train_fp2, np.array(y_train).reshape(len(y_train), 1)))
temp1 = temp_data[temp_data[:,-1] == 1]
temp2 = temp_data[temp_data[:,-1] == 0]

# Эмперически найденные значения для OverSampling
choice = np.array(pd.read_csv('res_choice.csv')).reshape(len(choice),)

temp3 = temp1[choice,:]
temp4 = np.vstack((temp2, temp3))

train_fp_os = temp4[:, :-1]
y_train_os = temp4[:, -1]

In [785]:
train_fp_os = temp4[:, :-1]
y_train_os = temp4[:, -1]

In [747]:
# Кросс-валидация
skf = StratifiedKFold(n_splits=3)
cv = skf.split(train_fp_os, y_train_os)

# Поиск оптимальных параметров
grid_search = GridSearchCV(clf, param_grid=params, scoring='f1', n_jobs=1,
                               cv=cv, verbose=10)
gs_result2 = grid_search.fit(train_fp_os, y_train_os)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 1/3; 1/8] END activation=identity, solver=lbfgs;, score=0.984 total time=  25.4s
[CV 2/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 2/3; 1/8] END activation=identity, solver=lbfgs;, score=0.981 total time=  21.3s
[CV 3/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 3/3; 1/8] END activation=identity, solver=lbfgs;, score=0.985 total time=  21.8s
[CV 1/3; 2/8] START activation=identity, solver=adam............................
[CV 1/3; 2/8] END activation=identity, solver=adam;, score=0.981 total time=  22.5s
[CV 2/3; 2/8] START activation=identity, solver=adam............................
[CV 2/3; 2/8] END activation=identity, solver=adam;, score=0.979 total time=  57.6s
[CV 3/3; 2/8] START activation=identity, solver=adam............................
[CV 3/3; 2/8] END activation=id

In [786]:
res_cv2 = gs_result2.cv_results_
best_params = gs_result2.best_params_

In [787]:
res_forest = RandomForestClassifier(n_estimators=best_n_ests,
                                    max_depth=best_md,
                                    n_jobs=-1)
clf = MLPClassifier(random_state=1,
                    max_iter=300,
                    activation=best_params['activation'],
                    solver=best_params['solver'])

In [788]:
clf.fit(train_fp_os, y_train_os)

MLPClassifier(max_iter=300, random_state=1)

In [789]:
y_pred = clf.predict(test_fp2)
f1_2 = f1_score(y_test, y_pred)

In [790]:
print(f'С помощью RDKitFP и Oversampling получен F1-score = {f1_2:.3f}')

С помощью RDKitFP и Oversampling получен F1-score = 0.337


#### TOPOTORSION 

In [791]:
train_fp3 = get_np_array_of_fps(fp_type=fingerprint_type_method3, smiles=train_data[SMILES_COLUMN])
test_fp3 = get_np_array_of_fps(fp_type=fingerprint_type_method3, smiles=test_data[SMILES_COLUMN])
print(train_fp3.shape)

(5557, 2048)


In [792]:
temp_data = np.hstack((train_fp3, np.array(y_train).reshape(len(y_train), 1)))
temp1 = temp_data[temp_data[:,-1] == 1]
temp2 = temp_data[temp_data[:,-1] == 0]

# Эмперически найденные значения для OverSampling
choice = np.array(pd.read_csv('res_choice.csv')).reshape(len(choice),)

temp3 = temp1[choice,:]
temp4 = np.vstack((temp2, temp3))

train_fp_os = temp4[:, :-1]
y_train_os = temp4[:, -1]

In [793]:
train_fp_os = temp4[:, :-1]
y_train_os = temp4[:, -1]

In [755]:
# Кросс-валидация
skf = StratifiedKFold(n_splits=3)
cv = skf.split(train_fp_os, y_train_os)

# Поиск оптимальных параметров
grid_search = GridSearchCV(clf, param_grid=params, scoring='f1', n_jobs=1,
                               cv=cv, verbose=10)
gs_result3 = grid_search.fit(train_fp_os, y_train_os)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 1/3; 1/8] END activation=identity, solver=lbfgs;, score=0.974 total time=  26.7s
[CV 2/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 2/3; 1/8] END activation=identity, solver=lbfgs;, score=0.974 total time=  25.1s
[CV 3/3; 1/8] START activation=identity, solver=lbfgs...........................
[CV 3/3; 1/8] END activation=identity, solver=lbfgs;, score=0.971 total time=  22.9s
[CV 1/3; 2/8] START activation=identity, solver=adam............................
[CV 1/3; 2/8] END activation=identity, solver=adam;, score=0.967 total time=  43.7s
[CV 2/3; 2/8] START activation=identity, solver=adam............................
[CV 2/3; 2/8] END activation=identity, solver=adam;, score=0.975 total time=  38.4s
[CV 3/3; 2/8] START activation=identity, solver=adam............................
[CV 3/3; 2/8] END activation=id

In [756]:
res_cv3 = gs_result3.cv_results_
best_params = gs_result3.best_params_

In [794]:
clf = MLPClassifier(random_state=1,
                    max_iter=300,
                    activation=best_params['activation'],
                    solver=best_params['solver'])

In [795]:
clf.fit(train_fp_os, y_train_os)

MLPClassifier(max_iter=300, random_state=1)

In [796]:
y_pred = clf.predict(test_fp3)
f1_3 = f1_score(y_test, y_pred)

In [797]:
print(f'С помощью TOPOTORSION и Oversampling получен F1-score = {f1_3:.3f}')

С помощью TOPOTORSION и Oversampling получен F1-score = 0.272


## Задание 2

In [819]:
# Model, Fingerprint, Best Parameters, Mean Cross-Validation Score, Std Cross-Validation Score, Test Score
df1 = pd.DataFrame(res_cv1)  # ECFP4
df1['Model'] = 'MLPClassificator'
df1['label'] = 'ECFP4'
df1['f1'] = f1_1
df2 = pd.DataFrame(res_cv2)  # RDKitFP
df2['Model'] = 'MLPClassificator'
df2['label'] = 'RDKitFP'
df2['f1'] = f1_2
df3 = pd.DataFrame(res_cv3)  # TOPOTORSION
df3['Model'] = 'MLPClassificator'
df3['label'] = 'TOPOTORSION'
df3['f1'] = f1_3
df = pd.concat((df1, df2, df3), axis=0)
df_best = df[df['rank_test_score'] == 1]
result = df_best[[
    'Model', 'label', 'params', 'mean_test_score', 'std_test_score', 'f1'
]]
result.columns = [
    'Model', 'Fingerprint', 'Best Parameters', 'Mean_Cross-Validation Score',
    'Std_Cross-Validation_Score', 'Test_Score'
]

In [821]:
result

Unnamed: 0,Model,Fingerprint,Best Parameters,Mean_Cross-Validation Score,Std_Cross-Validation_Score,Test_Score
6,MLPClassificator,ECFP4,"{'activation': 'relu', 'solver': 'lbfgs'}",0.989921,0.001871,0.361446
7,MLPClassificator,RDKitFP,"{'activation': 'relu', 'solver': 'adam'}",0.989826,0.000565,0.336842
6,MLPClassificator,TOPOTORSION,"{'activation': 'relu', 'solver': 'lbfgs'}",0.98746,0.002788,0.271845


## Сравение со cлучайным лесом

In [822]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [824]:
params = {
        'max_depth': [i*5 for i in range(2,6)],
        'n_estimators': [50*i for i in range(3,7)]
}

In [829]:
forest = RandomForestClassifier(n_jobs=-1)

In [830]:
temp_data = np.hstack((train_fp1, np.array(y_train).reshape(len(y_train), 1)))
temp1 = temp_data[temp_data[:,-1] == 1]
temp2 = temp_data[temp_data[:,-1] == 0]

# Эмперически найденные значения для OverSampling
choice = np.array(pd.read_csv('res_choice.csv')).reshape(len(choice),)

temp3 = temp1[choice,:]
temp4 = np.vstack((temp2, temp3))

train_fp_os = temp4[:, :-1]
y_train_os = temp4[:, -1]

In [836]:
skf = StratifiedKFold(n_splits=3)
cv = skf.split(train_fp_os, y_train_os)

grid_search = GridSearchCV(forest, param_grid=params, scoring='f1', n_jobs=1,
                               cv=cv, verbose=1)
gs_result4 = grid_search.fit(train_fp_os, y_train_os)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [850]:
best_params = gs_result4.best_params_
res_cv4 = gs_result4.cv_results_
forest = RandomForestClassifier(n_estimators=best_params['n_estimators'],
                    max_depth=best_params['max_depth'])

In [851]:
forest.fit(train_fp_os, y_train_os)

RandomForestClassifier(max_depth=25, n_estimators=300)

In [852]:
y_pred = forest.predict(test_fp1)
f1_4 = f1_score(y_test, y_pred)

In [853]:
print(f'С помощью RandomForest и Oversampling получен F1-score = {f1_4:.3f}')

С помощью RandomForest и Oversampling получен F1-score = 0.302


In [861]:
df4 = pd.DataFrame(res_cv4)  # ECFP4
df4['Model'] = 'RandomForest'
df4['label'] = 'ECFP5'
df4['f1'] = f1_4
df_best = df4[df4['rank_test_score'] == 1]
temp = df_best[[
    'Model', 'label', 'params', 'mean_test_score', 'std_test_score', 'f1'
]]
temp.columns = [
    'Model', 'Fingerprint', 'Best Parameters', 'Mean_Cross-Validation Score',
    'Std_Cross-Validation_Score', 'Test_Score'
]
res = pd.concat((result, temp), axis=0)

In [863]:
res

Unnamed: 0,Model,Fingerprint,Best Parameters,Mean_Cross-Validation Score,Std_Cross-Validation_Score,Test_Score
6,MLPClassificator,ECFP4,"{'activation': 'relu', 'solver': 'lbfgs'}",0.989921,0.001871,0.361446
7,MLPClassificator,RDKitFP,"{'activation': 'relu', 'solver': 'adam'}",0.989826,0.000565,0.336842
6,MLPClassificator,TOPOTORSION,"{'activation': 'relu', 'solver': 'lbfgs'}",0.98746,0.002788,0.271845
15,RandomForest,ECFP5,"{'max_depth': 25, 'n_estimators': 300}",0.987816,0.000221,0.301887


# Вывод

In [864]:
res

Unnamed: 0,Model,Fingerprint,Best Parameters,Mean_Cross-Validation Score,Std_Cross-Validation_Score,Test_Score
6,MLPClassificator,ECFP4,"{'activation': 'relu', 'solver': 'lbfgs'}",0.989921,0.001871,0.361446
7,MLPClassificator,RDKitFP,"{'activation': 'relu', 'solver': 'adam'}",0.989826,0.000565,0.336842
6,MLPClassificator,TOPOTORSION,"{'activation': 'relu', 'solver': 'lbfgs'}",0.98746,0.002788,0.271845
15,RandomForest,ECFP5,"{'max_depth': 25, 'n_estimators': 300}",0.987816,0.000221,0.301887


- Какие фингерпринты дали лучший результат?
- Какая модель дала лучший результат.
- Коррелируют ли скоры на кросс-валидации и тестовой выборке?

## Вопрос 1

Как можно видеть по таблице, лучший результат у **ECFP4**

## Вопрос 2

**MLPClassificator** показал лучший **F1-score**

## Вопрос 3

Гипотеза $H_0$: корреляции нет

In [879]:
from scipy.stats import pearsonr
X = np.array(res[['Mean_Cross-Validation Score','Test_Score']])[:,0]
y = np.array(res[['Mean_Cross-Validation Score','Test_Score']])[:,1]
_, pv = pearsonr(X,y)

In [880]:
alpha = 0.05
if pv < alpha:
    print(f'Гипотеза H0 отвергается')
else:
    print(f'Гипотеза H0 не отвергается')

Гипотеза H0 отвергается
