In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

from statistics import mean
from tqdm import tqdm

In [23]:
from xgboost import XGBRegressor, XGBClassifier
import sklearn
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error, roc_auc_score, f1_score

In [73]:
df = pd.read_csv('/content/fragments_classification_actual_wth_modules.csv')
df

Unnamed: 0,SMILES,Activity,#AromaticCarbocycles,#AromaticHeterocycles,NumRadicalElectrons,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,...,502,503,504,505,506,507,508,509,510,511
0,BrC(Br)Br,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,C#CC(C)(O)CC,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,C#CC(O)(/C=C/Cl)CC,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
3,C#CC(OC(N)=O)C1=CC=CC=C1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6743,C1=CC=C(CN(CC2=NCCN2)C2=CC=CC=C2)C=C1,0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,0,0,1,0
6744,CCOCCN1C(N2CCCN(C)CC2)=NC2=CC=CC=C21,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,1,1,1,1
6745,CN1CCC(=C2C3=CC=CC=C3CC(=O)C3=C2C=CS3)CC1,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,1,0,1,0,0,1,0
6746,CC1=C(C2=CC=NC=C2)C=C(C#N)C(=O)N1,0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,1


### Обучение модели

In [74]:
split = KFold(n_splits=5, random_state=41, shuffle=True)
scores= {
         "F1":  make_scorer(f1_score),
         "AUC": make_scorer(roc_auc_score, needs_threshold=True)
         }



In [75]:
X = df.drop(columns = ["SMILES", "Activity"], axis = 1)

In [76]:
y = df["Activity"]

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y,)

In [78]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [79]:
MLR_model = RandomForestClassifier(random_state=102, bootstrap=True, class_weight='balanced', criterion='gini', max_depth=None, max_features='log2', n_estimators=200)

In [80]:
roc_auc_scorer = sklearn.metrics.make_scorer(roc_auc_score, greater_is_better=True,
                             needs_threshold=True)



In [81]:
cv_scores = cross_validate(MLR_model, X_train, y_train, scoring=scores, cv=split)
print(f"On cross-validation:")
print(f"Mean ROC_AUC score is {cv_scores['test_AUC'].mean().round(3)} ± {cv_scores['test_AUC'].std().round(3)}")
print(f"Mean F1 score is {cv_scores['test_F1'].mean().round(3)} ± {cv_scores['test_F1'].std().round(3)}")

On cross-validation:
Mean ROC_AUC score is 0.959 ± 0.004
Mean F1 score is 0.901 ± 0.007


In [82]:
MLR_model.fit(X_train, y_train)
y_pred = MLR_model.predict(X_test)

print(f"F1: {f1_score(y_test, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_test, y_pred)}")

F1: 0.8991262907069102
ROC_AUC: 0.8597410903673959


In [35]:
X_train.shape

(5735, 228)

In [59]:
import joblib

In [60]:
joblib.dump(MLR_model, "best_classification_model.joblib")

['best_classification_model.joblib']