In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [78]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

molecules = pd.read_csv('molecules_dataset_cleared.csv')
X = molecules.drop(columns=['id', 'SMILES', 'SELFIES', 'ACCUMULATION'])
Y = (molecules['ACCUMULATION'] > 500).astype(int)

display(X.shape)
display(Y.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=91)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(209, 200)

(209,)

In [79]:
def my_grid_search(model, param_grid, X_train, Y_train, X_test, Y_test):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, Y_train)
    print(grid_search.best_params_)
    print(f'Train best accuracy: {grid_search.best_score_}')
    best_model = grid_search.best_estimator_
    print(f'Test accuracy: {best_model.score(X_test, Y_test)}')
    return grid_search

In [80]:
def important_features(weights, ft_names, percentage=0.9):
    non_zero_weights = [[ft_names[i], weights[i]] for i in range(len(weights)) if weights[i] != 0]
    total_weight = sum([abs(w[1]) for w in non_zero_weights])
    non_zero_weights.sort(key=lambda x: abs(x[1]), reverse=True)
    cur_sum = 0
    important_weights = []
    for w in non_zero_weights:
        cur_sum += abs(w[1])
        important_weights.append(w)
        if cur_sum >= percentage * total_weight:
            break
    return np.array(important_weights)

In [81]:
from sklearn.linear_model import LogisticRegression

liblinear_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.linspace(0.1, 10, 100)
 }
param_grid = [liblinear_param_grid]
logistic_model = LogisticRegression(solver='liblinear')
logistic_grid_search = my_grid_search(logistic_model, param_grid, X_train_scaled, Y_train, X_test_scaled, Y_test)
logistic_important_weights = important_features(logistic_grid_search.best_estimator_.coef_[0], X.columns, percentage=0.8)
logistic_important_weights

{'C': 1.1, 'penalty': 'l1'}
Train best accuracy: 0.9045977011494253
Test accuracy: 0.8571428571428571


array([['VSA_EState4', '2.2535036019586356'],
       ['VSA_EState9', '-1.4447658910488197'],
       ['FpDensityMorgan3', '1.2588555230425356'],
       ['fr_ArN', '-1.0828168343558597'],
       ['fr_priamide', '0.8984501508360028'],
       ['VSA_EState7', '-0.7643728364633066'],
       ['SMR_VSA4', '0.7499847278331874'],
       ['fr_Ar_COO', '0.7230455158496419'],
       ['fr_quatN', '0.6817966085855136'],
       ['fr_ketone_Topliss', '0.6406771376614622'],
       ['fr_azide', '-0.5235310251807097'],
       ['fr_NH2', '-0.5146193941735004'],
       ['fr_halogen', '0.440603161824641'],
       ['SMR_VSA3', '-0.38648142661498397'],
       ['MaxEStateIndex', '-0.3703890737778695'],
       ['MaxAbsEStateIndex', '-0.35009096150497054']], dtype='<U32')

In [82]:
from sklearn.svm import SVC

param_grid = { 
    'C': np.linspace(0.01, 10, 100)
 }
svm_model = SVC(kernel = 'linear')
svm_grid_search = my_grid_search(svm_model, param_grid, X_train_scaled, Y_train, X_test_scaled, Y_test)
svm_important_weights = important_features(svm_grid_search.best_estimator_.coef_[0], X.columns, percentage=0.8)
svm_important_weights

{'C': 0.21181818181818182}
Train best accuracy: 0.8705747126436781
Test accuracy: 0.8253968253968254


array([['VSA_EState4', '0.7281063564082672'],
       ['fr_priamide', '0.4475309231101362'],
       ['FpDensityMorgan3', '0.4140222742470805'],
       ['fr_NH2', '-0.39657926023353446'],
       ['NHOHCount', '0.3817726118682672'],
       ['fr_methoxy', '-0.33736697532042315'],
       ['fr_Ar_COO', '0.3308838688346146'],
       ['fr_quatN', '0.3234623269819204'],
       ['fr_ArN', '-0.3215890915166289'],
       ['FpDensityMorgan2', '0.3059444107535859'],
       ['MaxAbsEStateIndex', '-0.2941092114117026'],
       ['MaxEStateIndex', '-0.2941092114117026'],
       ['VSA_EState7', '-0.26288676433147'],
       ['PEOE_VSA8', '-0.2610344158832685'],
       ['FractionCSP3', '-0.25614017515840865'],
       ['SMR_VSA4', '0.23882699071440844'],
       ['SlogP_VSA8', '0.23486510391883467'],
       ['fr_piperzine', '0.23478121852654862'],
       ['NumSaturatedHeterocycles', '-0.22568544818583716'],
       ['fr_allylic_oxid', '-0.22166927012909154'],
       ['VSA_EState9', '-0.22032209461772015'],
  

In [83]:
from sklearn.ensemble import RandomForestClassifier

param_grid = { 
    'n_estimators': [10, 50, 100, 300],
    'max_samples': np.linspace(0.1, 1, 10),
 }
forest_model = RandomForestClassifier(random_state=38, bootstrap=True, oob_score=True, max_features='sqrt')
forest_grid_search = my_grid_search(forest_model, param_grid, X_train_scaled, Y_train, X_test_scaled, Y_test)
forest_important_weights = important_features(forest_grid_search.best_estimator_.feature_importances_, X.columns, percentage=0.8)
forest_important_weights

{'max_samples': 0.5, 'n_estimators': 50}
Train best accuracy: 0.8836781609195402
Test accuracy: 0.873015873015873


array([['Kappa1', '0.03857126549801988'],
       ['Chi1', '0.03549553472261597'],
       ['EState_VSA8', '0.027460582604645106'],
       ['LabuteASA', '0.02692281104635348'],
       ['PEOE_VSA9', '0.024418734409056838'],
       ['Chi1v', '0.024215585602233258'],
       ['EState_VSA3', '0.022278979306482788'],
       ['Chi2v', '0.0220006117075989'],
       ['Chi0n', '0.021782457582906048'],
       ['Kappa2', '0.02111930987602531'],
       ['Chi2n', '0.019877151249761447'],
       ['NHOHCount', '0.019814518792669614'],
       ['VSA_EState6', '0.019312830893997893'],
       ['MinPartialCharge', '0.01853729919552752'],
       ['EState_VSA10', '0.017494754099823758'],
       ['HeavyAtomMolWt', '0.01683308872500049'],
       ['EState_VSA5', '0.015639318327342975'],
       ['NumHeteroatoms', '0.01462907369597661'],
       ['NumHAcceptors', '0.014257618803758351'],
       ['BertzCT', '0.014077661081180137'],
       ['fr_quatN', '0.013120469805671523'],
       ['BalabanJ', '0.01302442745984602'

In [84]:
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'learning_rate': np.linspace(0.01, 2, 10),
    'reg_alpha': np.linspace(0, 1, 5),
    'reg_lambda': np.linspace(0, 1, 5),
}

xgb_model = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', max_depth=3, max_leaves=8, subsample = 0.7, colsample_bynode = 0.7)
gb_grid_search = my_grid_search(xgb_model, param_grid, X_train_scaled, Y_train, X_test_scaled, Y_test)
gb_important_weights = important_features(gb_grid_search.best_estimator_.feature_importances_, X.columns, percentage=0.8)
gb_important_weights

{'learning_rate': 0.23111111111111113, 'n_estimators': 200, 'reg_alpha': 0.25, 'reg_lambda': 0.75}
Train best accuracy: 0.8845977011494253
Test accuracy: 0.9206349206349206


array([['Chi1v', '0.14022422'],
       ['MinEStateIndex', '0.0686775'],
       ['NHOHCount', '0.06656863'],
       ['Chi4v', '0.05977959'],
       ['Chi0v', '0.056014348'],
       ['PEOE_VSA13', '0.05296244'],
       ['fr_quatN', '0.04426705'],
       ['SlogP_VSA1', '0.038175546'],
       ['BertzCT', '0.03113035'],
       ['Kappa2', '0.028623674'],
       ['VSA_EState5', '0.027750414'],
       ['EState_VSA6', '0.025165722'],
       ['fr_halogen', '0.022307053'],
       ['Chi2n', '0.020826766'],
       ['SMR_VSA10', '0.017681673'],
       ['VSA_EState9', '0.015830671'],
       ['VSA_EState6', '0.015258721'],
       ['PEOE_VSA8', '0.014299616'],
       ['SMR_VSA6', '0.012520221'],
       ['EState_VSA9', '0.010701415'],
       ['Chi1n', '0.010405447'],
       ['EState_VSA7', '0.009824617'],
       ['NumAliphaticRings', '0.009092718'],
       ['VSA_EState4', '0.009068832']], dtype='<U32')

In [88]:
logistic_important_weights_set = set(logistic_important_weights[:, 0])
svm_important_weights_set = set(svm_important_weights[:, 0])
forest_important_weights_set = set(forest_important_weights[:, 0])
gb_important_weights_set = set(gb_important_weights[:, 0])
intersection_logistic_svm = logistic_important_weights_set.intersection(svm_important_weights_set)
display(intersection_logistic_svm)

intersection_forest_gb = forest_important_weights_set.intersection(gb_important_weights_set)
display(intersection_forest_gb)

intersection_all = intersection_logistic_svm.intersection(intersection_forest_gb)
display(intersection_all)

{'FpDensityMorgan3',
 'MaxAbsEStateIndex',
 'MaxEStateIndex',
 'SMR_VSA3',
 'SMR_VSA4',
 'VSA_EState4',
 'VSA_EState7',
 'VSA_EState9',
 'fr_ArN',
 'fr_Ar_COO',
 'fr_NH2',
 'fr_azide',
 'fr_halogen',
 'fr_ketone_Topliss',
 'fr_priamide',
 'fr_quatN'}

{'BertzCT',
 'Chi0v',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Kappa2',
 'MinEStateIndex',
 'NHOHCount',
 'SlogP_VSA1',
 'VSA_EState4',
 'VSA_EState6',
 'fr_quatN'}

{'VSA_EState4', 'fr_quatN'}