# MAJORITY VOTE DEMO

In [7]:
import glob
import os

import joblib
import numpy as np
import pandas as pd

from mlchem.ml.preprocessing.scaling import scale_df_standard,transform_df
from mlchem.helper import suppress_warnings
suppress_warnings()
from mlchem.metrics import get_geometric_S
from mlchem.ml.modelling.model_evaluation import MajorityVote


def _artifact_name(path, suffix):
    """Return the file name of `path` without its directory and trailing `suffix`."""
    # os.path.basename understands the platform's separator(s), unlike slicing
    # on a hard-coded '\\', which only worked for Windows-style glob results.
    return os.path.basename(path)[:-len(suffix)]


# glob gives no ordering guarantee, so both file lists are sorted by the
# artefact name they encode; this keeps estimators and columns aligned.
estimator_path_list = sorted(glob.glob('../data/*_estimator'),
                             key=lambda p: _artifact_name(p, '_estimator'))
estimator_list = [joblib.load(e) for e in estimator_path_list]

column_path_list = sorted(glob.glob('../data/*_cols'),
                          key=lambda p: _artifact_name(p, '_cols'))
column_list = [joblib.load(e) for e in column_path_list]

estimator_names = [_artifact_name(e, '_estimator') for e in estimator_path_list]
column_names = [_artifact_name(c, '_cols') for c in column_path_list]

# The two lists are consumed position by position further down, so fail loudly
# if an estimator has no matching column file (or vice versa).
if column_names != estimator_names:
    raise ValueError(
        'Estimator and column files do not pair up by name: '
        f'{sorted(set(estimator_names) ^ set(column_names))}'
    )

# Name -> fitted estimator lookup, displayed as the cell output.
dict_estimators = dict(zip(estimator_names, estimator_list))
dict_estimators


{'KNN_3_COMB': KNeighborsClassifier(n_neighbors=3),
 'KNN_3_SFS': KNeighborsClassifier(n_neighbors=3),
 'KNN_5_COMB': KNeighborsClassifier(),
 'KNN_5_SFS': KNeighborsClassifier(),
 'KNN_7_COMB': KNeighborsClassifier(n_neighbors=7),
 'KNN_7_SFS': KNeighborsClassifier(n_neighbors=7),
 'KNN_9_COMB': KNeighborsClassifier(n_neighbors=9),
 'KNN_9_SFS': KNeighborsClassifier(n_neighbors=9),
 'LOGREG_l1_0.1_COMB': LogisticRegression(C=0.1, penalty='l1', random_state=1, solver='liblinear'),
 'LOGREG_l1_0.1_SFS': LogisticRegression(C=0.1, penalty='l1', random_state=1, solver='liblinear'),
 'LOGREG_l1_10_COMB': LogisticRegression(C=10, penalty='l1', random_state=1, solver='liblinear'),
 'LOGREG_l1_10_SFS': LogisticRegression(C=10, penalty='l1', random_state=1, solver='liblinear'),
 'LOGREG_l1_1_COMB': LogisticRegression(C=1, penalty='l1', random_state=1, solver='liblinear'),
 'LOGREG_l1_1_SFS': LogisticRegression(C=1, penalty='l1', random_state=1, solver='liblinear'),
 'LOGREG_l2_0.1_COMB': Logist

In [8]:
# Show the loaded estimator objects, in the same order as estimator_path_list.
estimator_list

[KNeighborsClassifier(n_neighbors=3),
 KNeighborsClassifier(n_neighbors=3),
 KNeighborsClassifier(),
 KNeighborsClassifier(),
 KNeighborsClassifier(n_neighbors=7),
 KNeighborsClassifier(n_neighbors=7),
 KNeighborsClassifier(n_neighbors=9),
 KNeighborsClassifier(n_neighbors=9),
 LogisticRegression(C=0.1, penalty='l1', random_state=1, solver='liblinear'),
 LogisticRegression(C=0.1, penalty='l1', random_state=1, solver='liblinear'),
 LogisticRegression(C=10, penalty='l1', random_state=1, solver='liblinear'),
 LogisticRegression(C=10, penalty='l1', random_state=1, solver='liblinear'),
 LogisticRegression(C=1, penalty='l1', random_state=1, solver='liblinear'),
 LogisticRegression(C=1, penalty='l1', random_state=1, solver='liblinear'),
 LogisticRegression(C=0.1, random_state=1, solver='liblinear'),
 LogisticRegression(C=0.1, random_state=1, solver='liblinear'),
 LogisticRegression(C=10, random_state=1, solver='liblinear'),
 LogisticRegression(C=10, random_state=1, solver='liblinear'),
 Logis

In [9]:
# Show the file paths matched by the '../data/*_estimator' glob.
estimator_path_list

['../data\\KNN_3_COMB_estimator',
 '../data\\KNN_3_SFS_estimator',
 '../data\\KNN_5_COMB_estimator',
 '../data\\KNN_5_SFS_estimator',
 '../data\\KNN_7_COMB_estimator',
 '../data\\KNN_7_SFS_estimator',
 '../data\\KNN_9_COMB_estimator',
 '../data\\KNN_9_SFS_estimator',
 '../data\\LOGREG_l1_0.1_COMB_estimator',
 '../data\\LOGREG_l1_0.1_SFS_estimator',
 '../data\\LOGREG_l1_10_COMB_estimator',
 '../data\\LOGREG_l1_10_SFS_estimator',
 '../data\\LOGREG_l1_1_COMB_estimator',
 '../data\\LOGREG_l1_1_SFS_estimator',
 '../data\\LOGREG_l2_0.1_COMB_estimator',
 '../data\\LOGREG_l2_0.1_SFS_estimator',
 '../data\\LOGREG_l2_10_COMB_estimator',
 '../data\\LOGREG_l2_10_SFS_estimator',
 '../data\\LOGREG_l2_1_COMB_estimator',
 '../data\\LOGREG_l2_1_SFS_estimator']

In [10]:

# One entry per descriptor family; scaling_decisions[i] says whether the
# descriptors named by descriptor_types[i] are standard-scaled before use.
descriptor_types = ['rdkit',
                    ]
scaling_decisions = [True,
                     ]
if len(descriptor_types) != len(scaling_decisions):
    # zip() below would otherwise silently drop the unmatched entries.
    raise ValueError('descriptor_types and scaling_decisions must have the same length')

dict_train_sets = {}
dict_test_sets = {}

dict_y_trains = {}
dict_y_tests = {}


for desc_type, scale in zip(descriptor_types, scaling_decisions):

    # In each stored frame the last column is split off as the target array;
    # every other column is kept as a feature.
    train_set_original = joblib.load(f'../data/train_set_{desc_type}')
    train_set = train_set_original.iloc[:, :-1]
    y_train = train_set_original.iloc[:, -1].values

    test_set_original = joblib.load(f'../data/test_set_{desc_type}')
    test_set = test_set_original.iloc[:, :-1]
    y_test = test_set_original.iloc[:, -1].values

    if scale:
        # The scaler returned for the training frame is reused on the test frame.
        # NOTE(review): the positional 0 is forwarded unchanged to the mlchem
        # helpers; confirm its meaning against their documentation.
        train_set_scaled, scaler = scale_df_standard(train_set, 0)
        test_set_scaled = transform_df(test_set, scaler, 0)[0]
    else:
        train_set_scaled = train_set
        test_set_scaled = test_set

    dict_train_sets[desc_type] = train_set_scaled
    dict_test_sets[desc_type] = test_set_scaled
    dict_y_trains[desc_type] = y_train
    dict_y_tests[desc_type] = y_test


# y_train / y_test keep the values from the last descriptor type processed
# above; the cells that dump the labels and build MajorityVote read them.
big_train_set = pd.concat(list(dict_train_sets.values()), axis=1)
big_test_set = pd.concat(list(dict_test_sets.values()), axis=1)

In [13]:
# Preview the first rows of the concatenated training feature frame.
big_train_set.head()

Unnamed: 0_level_0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCCCC(C)CCC,-2.275207,-2.275207,0.820951,0.845857,1.443304,0.120536,-0.222067,-0.331742,-0.222047,-0.041798,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.021018,0.0
C1=CC2CCCC2O1,-0.950214,-0.950214,0.06427,0.210778,0.782349,3.949329,-0.457538,-0.447795,-0.457449,-0.433195,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0
C=CCC=C(C=NCCC#CC=CC)NCO,0.615818,0.615818,-0.807,-0.839315,-1.398373,-0.168279,1.131575,1.1736,1.1318,1.132392,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0
O=[N+]CC=NC#CO,0.863868,0.863868,-0.987028,-0.688219,0.055173,-0.645222,-0.432408,-0.332364,-0.432019,-0.531044,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0
O=CCC#CCO,0.944862,0.944862,-0.698412,-0.930453,-0.209184,-0.857575,-0.614341,-0.563847,-0.614135,-0.628893,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0


In [14]:
# Preview the first rows of the concatenated test feature frame.
big_test_set.head()

Unnamed: 0_level_0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C#C[SH]=O,0.736897,0.736897,-0.576518,-1.032759,-0.718343,-1.084407,-0.926476,-0.852641,-0.927263,-1.150756,...,-0.130744,0.0,0.0,4.655757,0.0,0.0,0.0,0.0,-0.60952,0.0
CCCCBC=CCCC=O,1.130166,1.130166,0.26361,0.378083,-0.654275,-0.3381,0.087337,0.056078,0.090125,0.1539,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,0.567484,0.0
C1=CCCC=1,-1.999043,-1.999043,1.273359,1.225561,-0.028563,0.652063,-1.030547,-1.026698,-1.030566,-1.02029,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0
C=C(C)C,-1.735791,-1.735791,1.195073,1.159856,-0.08736,-0.847923,-1.160555,-1.200437,-1.160539,-1.085523,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0
CCCN1CCC2=C1C2C,-2.18708,-2.18708,0.621495,0.678454,1.749396,2.246643,-0.105431,-0.129131,-0.105358,-0.041798,...,-0.130744,0.0,0.0,-0.193565,0.0,0.0,0.0,0.0,-0.60952,0.0


In [15]:
# Persist the concatenated feature frames and the label arrays under ../data.
# The value of the last joblib.dump call is what the cell displays.
joblib.dump(big_train_set,'../data/big_train_set_scaled')
joblib.dump(big_test_set,'../data/big_test_set_scaled')
joblib.dump(y_train,'../data/y_train')
joblib.dump(y_test,'../data/y_test')

['../data/y_test']

In [16]:
# Build the majority-vote ensemble from the loaded estimators and the feature
# columns stored for each of them, then fit it.
mv = MajorityVote(
    train_set=big_train_set,
    test_set=big_test_set,
    y_train=y_train,
    y_test=y_test,
    task_type='classification',
    estimator_list=estimator_list,
    column_list=column_list,
    estimator_names=estimator_names,
)
mv.fit()

In [17]:
# Preview the per-estimator hard predictions on the test set
# (the 'Y' column is presumably the true label -- confirm in MajorityVote).
mv.df_test_predictions_hard.head()

Unnamed: 0_level_0,KNN_3_COMB,KNN_3_SFS,KNN_5_COMB,KNN_5_SFS,KNN_7_COMB,KNN_7_SFS,KNN_9_COMB,KNN_9_SFS,LOGREG_l1_0.1_COMB,LOGREG_l1_0.1_SFS,...,LOGREG_l1_10_SFS,LOGREG_l1_1_COMB,LOGREG_l1_1_SFS,LOGREG_l2_0.1_COMB,LOGREG_l2_0.1_SFS,LOGREG_l2_10_COMB,LOGREG_l2_10_SFS,LOGREG_l2_1_COMB,LOGREG_l2_1_SFS,Y
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C#C[SH]=O,1,0,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
CCCCBC=CCCC=O,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
C1=CCCC=1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
C=C(C)C,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
CCCN1CCC2=C1C2C,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [18]:
# Preview the per-estimator soft predictions on the test set
# (presumably predicted probabilities of the positive class -- confirm in MajorityVote).
mv.df_test_predictions_soft.head()

Unnamed: 0_level_0,KNN_3_COMB,KNN_3_SFS,KNN_5_COMB,KNN_5_SFS,KNN_7_COMB,KNN_7_SFS,KNN_9_COMB,KNN_9_SFS,LOGREG_l1_0.1_COMB,LOGREG_l1_0.1_SFS,...,LOGREG_l1_10_SFS,LOGREG_l1_1_COMB,LOGREG_l1_1_SFS,LOGREG_l2_0.1_COMB,LOGREG_l2_0.1_SFS,LOGREG_l2_10_COMB,LOGREG_l2_10_SFS,LOGREG_l2_1_COMB,LOGREG_l2_1_SFS,Y
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C#C[SH]=O,0.666667,0.0,0.8,0.8,0.714286,0.714286,0.777778,0.777778,0.160761,0.123734,...,0.040182,0.142205,0.044815,0.146201,0.124206,0.146464,0.040603,0.091547,0.046734,1
CCCCBC=CCCC=O,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.718286,0.611476,...,0.608807,0.808053,0.607645,0.609323,0.598605,0.826303,0.608945,0.664607,0.65543,1
C1=CCCC=1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.957541,0.963951,...,0.988585,0.993806,0.986156,0.955654,0.928291,0.994079,0.988339,0.98684,0.985365,1
C=C(C)C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.939356,0.941675,...,0.976494,0.987667,0.972378,0.925424,0.865204,0.989415,0.976067,0.975424,0.971709,1
CCCN1CCC2=C1C2C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.97417,0.974582,...,0.998975,0.995987,0.998529,0.96823,0.987665,0.996095,0.998946,0.991606,0.997894,1


In [19]:
# Score the ensemble with get_geometric_S, labelled 'gS' in the results.
# NOTE(review): n_estimators_max=3 presumably caps the number of estimators
# combined per ensemble -- confirm against the MajorityVote documentation.
mv.predict(metric=get_geometric_S,metric_name='gS',n_estimators_max=3)

In [20]:
# Rank the evaluated combinations by their 'gS_test' score, highest first.
mv.final_results.sort_values('gS_test',ascending=False)

Unnamed: 0,gS_train,gS_test
"['KNN_3_COMB', 'KNN_9_COMB', 'LOGREG_l2_10_COMB']_hard",0.945075,0.949415
"['KNN_3_COMB', 'KNN_7_COMB', 'LOGREG_l1_1_COMB']_hard",0.947119,0.949415
"['KNN_3_COMB', 'KNN_7_SFS', 'LOGREG_l1_10_COMB']_hard",0.945075,0.949415
"['KNN_3_COMB', 'KNN_7_SFS', 'LOGREG_l2_10_COMB']_hard",0.945075,0.949415
"['KNN_3_COMB', 'KNN_9_SFS', 'LOGREG_l2_10_COMB']_hard",0.945075,0.949415
...,...,...
"['KNN_9_SFS', 'LOGREG_l1_1_SFS', 'LOGREG_l2_0.1_COMB']_hard",0.879558,0.866025
"['KNN_9_SFS', 'LOGREG_l1_1_SFS', 'LOGREG_l2_0.1_SFS']_hard",0.881573,0.866025
"['KNN_3_COMB', 'LOGREG_l1_10_SFS', 'LOGREG_l2_10_SFS']_soft",0.910402,0.858293
"['KNN_3_COMB', 'LOGREG_l1_10_SFS', 'LOGREG_l1_1_SFS']_soft",0.910402,0.858293


## Apply the best estimator combination

In [21]:
# Estimator combination taken from the first row of the sorted
# mv.final_results table; `hard` selects label voting over probability averaging.
best_cols = ['KNN_3_COMB', 'KNN_9_COMB', 'LOGREG_l2_10_COMB']
hard = True

best_cols_Y = best_cols+['Y']


if hard:
    # Not used in this cell (DataFrame.mode below is the pandas method); the
    # import is left in place in case later cells rely on the name.
    from scipy.stats import mode
    # .copy() so the new column is not written into a slice of mv's own frame
    # (avoids pandas' SettingWithCopyWarning).
    df = mv.df_test_predictions_hard[best_cols_Y].copy()
    # DataFrame.mode(axis=1) returns a DataFrame with one column per tied mode.
    # Column 0 always exists and holds the smallest of the most frequent votes,
    # so selecting it also works when the number of voters is even.
    df['Y_pred'] = df[best_cols].mode(axis=1).iloc[:, 0]
else:
    df = mv.df_test_predictions_soft[best_cols_Y].copy()
    # Soft vote: average the per-estimator values, then round to the nearest
    # class (Series.round rounds exact halves to the even value, i.e. 0.5 -> 0).
    df['Y_pred_proba'] = df[best_cols].mean(axis=1)
    df['Y_pred'] = df['Y_pred_proba'].round()
df


Unnamed: 0_level_0,KNN_3_COMB,KNN_9_COMB,LOGREG_l2_10_COMB,Y,Y_pred
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C#C[SH]=O,1,1,0,1,1
CCCCBC=CCCC=O,1,1,1,1,1
C1=CCCC=1,1,1,1,1,1
C=C(C)C,1,1,1,1,1
CCCN1CCC2=C1C2C,1,1,1,1,1
...,...,...,...,...,...
CC=CCN=CC,1,1,1,1,1
CCCCN1C=C1O,0,0,0,0,0
CNCCCCCO,0,0,0,0,0
CC(C)=CCCF,1,1,0,1,1
