## SelectFeatures
This notebook shows the development of how we select important features from `ChemFeatures` to study with. We used the following method to analyze the weight of each feature:

* LASSO (Least Absolute Shrinkage and Selection Operator)
* RFE (Recursive Feature Elimination)
* RIDGE (Ridge Regression)
* SS (Stability Selection)

We use explained variance to compare and evaluate these methods for final decision making.<br>

#### Use ``ChemFeatures`` to generate a dataframe of features.

In [1]:
import pandas as pd
# Module for extracting ChemInfo
from SeekFeatures import ChemFeatures
import mordred
import sklearn

# plotting package
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Use 100K data for demonstration
data = pd.read_csv('../Database/HCEPD_100K.csv') 
# Use first 50 data to illustrate the analysis
data50 = data.head(50)
data50.head()

Unnamed: 0,id,SMILES_str,stoich_str,mass,pce,voc,jsc,e_homo_alpha,e_gap_alpha,e_lumo_alpha,tmp_smiles_str
0,655365,C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1,C18H9N3OSSe,394.3151,5.161953,0.867601,91.567575,-5.467601,2.022944,-3.444656,C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1,1245190,C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]...,C22H15NSeSi,400.4135,5.261398,0.504824,160.401549,-5.104824,1.63075,-3.474074,C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
2,21847,C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1...,C24H17NOSi,363.4903,0.0,0.0,197.47478,-4.539526,1.462158,-3.077368,C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
3,65553,[SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1,C12H12SeSi3,319.4448,6.138294,0.630274,149.887545,-5.230274,1.68225,-3.548025,C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
4,720918,C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1,C20H12OSSe,379.3398,1.991366,0.242119,126.581347,-4.842119,1.809439,-3.03268,C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1


Generate a dataframe of chemical features.

In [55]:
features_df = ChemFeatures(data50['SMILES_str'])

100%|██████████| 50/50 [00:20<00:00,  2.63it/s]
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 50/50 [00:22<00:00,  2.30it/s]


In [5]:
features_df.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,20.142136,16.169815,0,0,33.204238,2.616114,5.057172,33.204238,1.38351,4.188377,...,10.460213,77.559765,394.963154,11.96858,1216,42,146.0,185.0,4.611111,5.0
1,20.849242,16.133746,0,0,34.75519,2.600656,5.047921,34.75519,1.390208,4.22113,...,10.493799,77.890806,401.013897,10.025347,1337,45,150.0,189.0,4.861111,5.25
2,22.889683,17.89332,0,0,37.437727,2.64542,5.16005,37.437727,1.386582,4.315006,...,10.639862,82.032579,363.107941,8.252453,1635,49,168.0,215.0,5.083333,5.555556
3,13.313708,11.688393,0,0,21.283525,2.481194,4.637583,21.283525,1.33022,3.777767,...,9.782393,66.966647,319.941201,11.426471,430,20,94.0,115.0,3.166667,3.333333
4,19.435029,15.989365,0,0,31.67049,2.627835,5.062549,31.67049,1.376978,4.154569,...,10.42931,77.119188,379.977407,10.856497,1074,39,142.0,181.0,4.361111,4.75


In [63]:
features_df = df

In [58]:
df.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,20.142136,16.169815,0,0,33.204238,2.616114,5.057172,33.204238,1.38351,4.188377,...,10.460213,77.559765,394.963154,11.96858,1216,42,146.0,185.0,4.611111,5.0
1,20.849242,16.133746,0,0,34.75519,2.600656,5.047921,34.75519,1.390208,4.22113,...,10.493799,77.890806,401.013897,10.025347,1337,45,150.0,189.0,4.861111,5.25
2,22.889683,17.89332,0,0,37.437727,2.64542,5.16005,37.437727,1.386582,4.315006,...,10.639862,82.032579,363.107941,8.252453,1635,49,168.0,215.0,5.083333,5.555556
3,13.313708,11.688393,0,0,21.283525,2.481194,4.637583,21.283525,1.33022,3.777767,...,9.782393,66.966647,319.941201,11.426471,430,20,94.0,115.0,3.166667,3.333333
4,19.435029,15.989365,0,0,31.67049,2.627835,5.062549,31.67049,1.376978,4.154569,...,10.42931,77.119188,379.977407,10.856497,1074,39,142.0,181.0,4.361111,4.75


> Not all molecules have the same features as others, so there will be some non-value entries hidden in the dataframe.

In [6]:
# find the non-value entries.

missing =[]
for i in range(features_df.shape[1]):
    if type(features_df.loc[1][i]) == mordred.error.Missing:
        missing.append(features_df.loc[1][i])

# show examples of the non-value entries.        
missing[0:5]

[<mordred.error.Missing at 0xa21ddb2b0>,
 <mordred.error.Missing at 0xa21e80358>,
 <mordred.error.Missing at 0xa21e6eb70>,
 <mordred.error.Missing at 0xa21e672b0>,
 <mordred.error.Missing at 0xa21e69908>]

In [7]:
import numpy as np
assd = features_df.dtypes
type(assd[1])

numpy.dtype

> **for these non-value entries, we assign them to 0, simply indicating that the molecule does not possess that specific feature.**

In [8]:
# replace missing value(wrong type) with 0
type_series = features_df.dtypes
wrong_column = []
for col in range(len(type_series)):
    if type_series[col] != np.dtype('int64') and type_series[col] != np.dtype('float64'):
        wrong_column.append(col)

In [10]:
len(wrong_column)

418

In [None]:
for column in wrong_column:
    for item in features_df.iloc[:,column]:
        if type(item) != np.float64 and type(item) != np.int64:
            features_df = features_df.iloc[:,column].replace(item, 0)
np.unique(type_series)

In [64]:
for column in wrong_column:
    i = 0
    for item in features_df.iloc[:,column]:
        if type(item) != np.float64 and type(item) != np.int64:
            features_df.iloc[i,column] = 0
        i += 1    

In [65]:
for column in wrong_column:
    for item in features_df.iloc[:,column]:
        print(item)

0
0
0.5039737335319426
0
0
0
0
0
0
0.34121220418765275
0.10336177263524798
0
0
0
0.2548426205987187
0.12065034969150082
0.5343023061343222
0.31996984431773007
0.32942406975976357
0
0.3741360876690819
0
0.4334625751797914
0
0.44090828714797553
0
0
0.12019511676788064
0
0.25055104029969094
0.23853941379926025
0.16170685007673305
0.13707590163638528
0.5126454230712347
0.3397173398880736
0
0
0
0.44191323340957045
0
0.36237159451909046
0
0.33742403834671697
0
0
0
0
0
0
0.13233190257406494
0
0
-0.1908488543236474
0
0
0
0
0
0
-0.10018285164194271
-0.02266826525046546
0
0
0
-0.0782855397096871
-0.003573649229828544
-0.15021951370007178
-0.10008457829684914
-0.10022987623853707
0
-0.09143194289547316
0
-0.14880624972444195
0
-0.11921828845665702
0
0
-0.009415051805105358
0
-0.08439424003296014
-0.08452596349983274
-0.03103278228099108
-0.022089236234025665
-0.19871293675813223
-0.09483862158918002
0
0
0
-0.09658869525884245
0
-0.08317407410031954
0
-0.0813897840550313
0
0
0
0
0
0
-0.00597576330

0.6003139964777089
0
0.5919602803918821
0
0.5735636605123884
0
0
0
0
0
0
0.7469344945773185
0
0
1.1491953645957795
0
0
0
0
0
0
0.6992019314035819
0.9620893340033636
0
0
0
1.171478256149999
0.8323158002048191
0.698803792487785
1.0562518143201132
0.7422133355476044
0
0.7605993174581166
0
0.6856266874908081
0
0.706745505601046
0
0
0.6996375700567886
0
1.1105513819518484
0.9865255288725374
1.0418311486937688
0.9197459475063612
1.1989532910103153
0.875991870873081
0
0
0
0.831231788882205
0
0.8766714452926451
0
0.6402840161578628
0
0
0
0
0
0
0.765899546172471
0
0
1.2595391800839468
0
0
0
0
0
0
0.9619393381818271
0.8902063961647745
0
0
0
1.0617897109529633
0.9754894301667167
0.9721006687445974
0.9758639467836565
0.9982762096202462
0
1.0447199589594383
0
0.8304598360785549
0
0.9870403613077736
0
0
0.8904053928358407
0
1.0318243003280065
0.9257824236154463
1.1001346537256576
0.9085383855455792
1.1426163695214193
1.0365577302307092
0
0
0
1.11569731288532
0
1.0441368737359176
0
0.8942657472260332

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1.071181817053931
1.0712689786530876
0
1.0433582923910305
1.009753455585887
1.1368652368354748
1.1101995937263793
1.0382292488722862
1.0932492441421013
1.1355331947593852
1.1495231813774383
1.0492185269169396
1.1352915564373895
1.0534472666349468
1.1229507590073065
1.0741183306298745
1.0686004818594104
1.0888099192207237
1.129848181041964
1.1075788051846251
1.0284309923081567
1.061369666944763
1.1234849773242628
1.0220159091082612
1.0220500373460781
1.0158191627329434
1.0660987514202702
1.053062960134569
1.0736694794870298
1.0315579589528903
1.1012624926513814
1.1161730284272673
1.0786763882295087
1.039996724572392
1.1198594576719576
1.0523397738725118
1.0144549949609472
1.0654671055622535
1.1169472578721316
1.0664926881246324
1.1396886810279665
1.04328625142027
1.0378465923406397
1.0316223125052488
1.1114888825900728
1.0486717628077655
1.1241476144840141
1.1608468845357072
1

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0.340167076089695
0
0.6762030738221214
0.35473497732426307
0.21739181783824635
0.5246886810279666
0.4398044217687074
0.34729402872259996
0
0
0.5504213907785336
0.42976493606701927
0.43401699105568137
0
0
0
0
0
0.484147652116402
0
0.2245214474678756
0
0.45053996598639456
0
0.4356864134542706
0.34871126228269067
0
0.36722978080120916
0
0
0
0
0
0
0.4185848922902494
0.39332294028722603
0.3615173847316703
0
0.4497423784328545
0
0.5232140101410934
0
0.3267899659863944
0.3101232993197278
0.46978859599395306
0.28658682917611467
0.42681831065759634
0.5298892983119173
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [44]:
type(features_df.iloc[2,wrong_column[0]])

numpy.float64

In [None]:
from sklearn.preprocessing import StandardScaler
features =[feature for feature in features_df.columns]

# Separating out the features
x = features_df.loc[:, features].values

# Our target is pce
y = features_df.loc[:, 'pce'].values
sc = StandardScaler()
X = sc.fit_transform(x)

In [None]:
features = [ftr for ftr in features_df.head()]
ranks = {}
def rank_to_dict(ranks, names, order=1):
    sc = StandardScaler()
    ranks = sc.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
estimatorREFCV = SVR('linear')
selectorREFCV = RFECV(estimatorREFCV, step=5, scoring='explained_variance')
selectorREFCV.fit(X, y)

In [None]:
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

In [None]:
from sklearn.linear_model import RandomizedLogisticRegression as stability