In [1]:
# Python 2 script.
from __future__ import division

# External library.
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import MinMaxScaler

## Functions

In [2]:
def nan_filter(data, thresh=80, value='zero'):
    """
    Purpose
    -------
    1. Drop columns with all the values as NaNs.
    2. Drop columns with NaNs over a certian limit.
    3. Replace NaNs with a value.
    
    Arguments
    ---------
    data: Dataframe.
    thresh: Threshold for non-NaN values; default is 80%.
    value: Value to replace NaN with; default is zero.
           Options: 'zero', 'mean', 'median'
    
    Returns
    -------
    A modified Pandas dataframe.
    """
    
    data = data.dropna(axis='columns', how='all')
    threshold = int((data.shape[0] * thresh) / 100)
    data = data.dropna(axis='columns', thresh=thresh)
    if value == 'zero':
        data = data.apply(lambda x: x.fillna(0)) 
    elif value == 'mean':
        data = data.apply(lambda x: x.fillna(x.mean()))
    elif value == 'median':
        data = data.apply(lambda x: x.fillna(x.median()))
  
    return data

In [3]:
def variance_thresh(data, thresh=0.0):
    """
    Purpose
    -------
    1. Select columns with variace greater than a specified threshold.
    
    Arguments
    ---------
    data: Dataframe with column values in float64 and devoid of missing values.
    thresh: Variace threshold. Default is 0.0 (float)
    
    Returns
    -------
    A modified Pandas dataframe.
    """
    
    selector = VarianceThreshold(threshold=thresh)
    selector.fit(data)
    idx = np.where(selector.variances_ > thresh)[0]
    cols = data.columns[idx]
    df = data[cols]
    
    return df

In [4]:
def normalize_mm(data, feat_range=(0,1)):
    """
    Purpose
    -------
    Transforms features by scaling each feature to a given range using  Scikit-learn MinMaxScaler.
    
    Arguments
    ---------
    data: Pandas dataframe.
    feat_range: Scale range in a tuple. Default is (0,1)
    
    Returns
    -------
    Transformed Pandas dataframe.
    
    """
    cols = list(data.columns)
    ind = list(data.index)
    scaler = MinMaxScaler(feature_range=feat_range)
    scaler.fit(data)
    df = pd.DataFrame(data=scaler.transform(data), index=ind, columns=cols)
    
    return df

In [5]:
def select_k_chi2(X, y, k=1):
    """
    Purpose
    -------
    Select K best features using Chi squared test.
    
    Arguments
    ---------
    X: Features in a Pandas dataframe.
    y: Targets in a Pandas dataframe.
    k: Value for k-best feature estimation.
    
    Returns
    -------
    A Pandas dataframe with the k-best features.
    """
    
    selector = SelectKBest(chi2, k=k).fit(X, y)
    mask = selector.get_support()
    new_features = X.columns[mask]
    
    return X[new_features]

## Load Data

In [6]:
# Load the original dataset with all the features calculated in feature-calculation-mordred notebook.
df = pd.read_csv('../data/bitter_sweet_2d_plus_3d_descriptors.tsv.gz', compression='gzip', sep="\t", index_col=0, dtype='str')

In [7]:
columns = df.columns

In [8]:
# Separate data into feature and target dataframes.
X = df[columns[3:-1]]
X = X.apply(pd.to_numeric, errors='coerce') # Change numbers as strings to numeric 
                                            # and relpace strings with NaNs.
#X = X.astype(dtype='float64') # Convert dtypes to float64.

In [24]:
X['nAcid'].dtype

dtype('int64')

In [28]:
X['SMR_VSA2'] = X['SMR_VSA2'].astype(dtype='int64')

In [29]:
y = df.loc[:,columns[-1:]]
y.Target.replace(to_replace='Sweet', value=1, inplace=True) # Replace 'Sweet' label with 1.
y.Target.replace(to_replace='Bitter', value=0, inplace=True) # Replace 'Bitter' label with 0.

In [30]:
# Print some metrics about the data.
print "Total number of features: ", X.shape[1] 
print "Total number of samples: ", X.shape[0]
print "To check if there are any null values in the features and how many: ", \
       X.isnull().values.any(), X.isnull().sum().sum() 
print "To check if there are any null values in the target and how many:", \
       y.isnull().values.any(), y.isnull().sum().sum() 
print "The total number of Sweet targets: ", y[y.Target == 1].count()[0]
print "The total number of Bitter targets: ", y[y.Target == 0].count()[0]

Total number of features:  2163
Total number of samples:  4742
To check if there are any null values in the features and how many:  True 3325048
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets:  2741
The total number of Bitter targets:  2001


In [31]:
# Replace NaNs with the column mean. Keep columns with atleast 80% non NaNs. 
# Remove columns with only NaN values.
X_no_nan = nan_filter(data=X, thresh=80, value='mean')

In [32]:
print "To check if there are any null values in the features after fixing nans and how many: ", \
       X_no_nan.isnull().values.any(), X_no_nan.isnull().sum().sum() 
print "Total number of features left after fixing NaNs: ", X_no_nan.shape[1] 

To check if there are any null values in the features after fixing nans and how many:  False 0
Total number of features left after fixing NaNs:  1552


The number of features are still quite high. It might be reasonable to find the most important features in order to keep the feature:data points ratio to 1:10. There are multiple methods that we can use to do feature selection and here I am going to try a few from the sklearn.feature_selection module (http://scikit-learn.org/stable/modules/feature_selection.html)


## Removing features with low variance
Remove all features whose variance doesn’t meet some threshold for example remove all zero-variance features, i.e. features that have the same value in all samples.

I would try both remove all zero-variance features and all features that are either one or zero (on or off) in more than 80% of the samples respectively. Boolean features are Bernoulli random variables, and the variance of such variables is given by
<br>
<br>

<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
  <mrow class="MJX-TeXAtom-ORD">
    <mi mathvariant="normal">V</mi>
    <mi mathvariant="normal">a</mi>
    <mi mathvariant="normal">r</mi>
  </mrow>
  <mo stretchy="false">[</mo>
  <mi>X</mi>
  <mo stretchy="false">]</mo>
  <mo>=</mo>
  <mi>p</mi>
  <mo stretchy="false">(</mo>
  <mn>1</mn>
  <mo>&#x2212;<!-- − --></mo>
  <mi>p</mi>
  <mo stretchy="false">)</mo>
</math>

so we can select using the threshold .8 * (1 - .8):

In [33]:
X_zero_var = variance_thresh(data=X_no_nan, thresh=0.0)
print "Features after feature selection by removing zero variace columns:", X_zero_var.shape[1]

Features after feature selection by removing zero variace columns: 1394


In [46]:
thresh = (.8 * (1 - .8))
X_80 = variance_thresh(data=X_no_nan, thresh=thresh)
print "Features after feature selection by removing columns that are either zero or one in more than 80% of the samples:", \
       X_80.shape[1]

Features after feature selection by removing columns that are either zero or one in more than 80% of the samples: 851


## Univariate feature selection
### SelectKBest: Removes all but the highest scoring features.

1. Reduce X_no_nan features to 400.
2. Reduce X_80 features to 400.
3. Find the consensus between the above two datasets.

In [35]:
# SelectKBest Chi2 can't take negative values hence normalize the values in the range 0 and 1.
X_no_nan_norm = normalize_mm(X_no_nan)
X_80_norm = normalize_mm(X_80)

In [36]:
X_no_nan_k = select_k_chi2(X_no_nan_norm, y, k=400)
X_80_k = select_k_chi2(X_80_norm, y, k=400)

In [49]:
feat_inter = list(set(X_no_nan_k.columns).intersection(X_80_k.columns))

In [52]:
X_no_nan[feat_inter]

Unnamed: 0,MPC6,MPC7,MPC4,ATS4d,MPC2,ETA_beta_ns_d,VR3_Dzx,VR3_Dzv,ATS4m,ECIndex,...,SaaCH,WPath,PEOE_VSA8,ATS1p,SMR_VSA2,TpiPC10,AATS1d,naRing,piPC9,MPC9
0,74,70,62,338.0,36,0.000000,5.675068,5.675423,8120.238308,367,...,0.000000,1110,0.000000,66.507524,0,6.411818,4.152174,0,4.234107,68
1,74,70,62,315.0,36,0.000000,5.675068,5.669272,10752.308414,367,...,0.000000,1110,0.000000,71.807000,0,6.411818,4.372093,0,4.234107,68
2,26,27,42,332.0,34,0.000000,5.704121,5.697127,5803.021582,581,...,0.000000,1807,27.619567,92.232838,0,5.805135,3.140351,0,3.332205,25
3,32,31,33,227.0,27,0.000000,5.387804,5.391494,5146.402186,352,...,9.055423,1000,6.420822,65.338645,0,6.767271,3.512821,1,4.884694,24
4,17,7,22,117.0,18,0.000000,4.022934,4.037334,2920.173451,104,...,0.000000,178,0.000000,33.788530,0,4.912655,3.875000,0,0.000000,0
5,54,50,46,326.0,32,0.000000,5.636160,5.636962,7868.108691,355,...,0.000000,1158,0.000000,66.816497,0,6.073045,3.765957,0,3.688879,39
6,58,53,56,302.0,36,0.000000,5.659144,5.661744,7035.568338,414,...,0.000000,1220,0.000000,66.507524,0,6.324359,4.108696,0,4.219508,67
7,71,68,56,322.0,35,0.000000,5.665216,5.666246,7848.627250,376,...,0.000000,1156,0.000000,66.507524,0,6.401917,4.086957,0,4.290459,72
8,13,0,19,41.0,15,0.398058,4.785645,4.790263,2065.597401,2000000000,...,0.000000,1000000109,6.076020,27.503957,0,5.370638,4.214286,0,0.000000,0
9,10,8,16,111.0,15,0.398058,4.785645,4.790263,2507.268131,2200000000,...,0.000000,1100000160,6.041841,41.720260,0,4.934474,3.434783,0,0.000000,0


In [39]:
(4695/4742) * 100

99.00885702235344

In [42]:
X['SMR_VSA2']

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
4712    0
4713    0
4714    0
4715    0
4716    0
4717    0
4718    0
4719    0
4720    0
4721    0
4722    0
4723    0
4724    0
4725    0
4726    0
4727    0
4728    0
4729    0
4730    0
4731    0
4732    0
4733    0
4734    0
4735    0
4736    0
4737    0
4738    0
4739    0
4740    0
4741    0
Name: SMR_VSA2, Length: 4742, dtype: int64

In [41]:
df['SMR_VSA2']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
5       0.0
6       0.0
7       0.0
8       0.0
9       0.0
10      0.0
11      0.0
12      0.0
13      0.0
14      0.0
15      0.0
16      0.0
17      0.0
18      0.0
19      0.0
20      0.0
21      0.0
22      0.0
23      0.0
24      0.0
25      0.0
26      0.0
27      0.0
28      0.0
29      0.0
       ... 
4712    0.0
4713    0.0
4714    0.0
4715    0.0
4716    0.0
4717    0.0
4718    0.0
4719    0.0
4720    0.0
4721    0.0
4722    0.0
4723    0.0
4724    0.0
4725    0.0
4726    0.0
4727    0.0
4728    0.0
4729    0.0
4730    0.0
4731    0.0
4732    0.0
4733    0.0
4734    0.0
4735    0.0
4736    0.0
4737    0.0
4738    0.0
4739    0.0
4740    0.0
4741    0.0
Name: SMR_VSA2, Length: 4742, dtype: object