In [1]:
# Python 2 script.
from __future__ import division

# External library.
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import MinMaxScaler

## Functions

In [2]:
def nan_filter(data, thresh=80, value='zero'):
    """
    Purpose
    -------
    1. Drop columns with all the values as NaNs.
    2. Drop columns whose share of non-NaN values is below a limit.
    3. Replace the remaining NaNs with a value.
    
    Arguments
    ---------
    data: Dataframe.
    thresh: Minimum percentage of non-NaN values a column must have
            to be kept; default is 80%.
    value: Value to replace NaN with; default is zero.
           Options: 'zero', 'mean', 'median'.
           Any other option leaves the remaining NaNs untouched.
    
    Returns
    -------
    A modified Pandas dataframe.
    """
    
    data = data.dropna(axis='columns', how='all')
    # dropna(thresh=...) expects an absolute number of non-NaN values, so the
    # percentage has to be converted into a row count first. Passing the raw
    # percentage would keep any column with at least `thresh` non-NaN values,
    # regardless of how many rows the dataframe has.
    min_non_nan = int((data.shape[0] * thresh) / 100.0)
    data = data.dropna(axis='columns', thresh=min_non_nan)
    if value == 'zero':
        data = data.fillna(0)
    elif value == 'mean':
        # Column-wise: each column is filled with its own mean.
        data = data.apply(lambda col: col.fillna(col.mean()))
    elif value == 'median':
        # Column-wise: each column is filled with its own median.
        data = data.apply(lambda col: col.fillna(col.median()))
  
    return data

In [3]:
def variance_thresh(data, thresh=0.0):
    """
    Purpose
    -------
    Keep only the columns whose variance is strictly greater than a threshold.
    
    Arguments
    ---------
    data: Dataframe with column values in float64 and devoid of missing values.
    thresh: Variance threshold (float). Default is 0.0.
    
    Returns
    -------
    A Pandas dataframe restricted to the selected columns.
    """
    
    # The fitted selector exposes the per-column variances via `variances_`.
    vt = VarianceThreshold(threshold=thresh)
    vt.fit(data)
    keep = vt.variances_ > thresh
    
    return data[data.columns[keep]]

In [4]:
def normalize_mm(data, feat_range=(0,1)):
    """
    Purpose
    -------
    Rescale every column to a given range with Scikit-learn's MinMaxScaler.
    
    Arguments
    ---------
    data: Pandas dataframe.
    feat_range: Target (min, max) range as a tuple. Default is (0,1).
    
    Returns
    -------
    A new Pandas dataframe holding the scaled values, with the same
    index labels and column labels as the input.
    
    """
    mm = MinMaxScaler(feature_range=feat_range)
    mm.fit(data)
    scaled = mm.transform(data)
    
    # transform() returns a plain array, so the labels are re-attached here.
    return pd.DataFrame(data=scaled,
                        index=list(data.index),
                        columns=list(data.columns))

In [5]:
def select_k_chi2(X, y, k=1):
    """
    Purpose
    -------
    Keep the k highest-scoring features according to the Chi squared test.
    
    Arguments
    ---------
    X: Features in a Pandas dataframe.
    y: Targets in a Pandas dataframe.
    k: Number of features to keep.
    
    Returns
    -------
    A Pandas dataframe with the k-best features.
    """
    
    kbest = SelectKBest(chi2, k=k)
    kbest.fit(X, y)
    # get_support() is a boolean mask over the columns of X.
    chosen = X.columns[kbest.get_support()]
    
    return X[chosen]

## Load Data

In [6]:
# Load the original dataset with all the features calculated in the
# feature-calculation-mordred notebook. Every column is read as a string
# (dtype='str'); the first column of the file becomes the index.
df = pd.read_csv(
    '../data/bitter_sweet_2d_plus_3d_descriptors.tsv.gz',
    sep="\t",
    compression='gzip',
    index_col=0,
    dtype='str'
)

In [7]:
# Keep the column labels at hand; later cells slice them by position to
# separate the features from the target.
columns = df.columns

In [8]:
# Separate data into feature and target dataframes.
# Features are every column from the fourth one up to, but not including, the last.
feature_cols = columns[3:-1]
# The file was loaded as strings: turn numeric strings into numbers and
# replace anything that is not numeric with NaN.
X = df[feature_cols].apply(pd.to_numeric, errors='coerce')

In [9]:
# Display the dtype of one descriptor column after the numeric conversion.
X['nAcid'].dtype

dtype('int64')

In [10]:
# Cast the SMR_VSA2 column to int64 (X is modified in place).
# NOTE(review): presumably this column holds whole numbers with no NaNs at this
# point; an integer cast cannot represent NaN — confirm against the data.
X['SMR_VSA2'] = X['SMR_VSA2'].astype(dtype='int64')

In [11]:
# Target labels: keep the last column as a one-column dataframe and encode
# 'Sweet' as 1 and 'Bitter' as 0.
# .copy() gives y its own data, and the column is reassigned explicitly rather
# than calling replace(..., inplace=True) through attribute access: that form
# mutates an intermediate Series and is not guaranteed to write back into y
# (it triggers chained-assignment warnings and is a no-op under pandas
# Copy-on-Write).
y = df.loc[:, columns[-1:]].copy()
y['Target'] = y['Target'].replace({'Sweet': 1, 'Bitter': 0})

In [12]:
# Print some metrics about the data.
# Number of columns (features) and rows (samples) of X.
print "Total number of features: ", X.shape[1] 
print "Total number of samples: ", X.shape[0]
# For X and then y: whether any value is null, followed by the total count of nulls.
print "To check if there are any null values in the features and how many: ", \
       X.isnull().values.any(), X.isnull().sum().sum() 
print "To check if there are any null values in the target and how many:", \
       y.isnull().values.any(), y.isnull().sum().sum() 
# Class balance: number of rows with Target == 1 (Sweet) and Target == 0 (Bitter).
print "The total number of Sweet targets: ", y[y.Target == 1].count()[0]
print "The total number of Bitter targets: ", y[y.Target == 0].count()[0]

Total number of features:  2163
Total number of samples:  4742
To check if there are any null values in the features and how many:  True 3325048
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets:  2741
The total number of Bitter targets:  2001


In [13]:
# Clean the feature matrix with nan_filter: columns containing only NaNs are
# removed, thresh=80 is the minimum percentage of non-NaN values a column needs
# in order to be kept (per the nan_filter docstring), and the NaNs that remain
# are replaced with the column mean.
X_no_nan = nan_filter(data=X, thresh=80, value='mean')

In [14]:
# Sanity check after nan_filter: whether any null is left (and how many),
# and how many feature columns survived.
print "To check if there are any null values in the features after fixing nans and how many: ", \
       X_no_nan.isnull().values.any(), X_no_nan.isnull().sum().sum() 
print "Total number of features left after fixing NaNs: ", X_no_nan.shape[1] 

To check if there are any null values in the features after fixing nans and how many:  False 0
Total number of features left after fixing NaNs:  1552


In [15]:
# Preview the first row of the cleaned feature table.
X_no_nan.head(1)

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,17.213262,16.059815,0,0,28.766735,2.52242,4.95823,28.766735,1.250728,4.052767,...,10.247042,72.317821,342.116212,7.602582,1110,43,120.0,147.0,10.451389,5.291667


### Normalise and save all features without any selection

In [24]:
# Scale every cleaned feature with normalize_mm (default range 0 to 1), append
# the target column, and store the complete, unselected feature set as a
# gzip-compressed pickle.
X_no_nan_norm = normalize_mm(X_no_nan)
X_ =  pd.concat([X_no_nan_norm,y], axis=1)
X_.to_pickle('../data/X_all.pkl', compression='gzip')

The number of features is still quite high. It might be reasonable to select the most important features in order to keep the feature-to-data-point ratio at about 1:10. There are multiple methods we can use for feature selection; here I am going to try a few from the sklearn.feature_selection module (http://scikit-learn.org/stable/modules/feature_selection.html).


## Removing features with low variance
Remove all features whose variance doesn’t meet some threshold; for example, remove all zero-variance features, i.e. features that have the same value in all samples.

I will try both: removing all zero-variance features, and removing all features that are either one or zero (on or off) in more than 80% of the samples. Boolean features are Bernoulli random variables, and the variance of such variables is given by
<br>
<br>

$$\mathrm{Var}[X] = p(1 - p)$$

so we can select using the threshold .8 * (1 - .8):

In [16]:
# Keep only the columns whose variance is strictly greater than 0, i.e. drop
# the columns that hold the same value in every sample, then report how many remain.
X_zero_var = variance_thresh(data=X_no_nan, thresh=0.0)
print "Features after feature selection by removing zero variace columns:", X_zero_var.shape[1]

Features after feature selection by removing zero variace columns: 1394


In [17]:
# Bernoulli variance p * (1 - p) for p = 0.8, used as the variance cut-off.
thresh = (.8 * (1 - .8))
# NOTE(review): the p * (1 - p) rationale is stated for 0/1 features, while
# variance_thresh compares the variance of every column against this value —
# confirm this is intended for the non-boolean descriptors.
X_80 = variance_thresh(data=X_zero_var, thresh=thresh)
print "Features after feature selection by removing columns that are either zero or one in more than 80% of the samples:", \
       X_80.shape[1]

Features after feature selection by removing columns that are either zero or one in more than 80% of the samples: 851


In [18]:
# Select the top 400 features according to variance.
# The ranking uses the variances of X_80 as they are; scaling to the 0-1 range
# happens only afterwards, on the selected columns.
df_var = X_80.var(axis=0)
df_sorted = df_var.sort_values(ascending=False)  # largest variance first
top_400_cols = df_sorted.index[:400]
X_var_400 = normalize_mm(X_80[top_400_cols])  # Normalize before saving.

In [19]:
# Combine target and top 400 features based on variance, side by side
# (axis=1), and store the result as a gzip-compressed pickle.
X_var_400_ =  pd.concat([X_var_400,y], axis=1)
X_var_400_.to_pickle('../data/X_var_400.pkl', compression='gzip')

## Univariate feature selection
### SelectKBest: Removes all but the highest scoring features.

1. Reduce X_no_nan features to 400.

In [21]:
# SelectKBest with chi2 can't take negative values, hence scale every feature
# into the range 0 to 1 first (normalize_mm's default range).
X_no_nan_norm = normalize_mm(X_no_nan)

In [22]:
# Keep the 400 features with the best chi-squared scores against the target.
X_k_400 = select_k_chi2(X_no_nan_norm, y, k=400)

In [23]:
# Combine target and top 400 features based on Chi2 test, side by side
# (axis=1), and store the result as a gzip-compressed pickle.
X_k_400_ = pd.concat([X_k_400, y], axis=1)
X_k_400_.to_pickle('../data/X_k_400.pkl', compression='gzip')