# EliminateUnviableFeatures Tests

## imports

In [1]:
import pandas as pd
import copy, random, os, sys
import numpy as np
import numba as jit
from pandas_summary import DataFrameSummary
# steppy imports
from steppy.base import Step, BaseTransformer, make_transformer
from steppy.adapter import Adapter, E
from steppy.utils import get_logger
__author__ = 'Bruce_H_Cottman'
__license__ = 'MIT License'
#
from toolkit.preprocessing.EliminateUnviableFeatures import EliminateUnviableFeatures

print (sys.version)
%reload_ext autoreload
%autoreload 2

3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


## logging

In [2]:
logger = get_logger()
logger.setLevel('DEBUG'.upper())
logger.getEffectiveLevel()
LOGGER = logger

## system info

In [3]:
sys.path

['',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python36.zip',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6/lib-dynload',
 '/Users/brucecottman/.local/lib/python3.6/site-packages',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6/site-packages',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6/site-packages/aeosa',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6/site-packages/pip-18.0-py3.6.egg',
 '/Users/brucecottman/Documents/PROJECTS/steppy-toolkit',
 '/Users/brucecottman/Documents/PROJECTS/steppy-toolkit/src/bayes-opt',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6/site-packages/torchvision-0.2.1-py3.6.egg',
 '/Users/brucecottman/anaconda3/envs/steppy/lib/python3.6/site-packages/IPython/extensions',
 '/Users/brucecottman/.ipython']

In [4]:
!conda list

# packages in environment at /Users/brucecottman/anaconda3/envs/steppy:
#
# Name                    Version                   Build  Channel
_ipyw_jlab_nb_ext_conf    0.1.0            py36h2fc01ae_0  
absl-py                   0.4.1                     <pip>
alabaster                 0.7.10           py36h174008c_0  
anaconda                  5.2.0                    py36_3  
anaconda-client           1.6.14                   py36_0  
anaconda-navigator        1.8.7                    py36_0  
anaconda-project          0.8.2            py36h9ee5d53_0  
appnope                   0.1.0            py36hf537a9a_0  
appscript                 1.0.1            py36h9e71e49_1  
argcomplete               1.9.4                     <pip>
asn1crypto                0.24.0                   py36_0  
astor                     0.7.1                     <pip>
astroid                   1.6.3                    py36_0  
astropy                   3.0.2            py36h917ab60_1  
atomicwr

## Clean out EXPERIMENT_DIR

In [5]:
for v in ['foo',None,None]: print(v)

foo
None
None


In [6]:
MODELS_DIR = '../input/models'
EXPERIMENT_DIR = MODELS_DIR+'/tmp'
!mkdir $MODELS_DIR
!mkdir $EXPERIMENT_DIR
import shutil

# By default pipelines will try to load previously trained models so we delete the cache to ba sure we're starting from scratch
shutil.rmtree(EXPERIMENT_DIR, ignore_errors=True)

mkdir: ../input/models: File exists
mkdir: ../input/models/tmp: File exists


## Data Setup

### df_type

In [7]:
df_type = pd.DataFrame({
    'boolean': [True,False,True,False,True], 
    'integer': [1,2,33,44,34],
    'float': [1.,2.,35.,46,.37],
    'object': ['red','blue','green','pink',np.nan]
    
})
df_type = pd.concat([df_type
              ,pd.Series( ['red','blue','green','pink',np.nan],dtype='category',name='categorical')]
               ,axis=1)
df_type.shape,df_type

((5, 5),    boolean  integer  float object categorical
 0     True        1   1.00    red         red
 1    False        2   2.00   blue        blue
 2     True       33  35.00  green       green
 3    False       44  46.00   pink        pink
 4     True       34   0.37    NaN         NaN)

### City

In [8]:
from sklearn.datasets import load_boston
boston = load_boston()

logger.info(boston.feature_names)
logger.info(boston.DESCR)
City = pd.DataFrame(boston.data, columns = boston.feature_names )
#City = City[['CRIM', 'INDUS','NOX','TAX','B']]
City['MEDV'] = boston.target
logger.info(City.shape)
logger.info(City.columns)

2018-12-08 19:07:57 steppy >>> ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
2018-12-08 19:07:57 steppy >>> Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index 

### data_City

In [9]:
data_City = {
    'input': {
        'features': City,
        'labels': ['CRIME'],
        'ignore': ['ZN','CHAS','RM', 'RAD','MEDV'],
        'SD_LIMIT': .05
    }
}

### Housing

In [10]:
from sklearn.datasets import fetch_california_housing
print(__doc__)
dataset = fetch_california_housing()

# Take only 2 features to make visualization easier
# Feature of 0 has a long tail distribution.
# Feature 5 has a few but very large outliers.
logger.info(dataset.data.shape)
logger.info(dataset.feature_names)
logger.info(dataset.DESCR) 
Housing = pd.DataFrame(dataset.data, columns = dataset.feature_names )
#keeppers = ['MedInc', 'HouseAge', 'AveRooms'
#                   , 'AveBedrms', 'Population', 'AveOccup']
#Housing = Housing[keeppers]
logger.info(Housing.shape)
logger.info(Housing.columns)

Automatically created module for IPython interactive environment
2018-12-08 19:07:57 steppy >>> (20640, 8)
2018-12-08 19:07:57 steppy >>> ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
2018-12-08 19:07:57 steppy >>> California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/datasets/

The data contains 20,640 observations on 9 variables.

This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.

References
----------

Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.


2018-12-08 19:07:57 steppy >>> (20640, 8)
2018-12-08 19:07:57 steppy >>> Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       '

In [11]:
from sklearn.datasets import load_iris
dataset = load_iris()
logger.info(dataset.data.shape)
logger.info(dataset.feature_names)
logger.info(dataset.DESCR) 
Iris = pd.DataFrame(dataset.data, columns = dataset.feature_names )
logger.info(Iris.shape)
logger.info(Iris.columns)


2018-12-08 19:07:57 steppy >>> (150, 4)
2018-12-08 19:07:57 steppy >>> ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
2018-12-08 19:07:57 steppy >>> Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Clas

### breast_cancer

In [12]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
ld = ['11/11/1906','11/11/1906','11/11/1906 12:13:14','11/11/1906','11/11/1906']
dset = load_breast_cancer()
LOGGER.info(dset.feature_names)
LOGGER.info(dset.DESCR)
breast_cancer = pd.DataFrame(dset.data, columns = dset.feature_names )
#City = City[['CRIM', 'INDUS','NOX','TAX','B']]
breast_cancer['Negative'] = dset.target
#idenical to eliminate as idenical
breast_cancer['Positive'] = breast_cancer['Negative']
#elim as one value
breast_cancer['OneValueEliminate'] = 1
# eliminate by std
breast_cancer['LowVarEliminate'] = breast_cancer['OneValueEliminate']
breast_cancer.loc[0,'LowVarEliminate'] = 0
breast_cancer['dates'] = pd.to_datetime(ld*112+ld*1+ld[0:4])
LOGGER.info(breast_cancer.shape)
LOGGER.info(breast_cancer.dtypes)
LOGGER.info(breast_cancer.head())
display(DataFrameSummary(breast_cancer).summary())
def bc():
    return breast_cancer.copy()


2018-12-08 19:07:57 steppy >>> ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
2018-12-08 19:07:57 steppy >>> Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
       

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Negative,Positive,OneValueEliminate,LowVarEliminate,dates
count,569,569,569,569,569,569,569,569,569,569,...,569,569,569,569,569,569,569,569,569,
mean,14.1273,19.2896,91.969,654.889,0.0963603,0.104341,0.0887993,0.0489191,0.181162,0.0627976,...,0.254265,0.272188,0.114606,0.290076,0.0839458,0.627417,0.627417,1,0.998243,
std,3.52405,4.30104,24.299,351.914,0.0140641,0.0528128,0.0797198,0.0388028,0.0274143,0.00706036,...,0.157336,0.208624,0.0657323,0.0618675,0.0180613,0.483918,0.483918,0,0.0419222,
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0,0,0.106,0.04996,...,0.02729,0,0,0.1565,0.05504,0,0,1,0,
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,0.1472,0.1145,0.06493,0.2504,0.07146,0,0,1,1,
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,0.2119,0.2267,0.09993,0.2822,0.08004,1,1,1,1,
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,0.3391,0.3829,0.1614,0.3179,0.09208,1,1,1,1,
max,28.11,39.28,188.5,2501,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,1.058,1.252,0.291,0.6638,0.2075,1,1,1,1,
counts,569,569,569,569,569,569,569,569,569,569,...,569,569,569,569,569,569,569,569,569,569
uniques,456,479,522,539,474,537,537,542,432,499,...,529,539,492,500,535,2,2,1,2,2


In [13]:
data_Breast_Cancer = {
    'input': {
        'features': bc(),
        'labels': ['CRIME'],
        'ignore': ['OneValueEliminate','CHAS','RM', 'RAD','MEDV','Positive'],
        'SD_LIMIT': .05
    }
}

In [14]:
def df_City():
    City = pd.DataFrame(boston.data, columns = boston.feature_names )
    City = City[['CRIM', 'INDUS','NOX','TAX','B']]
#    City['MEDV'] = boston.target
    return(City)

##  df_type EliminateUnviableFeatures

In [15]:
def aX():
    return np.array([
            [4, 3.7, 8.9],
            [3, 3.2, 0.5],
            [1, 0.9, 8.9],
            [5, 5.0, 2.4],
            [5, 7.8, 2.4],
            [0, 7.0, 0.2],
            [8, 1.9, 7.8],
            [3, 9.2, 2.8],
            [5, 5.7, 4.5],
            [6, 5.3, 3.2]
        ])

def cn():
    return (['integer_0','float_1','float_2'])
def df_type():
    return pd.DataFrame(aX(),columns=[cn()])

df_type().isna().any().any()

False

In [16]:
def df_type_low_V():
    return ((pd.DataFrame(aX(),columns=[cn()]))\
           .replace(to_replace=0, value=0)\
           .replace(to_replace=4, value=1)
           .replace(to_replace=3, value=1)\
           .replace(to_replace=5, value=1)\
           .replace(to_replace=8, value=1)\
           .replace(to_replace=6, value=1))
df_type_low_V().std()/(df_type_low_V().max() - df_type_low_V().min())
#df_type_low_V().std()

integer_0    0.316228
float_1      0.349100
float_2      0.376073
dtype: float64

## STEPPY CLASS

### step class for

In [17]:
Elim_step = Step(name='cleanFeatures',
               transformer=EliminateUnviableFeatures(),
               input_data=['input'],
               adapter=Adapter({
                   'X': E('input','features')
               }),
               experiment_directory=EXPERIMENT_DIR,
               is_trainable=False,
               force_fitting=False)

2018-12-08 19:07:58 steppy >>> initializing Step cleanFeatures...
2018-12-08 19:07:58 steppy >>> initializing experiment directories under ../input/models/tmp
2018-12-08 19:07:58 steppy >>> done: initializing experiment directories
2018-12-08 19:07:58 steppy >>> Step cleanFeatures initialized


### elim_step city

In [18]:
Elim_step.transform(data_City)['X'].columns

2018-12-08 19:07:58 steppy >>> Step cleanFeatures, adapting inputs...
2018-12-08 19:07:58 steppy >>> Step cleanFeatures, transforming...


Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

### elim_step  ignore

In [19]:
Elim_step = Step(name='cleanFeatures',
               transformer=EliminateUnviableFeatures(),
               input_data=['input'],
               adapter=Adapter({
                   'X': E('input','features'),
                   'ignore': E('input','ignore')
               }),
               experiment_directory=EXPERIMENT_DIR,
               is_trainable=False,
               force_fitting=False)

2018-12-08 19:07:58 steppy >>> initializing Step cleanFeatures...
2018-12-08 19:07:58 steppy >>> initializing experiment directories under ../input/models/tmp
2018-12-08 19:07:58 steppy >>> done: initializing experiment directories
2018-12-08 19:07:58 steppy >>> Step cleanFeatures initialized


### city

Notice how ignore was included by specifying adapter,but we dont see any change during trasform. lets make more obvious

In [20]:
Elim_step.transform(data_City)['X'].columns

2018-12-08 19:07:58 steppy >>> Step cleanFeatures, adapting inputs...
2018-12-08 19:07:58 steppy >>> Step cleanFeatures, transforming...


Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

### data_Breast_Cancer

In [21]:
Elim_step.transform(data_Breast_Cancer)['X'].columns

2018-12-08 19:07:58 steppy >>> Step cleanFeatures, adapting inputs...
2018-12-08 19:07:58 steppy >>> Step cleanFeatures, transforming...


Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'Negative', 'OneValueEliminate'],
      dtype='object')

### Breat_Cancer SD_LIMIT

In [22]:
Elim_step = Step(name='cleanFeatures',
               transformer=EliminateUnviableFeatures(),
               input_data=['input'],
               adapter=Adapter({
                   'X': E('input','features'),
                   'ignore': E('input','ignore'),
                   'SD_LIMIT':  E('input','SD_LIMIT')
        
               }),
               experiment_directory=EXPERIMENT_DIR,
               is_trainable=False,
               force_fitting=False)

2018-12-08 19:07:58 steppy >>> initializing Step cleanFeatures...
2018-12-08 19:07:58 steppy >>> initializing experiment directories under ../input/models/tmp
2018-12-08 19:07:58 steppy >>> done: initializing experiment directories
2018-12-08 19:07:58 steppy >>> Step cleanFeatures initialized


In [23]:
Elim_step.transform(data_Breast_Cancer)['X'].columns

2018-12-08 19:07:58 steppy >>> Step cleanFeatures, adapting inputs...
2018-12-08 19:07:58 steppy >>> Step cleanFeatures, transforming...


Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'Negative', 'OneValueEliminate'],
      dtype='object')