## Tensorflow Random Forest

- Data are stored in **pickle** by **prepare_data** sript


#### Load packages

In [1]:
import pickle

from __future__ import print_function

import numpy as np
import pandas as pd

import tensorflow as tf



### Load data

In [2]:
## load data and scalled it
with open('../data/features.pickle', 'rb') as handle:
    d = pickle.load(handle)

## No further scale for the RF model
d.keys()

['test', 'train', 'validation']

### Random forest model training

In [3]:
tf.set_random_seed(123)
params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
  num_classes=3, 
  num_features=d['train']['features'].shape[1], 
  regression=False,
  num_trees=25, 
  max_nodes=100,
  num_splits_to_consider=20)

classifier = tf.contrib.tensor_forest.client.random_forest.TensorForestEstimator(params,model_dir="./tmp/tf_model")

# Define the training inputs
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(d['train']['features'],dtype=np.float32)},
    y=np.array(d['train']['label'],dtype=np.int),
    num_epochs=None,
    batch_size=2000,
    shuffle=True
)

classifier.fit(input_fn=train_input_fn, steps=10000)

Instructions for updating:
Please switch to tf.contrib.estimator.*_head.
Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c2464c110>, '_model_dir': './tmp/tf_model', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_save_summary_steps': 100, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_log_step_count_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': 

TensorForestEstimator(params=None)

### Accuracy on the validation dataset

In [4]:
# Define the test inputs
validation_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(d['validation']['features'],dtype=np.float32)},
    y=np.array(d['validation']['label'],dtype=np.int),
    num_epochs=1,
    shuffle=False
)

# Evaluate accuracy
accuracy_score = classifier.evaluate(input_fn=validation_input_fn)["accuracy"]
print("\nValidation Accuracy: {0:f}%\n".format(accuracy_score*100))

INFO:tensorflow:Constructing forest with params = 
INFO:tensorflow:{'num_output_columns': 4, 'params_proto': pruning_type {
  prune_every_samples {
    constant_value: 0.0
  }
}
finish_type {
  check_every_steps {
    constant_value: 0.0
  }
}
num_trees: 25
max_nodes: 100
num_outputs: 3
num_splits_to_consider {
  constant_value: 20.0
}
split_after_samples {
  constant_value: 250.0
}
dominate_fraction {
  constant_value: 0.990000009537
}
num_features: 188
, 'feature_bagging_fraction': 1.0, 'valid_leaf_threshold': 1, 'checkpoint_stats': False, 'initialize_average_splits': False, 'pruning_type': 0, 'prune_every_samples': 0, 'dominate_fraction': 0.99, 'max_fertile_nodes': 0, 'early_finish_check_every_samples': 0, 'dominate_method': 'bootstrap', 'bagging_fraction': 1.0, 'regression': False, 'param_file': None, 'bagged_num_features': 188, 'use_running_stats_method': False, 'max_nodes': 100, 'split_finish_name': 'basic', 'leaf_model_type': 0, 'stats_model_type': 0, 'bagged_features': None, 'n

### Output prediction on the testing dataset to be evaluated by drivendata.org

In [5]:
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(d['test']['features'],dtype=np.float32)},
    shuffle=False
)
## Output prediction for test, evalucate it through https://www.drivendata.org/
pred = [p['classes']  for p in classifier.predict(input_fn=test_input_fn)]
mapping = dict(enumerate(d['train']['label_original'].cat.categories))
pred_label = [mapping[i] for i in pred ]

outd = pd.DataFrame({"id":d['test']["id"]['id'].tolist(),"status_group": pred_label})

outd.to_csv(path_or_buf="../result/TF-RF-predctions.csv",index=False)

INFO:tensorflow:Constructing forest with params = 
INFO:tensorflow:{'num_output_columns': 4, 'params_proto': pruning_type {
  prune_every_samples {
    constant_value: 0.0
  }
}
finish_type {
  check_every_steps {
    constant_value: 0.0
  }
}
num_trees: 25
max_nodes: 100
num_outputs: 3
num_splits_to_consider {
  constant_value: 20.0
}
split_after_samples {
  constant_value: 250.0
}
dominate_fraction {
  constant_value: 0.990000009537
}
num_features: 188
, 'feature_bagging_fraction': 1.0, 'valid_leaf_threshold': 1, 'checkpoint_stats': False, 'initialize_average_splits': False, 'pruning_type': 0, 'prune_every_samples': 0, 'dominate_fraction': 0.99, 'max_fertile_nodes': 0, 'early_finish_check_every_samples': 0, 'dominate_method': 'bootstrap', 'bagging_fraction': 1.0, 'regression': False, 'param_file': None, 'bagged_num_features': 188, 'use_running_stats_method': False, 'max_nodes': 100, 'split_finish_name': 'basic', 'leaf_model_type': 0, 'stats_model_type': 0, 'bagged_features': None, 'n

### Output package versions used

In [6]:
#find the names of the imported modules
import types
def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val.__name__

#exclude all modules not listed by `!pip freeze`
excludes = ['__builtin__', 'types', 'IPython.core.shadowns', 'sys', 'os']
imported_modules = [module for module in imports() if module not in excludes]
pip_modules = !pip freeze #you could also use `!conda list` with anaconda

#print the names and versions of the imported modules
for module in pip_modules:
    name, version = module.split('==')
    if name in imported_modules:
        print(name + '\t' + version)

numpy	1.13.3
pandas	0.20.3
tensorflow	1.8.0
