In [1]:
import swat
import importlib
import numpy as np
import pandas as pd
swat = importlib.reload(swat)
import sys
sys.path.append('../')

#### Generate Data

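The idea comes from Chapter 10, "Boosting and Additive Trees," of *The Elements of Statistical Learning* by Trevor Hastie, Robert Tibshirani, and Jerome Friedman. Here the target is the sum of squares of the first 10 of 50 standard-normal features, plus Gaussian noise.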

In [2]:
SEED = 42          # fixed so that Restart & Run All regenerates the same data
N_FEATURES = 50    # number of feature columns
N_ROWS = 1000      # number of observations
mu, sigma = 0, 1   # mean and standard deviation of every feature

# Seed NumPy's global generator. The label function f defined below in this
# cell also draws its noise from np.random, so this one call makes both the
# features and the labels reproducible.
np.random.seed(SEED)

# One array of N_ROWS independent normal draws per feature.
allnumpys = [np.random.normal(mu, sigma, N_ROWS) for _ in range(N_FEATURES)]

# pd.DataFrame(list_of_arrays) lays each array out as a row, so transpose to
# get one row per observation and one column (0 .. N_FEATURES-1) per feature.
# The 'label' column is created further down, once f has been defined.
data = pd.DataFrame(allnumpys).transpose()

def f(x):
    """Return a noisy sum of squares of the first ten entries of ``x``.

    Parameters
    ----------
    x : indexable
        Anything supporting ``x[0]`` .. ``x[9]`` (here: one DataFrame row).

    Returns
    -------
    float
        ``sum(x[i]**2 + 2*e_i for i in range(10))`` where each ``e_i`` is an
        independent standard-normal draw from NumPy's global generator.
        The result is a scalar, not a length-1 array.
    """
    total = 0.0
    for i in range(10):
        # No `size` argument: np.random.normal then returns a scalar.
        # Passing size=1 would return a shape-(1,) array and turn the whole
        # sum (and therefore every label) into an array.
        total += x[i] * x[i] + 2 * np.random.normal(0, 1)
    return total

# Evaluate f once per observation (axis=1 hands f one row at a time) and
# store the result as the regression target column.
row_labels = data.apply(f, axis=1)
data['label'] = row_labels

# Preview the first rows, including the new 'label' column.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,label
0,-0.007493,-0.254256,0.943714,1.320606,0.15254,-0.722262,0.619254,0.088562,-0.400675,0.273277,...,0.692341,-0.192567,-1.544246,-0.452088,0.197781,-2.788997,-1.540466,0.732558,-0.542412,8.360026
1,-0.050378,-0.4205,-0.329569,-0.445582,-0.149846,0.20886,0.049302,0.220638,-0.231512,0.698283,...,-0.956949,0.873738,-0.562571,1.106439,2.119756,-1.11031,-2.415102,0.047699,1.299589,4.724155
2,1.625406,1.4632,-0.242348,-0.367632,-0.20313,-0.142465,-0.623666,0.627395,-0.313969,-0.662891,...,-1.388141,0.149241,1.061887,0.008396,0.986308,-0.222105,-1.15279,0.80052,1.432791,-1.854019
3,-0.013379,-0.092329,0.329632,-1.202536,-1.626637,-0.600046,-1.0282,-1.280613,-2.050329,-1.704493,...,0.68735,0.156483,-0.074395,-1.942796,0.78874,-1.488194,0.74838,0.037063,-0.360543,7.740393
4,0.725744,-1.531477,-1.141002,-3.196552,0.998813,-1.247302,0.548896,-1.144473,2.396329,0.529136,...,0.318008,0.738173,0.489923,-0.91356,-0.810846,1.333046,1.213844,-1.242368,1.033143,21.558883


### Create Connections and Load Data (SAS Viya version)

In [3]:
# Connection settings for the CAS server used in this walkthrough.
CAS_HOST = 'snap001'
CAS_PORT = 14298
CAS_WORKERS = 10

casconn = swat.CAS(hostname=CAS_HOST, port=CAS_PORT, nworkers=CAS_WORKERS)

# Handle for the table named 'casdata'; it is passed as the output table
# (casout) of the upload in the next cell.
casdata = casconn.CASTable('casdata')

In [4]:
# Send the local pandas DataFrame `data` to the CAS server, using the
# `casdata` handle as the output table (the server NOTEs below confirm the
# table name and caslib it ended up in).
casconn.upload(data, casout=casdata)

NOTE: Cloud Analytic Services made the uploaded file available as table CASDATA in caslib CASUSERHDFS(lidong).
NOTE: The table CASDATA has been created in caslib CASUSERHDFS(lidong) from binary data uploaded to Cloud Analytic Services.


In [5]:
# Sanity check: preview the first rows of the server-side table; they should
# match the data.head() preview of the local DataFrame shown earlier.
casdata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,label
0,-0.007493,-0.254256,0.943714,1.320606,0.15254,-0.722262,0.619254,0.088562,-0.400675,0.273277,...,0.692341,-0.192567,-1.544246,-0.452088,0.197781,-2.788997,-1.540466,0.732558,-0.542412,8.360026
1,-0.050378,-0.4205,-0.329569,-0.445582,-0.149846,0.20886,0.049302,0.220638,-0.231512,0.698283,...,-0.956949,0.873738,-0.562571,1.106439,2.119756,-1.11031,-2.415102,0.047699,1.299589,4.724155
2,1.625406,1.4632,-0.242348,-0.367632,-0.20313,-0.142465,-0.623666,0.627395,-0.313969,-0.662891,...,-1.388141,0.149241,1.061887,0.008396,0.986308,-0.222105,-1.15279,0.80052,1.432791,-1.854019
3,-0.013379,-0.092329,0.329632,-1.202536,-1.626637,-0.600046,-1.0282,-1.280613,-2.050329,-1.704493,...,0.68735,0.156483,-0.074395,-1.942796,0.78874,-1.488194,0.74838,0.037063,-0.360543,7.740393
4,0.725744,-1.531477,-1.141002,-3.196552,0.998813,-1.247302,0.548896,-1.144473,2.396329,0.529136,...,0.318008,0.738173,0.489923,-0.91356,-0.810846,1.333046,1.213844,-1.242368,1.033143,21.558883


In [6]:
# Server-side metadata for the uploaded table (row and column counts,
# encoding, creation/modification times, creator, ...).
casdata.tableinfo()

Unnamed: 0,Name,Rows,Columns,Encoding,CreateTimeFormatted,ModTimeFormatted,JavaCharSet,CreateTime,ModTime,Global,Repeated,View,SourceName,SourceCaslib,Compressed,Creator,Modifier
0,CASDATA,1000,51,utf-8,18Apr2017:14:57:19,18Apr2017:14:57:19,UTF8,1808147000.0,1808147000.0,0,0,0,,,0,lidong,


### Estimator

In [7]:
from pipefitter.estimator import DecisionTree, DecisionForest, GBTree

In [8]:
# Keyword arguments shared by the estimators below: the target column and
# the fifty input columns, named '0' through '49'.
params = {
    'target': 'label',
    'inputs': list(map(str, range(50))),
}

In [9]:
# Decision-tree estimator limited to depth 6; target and inputs come from
# `params`. Ending the cell with the bare name displays the estimator's repr,
# which also lists the parameters left at their defaults.
dtree = DecisionTree(max_depth=6, **params)
dtree

DecisionTree(alpha=0.0, cf_level=0.25, criterion=None, inputs=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49'], leaf_size=5, max_branches=2, max_depth=6, n_bins=20, nominals=[], prune=False, target='label', var_importance=False)

#### Decision Tree Fit and Score of CAS Table

In [10]:
# Fit the estimator on the CAS table. fit() returns a separate model object
# (shown as DecisionTreeModel in the output below).
model = dtree.fit(casdata)
model

DecisionTreeModel(alpha=0.0, cf_level=0.25, criterion=None, inputs=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49'], leaf_size=5, max_branches=2, max_depth=6, n_bins=20, nominals=[], prune=False, target='label', var_importance=False)

In [11]:
# Inspect the fitted model's instance attributes (backend module, the CAS
# table holding the model, training diagnostics, ...).
vars(model)

{'backend': <module 'pipefitter.backends.cas' from '..\\pipefitter\\backends\\cas\\__init__.py'>,
 'data': CASTable('kmodeltree1a65d23c_2bc7_483a_9333_c150fd760ab6', replace=True),
 'diagnostics': [ModelInfo]
 
  Decision Tree for CASDATA
  
                                  Descr        Value
  0                Number of Tree Nodes    31.000000
  1              Max Number of Branches     2.000000
  2                    Number of Levels     6.000000
  3                    Number of Leaves    16.000000
  4                      Number of Bins    20.000000
  5              Minimum Size of Leaves     5.000000
  6              Maximum Size of Leaves   738.000000
  7                 Number of Variables    50.000000
  8   Alpha for Cost-Complexity Pruning     0.000000
  9         Number of Observations Used  1000.000000
  10              Maximum STD of Leaves     9.130072
  11              Minimum STD of Leaves     2.687148
  12                 Mean Squared Error    52.530324
 
 [OutputCasTab

In [12]:
# Score the model on `casdata`, the same table it was fitted on, so the
# error metrics below are in-sample (training) errors, not held-out ones.
score = model.score(casdata)
score

Target                                        label
Level                                      INTERVAL
Var                                   _DT_PredMean_
NBins                                           100
NObsUsed                                       1000
TargetCount                                    1000
TargetMiss                                        0
PredCount                                      1000
PredMiss                                          0
AverageAbsoluteError                        5.80429
AverageSquaredError                         52.5303
AverageSquaredLogarithmicError             0.671168
RootAverageAbsoluteError                    2.40921
RootAverageSquaredError                     7.24778
RootAverageSquaredLogarithmicError         0.819249
dtype: object

### HyperParameter Tuning

In [15]:
from pipefitter.estimator import DecisionTree
from pipefitter.model_selection import HyperParameterTuning

In [16]:
# IPython introspection: a trailing `?` shows the object's docstring.
# Interactive aid only; it is not valid plain Python, so drop this cell if
# the notebook is exported to a script.
HyperParameterTuning?

In [17]:
# Candidate values for each tuned hyperparameter: two depths and two leaf
# sizes.
param_grid = {
    'max_depth': [6, 10],
    'leaf_size': [3, 5],
}

In [18]:
# Estimator template for the search: same target and inputs as before, with
# no max_depth / leaf_size given here; those values come from param_grid.
base_tree = DecisionTree(target='label', inputs=list(map(str, range(50))))

# Tuner combining the template, the candidate grid and the cv=3 setting.
hpt = HyperParameterTuning(estimator=base_tree, param_grid=param_grid, cv=3)

In [20]:
# Run the grid search over `param_grid` on the CAS table. The result table
# below has one row per parameter combination, with its mean score, score
# spread and per-fold scores.
# NOTE(review): n_jobs=4 presumably sets the number of parallel jobs; confirm
# against the pipefitter documentation.
hpt.gridsearch(casdata, n_jobs=4)



Unnamed: 0,MeanScore,ScoreStd,Parameters,FoldScores,MeanClockTime
0,68.562686,6.454912,"{'max_depth': 6, 'leaf_size': 3}","[75.91279927080157, 69.18340988973921, 60.1572...",0.000857
1,68.990775,7.217512,"{'max_depth': 6, 'leaf_size': 5}","[76.77730926498194, 70.39630799355857, 59.3082...",0.000879
2,76.975283,7.86814,"{'max_depth': 10, 'leaf_size': 5}","[85.76152912213294, 78.0260519203542, 66.60634...",0.000976
3,80.401381,9.309854,"{'max_depth': 10, 'leaf_size': 3}","[91.14160258132087, 81.05204048259716, 68.3857...",0.000945
