# Chapter 9 - Modeling Categorical Variables  

In [1]:
import swat

conn = swat.CAS('server-name.mycompany.com', 5570, 'username', 'password')

In [2]:
#conn.loadtable('organics.sas7bdat', casout='organics')
organics = conn.CASTable('organics')

NOTE: Cloud Analytic Services added the caslib 'tmp'.
NOTE: Cloud Analytic Services made the file organics_new_vistat.sas7bdat available as table ORGANICS in caslib CASUSER(username).


In [3]:
organics.tableinfo()

Unnamed: 0,Name,Rows,Columns,Encoding,CreateTimeFormatted,ModTimeFormatted,JavaCharSet,CreateTime,ModTime,Global,Repeated,View,SourceName,SourceCaslib,Compressed,Creator,Modifier
0,ORGANICS,1688948,36,utf-8,20Jan2017:10:41:34,20Jan2017:10:41:34,UTF8,1800528000.0,1800528000.0,0,0,0,organics_new_vistat.sas7bdat,tmp,0,username,


## Logistic Regression

In [4]:
conn.loadactionset('regression')

NOTE: Added action set 'regression'.


Simple Logistic Regression

In [5]:
organics.logistic(
    target = 'TargetBuy',
    inputs = ['DemAge', 'Purchase_3mon', 'Purchase_6mon'],
)

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,ORGANICS
1,RESPONSEVAR,Response Variable,TargetBuy
2,DIST,Distribution,Binary
3,LINK,Link Function,Logit
4,TECH,Optimization Technique,Newton-Raphson with Ridging

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,1688948.0
1,NUSED,Number of Observations Used,1574340.0

Unnamed: 0,OrderedValue,Outcome,TargetBuy,Freq,Modeled
0,1,Bought,Bought,387600.0,*
1,2,No,No,1186740.0,

Unnamed: 0,Reason,Status,MaxGradient
0,Convergence criterion (GCONV=1E-8) satisfied.,0,4.419885e-08

Unnamed: 0,RowId,Description,Value
0,NDESIGNCOLS,Columns in Design,4
1,NEFFECTS,Number of Effects,4
2,MAXEFCOLS,Max Effect Columns,1
3,DESIGNRANK,Rank of Design,4
4,OPTPARM,Parameters in Optimization,4

Unnamed: 0,Test,DF,ChiSq,ProbChiSq
0,Likelihood Ratio,3,149251.090674,0.0

Unnamed: 0,RowId,Description,Value
0,M2LL,-2 Log Likelihood,1608090.0
1,AIC,AIC (smaller is better),1608098.0
2,AICC,AICC (smaller is better),1608098.0
3,SBC,SBC (smaller is better),1608147.0

Unnamed: 0,Effect,Parameter,ParmName,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,Intercept,Intercept,1,1.755274,0.057092,945.232627,1.442299e-207
1,DemAge,DemAge,DemAge,1,-0.057438,0.000158,131358.158671,0.0
2,purchase_3mon,purchase_3mon,purchase_3mon,1,-2e-06,5.5e-05,0.000866,0.976525
3,purchase_6mon,purchase_6mon,purchase_6mon,1,3.9e-05,3.9e-05,0.990883,0.3195267

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.008532,0.008942
1,LEVELIZATION,Levelization,0.074384,0.077957
2,INITIALIZATION,Model Initialization,0.000436,0.000457
3,SSCP,SSCP Computation,0.140239,0.146975
4,FITTING,Model Fitting,0.583362,0.611384
5,CLEANUP,Cleanup,0.004702,0.004928
6,TOTAL,Total,0.954166,1.0


Adding more predictors

In [6]:
organics.logistic(
    target = 'TargetBuy',
    inputs = ['DemAge', 'Purchase_3mon', 'Purchase_6mon', 'DemGender', 'DemHomeowner'],
    nominals = ['DemGender', 'DemHomeowner'],
    display = {'names': ['ParameterEstimates']} 
)

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,Effect,DemGender,DemHomeowner,Parameter,ParmName,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,,,Intercept,Intercept,1,0.353349,0.059586,35.165819,3.027918e-09
1,DemAge,,,DemAge,DemAge,1,-0.056478,0.000162,120824.157416,0.0
2,purchase_3mon,,,purchase_3mon,purchase_3mon,1,9e-06,5.7e-05,0.024229,0.8763032
3,purchase_6mon,,,purchase_6mon,purchase_6mon,1,3.7e-05,4e-05,0.819283,0.36539
4,DemGender,F,,DemGender F,DemGender_F,1,1.817158,0.007381,60608.910532,0.0
5,DemGender,M,,DemGender M,DemGender_M,1,0.857905,0.008216,10904.528993,0.0
6,DemGender,U,,DemGender U,DemGender_U,0,0.0,,,
7,DemHomeowner,,No,DemHomeowner No,DemHomeowner_No,1,0.00032,0.004226,0.005725,0.9396871
8,DemHomeowner,,Yes,DemHomeowner Yes,DemHomeowner_Yes,0,0.0,,,


Another way to define a model

In [7]:
all_preds = ['DemAge', 'Purchase_3mon', 'Purchase_9mon', 'DemGender', 'DemHomeowner']
all_cats = ['DemGender', 'DemHomeowner']

model1 = organics.Logistic()
model1.nominals = all_cats
model1.target = 'TargetBuy'
model1.inputs = all_preds
model1.display.names = ['ParameterEstimates']

model1()

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,Effect,DemGender,DemHomeowner,Parameter,ParmName,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,,,Intercept,Intercept,1,0.447991,0.072359,38.330834,5.971168e-10
1,DemAge,,,DemAge,DemAge,1,-0.056478,0.000162,120823.996353,0.0
2,purchase_3mon,,,purchase_3mon,purchase_3mon,1,7.1e-05,4.9e-05,2.0741,0.1498183
3,purchase_9mon,,,purchase_9mon,purchase_9mon,1,-2.6e-05,2.9e-05,0.811013,0.367821
4,DemGender,F,,DemGender F,DemGender_F,1,1.817162,0.007381,60609.158117,0.0
5,DemGender,M,,DemGender M,DemGender_M,1,0.857905,0.008216,10904.528278,0.0
6,DemGender,U,,DemGender U,DemGender_U,0,0.0,,,
7,DemHomeowner,,No,DemHomeowner No,DemHomeowner_No,1,0.000318,0.004226,0.005669,0.9399814
8,DemHomeowner,,Yes,DemHomeowner Yes,DemHomeowner_Yes,0,0.0,,,


In [8]:
model1.link = 'PROBIT',
model1.display.names = ['ResponseProfile', 'ParameterEstimates']

model1()

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,OrderedValue,Outcome,TargetBuy,Freq,Modeled
0,1,Bought,Bought,387600.0,*
1,2,No,No,1186740.0,

Unnamed: 0,Effect,DemGender,DemHomeowner,Parameter,ParmName,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,,,Intercept,Intercept,1,0.447991,0.072359,38.330834,5.971168e-10
1,DemAge,,,DemAge,DemAge,1,-0.056478,0.000162,120823.996353,0.0
2,purchase_3mon,,,purchase_3mon,purchase_3mon,1,7.1e-05,4.9e-05,2.0741,0.1498183
3,purchase_9mon,,,purchase_9mon,purchase_9mon,1,-2.6e-05,2.9e-05,0.811013,0.367821
4,DemGender,F,,DemGender F,DemGender_F,1,1.817162,0.007381,60609.158117,0.0
5,DemGender,M,,DemGender M,DemGender_M,1,0.857905,0.008216,10904.528278,0.0
6,DemGender,U,,DemGender U,DemGender_U,0,0.0,,,
7,DemHomeowner,,No,DemHomeowner No,DemHomeowner_No,1,0.000318,0.004226,0.005669,0.9399814
8,DemHomeowner,,Yes,DemHomeowner Yes,DemHomeowner_Yes,0,0.0,,,


Output predicted values

In [9]:
result1 = conn.CASTable('predicted', replace=True)
model1.output.casout = result1
model1.output.copyvars = 'all';
del model1.display
model1()

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,ORGANICS
1,RESPONSEVAR,Response Variable,TargetBuy
2,DIST,Distribution,Binary
3,LINK,Link Function,Logit
4,TECH,Optimization Technique,Newton-Raphson with Ridging

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,1688948.0
1,NUSED,Number of Observations Used,1574340.0

Unnamed: 0,OrderedValue,Outcome,TargetBuy,Freq,Modeled
0,1,Bought,Bought,387600.0,*
1,2,No,No,1186740.0,

Unnamed: 0,Class,Levels,Values
0,DemGender,3.0,F M U
1,DemHomeowner,2.0,No Yes

Unnamed: 0,Reason,Status,MaxGradient
0,Convergence criterion (GCONV=1E-8) satisfied.,0,3e-06

Unnamed: 0,RowId,Description,Value
0,NDESIGNCOLS,Columns in Design,9
1,NEFFECTS,Number of Effects,6
2,MAXEFCOLS,Max Effect Columns,3
3,DESIGNRANK,Rank of Design,7
4,OPTPARM,Parameters in Optimization,7

Unnamed: 0,Test,DF,ChiSq,ProbChiSq
0,Likelihood Ratio,6,252367.563696,0.0

Unnamed: 0,RowId,Description,Value
0,M2LL,-2 Log Likelihood,1504974.0
1,AIC,AIC (smaller is better),1504988.0
2,AICC,AICC (smaller is better),1504988.0
3,SBC,SBC (smaller is better),1505074.0

Unnamed: 0,Effect,DemGender,DemHomeowner,Parameter,ParmName,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,,,Intercept,Intercept,1,0.447991,0.072359,38.330834,5.971168e-10
1,DemAge,,,DemAge,DemAge,1,-0.056478,0.000162,120823.996353,0.0
2,purchase_3mon,,,purchase_3mon,purchase_3mon,1,7.1e-05,4.9e-05,2.0741,0.1498183
3,purchase_9mon,,,purchase_9mon,purchase_9mon,1,-2.6e-05,2.9e-05,0.811013,0.367821
4,DemGender,F,,DemGender F,DemGender_F,1,1.817162,0.007381,60609.158117,0.0
5,DemGender,M,,DemGender M,DemGender_M,1,0.857905,0.008216,10904.528278,0.0
6,DemGender,U,,DemGender U,DemGender_U,0,0.0,,,
7,DemHomeowner,,No,DemHomeowner No,DemHomeowner_No,1,0.000318,0.004226,0.005669,0.9399814
8,DemHomeowner,,Yes,DemHomeowner Yes,DemHomeowner_Yes,0,0.0,,,

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.030029,0.005449
1,LEVELIZATION,Levelization,0.88402,0.160424
2,INITIALIZATION,Model Initialization,0.000194,3.5e-05
3,SSCP,SSCP Computation,0.602842,0.109399
4,FITTING,Model Fitting,1.344155,0.243926
5,OUTPUT,Creating Output Data,2.368268,0.429773
6,CLEANUP,Cleanup,0.009795,0.001778
7,TOTAL,Total,5.510507,1.0

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(username),predicted,,1688948,37,"CASTable('predicted', caslib='CASUSER(username)')"


In [10]:
result1.columns

Index(['_PRED_', 'ID', 'DemAffl', 'DemAge', 'DemGender', 'DemHomeowner',
       'DemAgeGroup', 'DemCluster', 'DemReg', 'DemTVReg', 'DemFlag1',
       'DemFlag2', 'DemFlag3', 'DemFlag4', 'DemFlag5', 'DemFlag6', 'DemFlag7',
       'DemFlag8', 'PromClass', 'PromTime', 'TargetBuy', 'Bought_Beverages',
       'Bought_Bakery', 'Bought_Canned', 'Bought_Dairy', 'Bought_Baking',
       'Bought_Frozen', 'Bought_Meat', 'Bought_Fruits', 'Bought_Vegetables',
       'Bought_Cleaners', 'Bought_PaperGoods', 'Bought_Others',
       'purchase_3mon', 'purchase_6mon', 'purchase_9mon', 'purchase_12mon'],
      dtype='object')

In [11]:
result1.crosstab(row='DemGender', weight='_PRED_', aggregators='mean')

Unnamed: 0,DemGender,Col1
0,F,0.343891
1,M,0.165559
2,U,0.076981


Generate score code

In [12]:
result = organics.logistic(
    target = 'TargetBuy',
    inputs = ['DemAge', 'Purchase_3mon', 'Purchase_6mon'],
    code = {},
)   
result['_code_']

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,SASCode
0,/*-----------------------------------------...
1,Generated SAS Scoring Code
2,Date: 20Jan2017:10:42:06
3,-----------------------------------------...
4,
5,drop _badval_ _linp_ _temp_ _i_ _j_;
6,_badval_ = 0;
7,_linp_ = 0;
8,_temp_ = 0;
9,_i_ = 0;


Group by logistic regression

In [13]:
organics.groupby = ['DemGender']
result = organics.logistic(
    target = 'TargetBuy',
    inputs = ['DemAge', 'Purchase_3mon', 'Purchase_6mon'],
)  

NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.
NOTE: Convergence criterion (GCONV=1E-8) satisfied.


In [14]:
for df in result:
    if 'ParameterEstimates' in df:
        print(result[df][['Effect','Parameter','Estimate']])
        print('')

Parameter Estimates

                  Effect      Parameter  Estimate
DemGender                                        
F              Intercept      Intercept  2.181055
F                 DemAge         DemAge -0.057016
F          purchase_3mon  purchase_3mon  0.000023
F          purchase_6mon  purchase_6mon  0.000038

Parameter Estimates

                  Effect      Parameter  Estimate
DemGender                                        
M              Intercept      Intercept  1.230455
M                 DemAge         DemAge -0.056114
M          purchase_3mon  purchase_3mon -0.000091
M          purchase_6mon  purchase_6mon  0.000065

Parameter Estimates

                  Effect      Parameter  Estimate
DemGender                                        
U              Intercept      Intercept  0.211336
U                 DemAge         DemAge -0.052621
U          purchase_3mon  purchase_3mon  0.000139
U          purchase_6mon  purchase_6mon -0.000046



## Decision Trees

In [15]:
conn.loadactionset('decisiontree')
conn.help(actionset='decisiontree')

NOTE: Added action set 'decisiontree'.
NOTE: Information for action set 'decisiontree':
NOTE:    decisionTree
NOTE:       dtreeTrain - Train a decision tree
NOTE:       dtreeScore - Score a table using a decision tree model
NOTE:       dtreeSplit - Split decision tree nodes
NOTE:       dtreePrune - Prune a decision tree
NOTE:       dtreeMerge - Merge decision tree nodes
NOTE:       dtreeCode - Generate DATA step scoring code from a decision tree model
NOTE:       forestTrain - Train a forest
NOTE:       forestScore - Score a table using a forest model
NOTE:       forestCode - Generate DATA step scoring code from a forest model
NOTE:       gbtreeTrain - Train a gradient boosting tree
NOTE:       gbtreeScore - Score a table using a gradient boosting tree model
NOTE:       gbtreecode - Generate DATA step scoring code from a gradient boosting tree model


Unnamed: 0,name,description
0,dtreeTrain,Train a decision tree
1,dtreeScore,Score a table using a decision tree model
2,dtreeSplit,Split decision tree nodes
3,dtreePrune,Prune a decision tree
4,dtreeMerge,Merge decision tree nodes
5,dtreeCode,Generate DATA step scoring code from a decisio...
6,forestTrain,Train a forest
7,forestScore,Score a table using a forest model
8,forestCode,Generate DATA step scoring code from a forest ...
9,gbtreeTrain,Train a gradient boosting tree


In [16]:
organics = conn.CASTable('organics')

output1 = conn.CASTable('treeModel1', replace=True)
tree1 = organics.Dtreetrain()
tree1.target = 'TargetBuy'
tree1.inputs = ['DemGender']
tree1.casout = output1
tree1()

Unnamed: 0,Descr,Value
0,Number of Tree Nodes,5.0
1,Max Number of Branches,2.0
2,Number of Levels,3.0
3,Number of Leaves,3.0
4,Number of Bins,20.0
5,Minimum Size of Leaves,323684.0
6,Maximum Size of Leaves,923324.0
7,Number of Variables,1.0
8,Confidence Level for Pruning,0.25
9,Number of Observations Used,1688948.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),treeModel1,5,24,"CASTable('treeModel1', caslib='CASUSER(username)')"


In [17]:
output1.columns

Index(['_Target_', '_NumTargetLevel_', '_TargetValL_', '_TargetVal0_',
       '_TargetVal1_', '_CI0_', '_CI1_', '_NodeID_', '_TreeLevel_',
       '_NodeName_', '_Parent_', '_ParentName_', '_NodeType_', '_Gain_',
       '_NumObs_', '_TargetValue_', '_NumChild_', '_ChildID0_', '_ChildID1_',
       '_PBranches_', '_PBNameL0_', '_PBNameL1_', '_PBName0_', '_PBName1_'],
      dtype='object')

Intepret the decision tree structure

In [18]:
output1[['_TreeLevel_', '_NodeID_', '_Parent_', '_ParentName_', 
         '_NodeType_', '_PBName0_', '_PBName1_']].sort_values('_NodeID_').head(20)

Unnamed: 0,_TreeLevel_,_NodeID_,_Parent_,_ParentName_,_NodeType_,_PBName0_,_PBName1_
0,0.0,0.0,-1.0,,1.0,,
1,1.0,1.0,0.0,DemGender,1.0,M,U
2,1.0,2.0,0.0,DemGender,3.0,F,
3,2.0,3.0,1.0,DemGender,3.0,U,
4,2.0,4.0,1.0,DemGender,3.0,M,


In [19]:
output1[['_TreeLevel_', '_NodeID_', '_Parent_', 
         '_TargetVal0_', '_TargetVal1_', '_CI0_', '_CI1_', 
         '_Gain_', '_NumObs_']].sort_values('_NodeID_').head(20)

Unnamed: 0,_TreeLevel_,_NodeID_,_Parent_,_TargetVal0_,_TargetVal1_,_CI0_,_CI1_,_Gain_,_NumObs_
0,0.0,0.0,-1.0,Bought,No,0.247716,0.752284,0.047713,1688948.0
1,1.0,1.0,0.0,Bought,No,0.129045,0.870955,0.012886,765624.0
2,1.0,2.0,0.0,Bought,No,0.346119,0.653881,0.0,923324.0
3,2.0,3.0,1.0,Bought,No,0.078422,0.921578,0.0,323684.0
4,2.0,4.0,1.0,Bought,No,0.166122,0.833878,0.0,441940.0


Pruning

In [20]:
tree1.prune = True
tree1()

Unnamed: 0,Descr,Value
0,Number of Tree Nodes,3.0
1,Max Number of Branches,2.0
2,Number of Levels,2.0
3,Number of Leaves,2.0
4,Number of Bins,20.0
5,Minimum Size of Leaves,765624.0
6,Maximum Size of Leaves,923324.0
7,Number of Variables,1.0
8,Confidence Level for Pruning,0.25
9,Number of Observations Used,1688948.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),treeModel1,3,24,"CASTable('treeModel1', caslib='CASUSER(username)')"


In [21]:
output1[['_TreeLevel_', '_NodeID_', '_Parent_', '_ParentName_',
         '_NodeType_', '_PBName0_', '_PBName1_']].sort_values('_NodeID_').head(20)

Unnamed: 0,_TreeLevel_,_NodeID_,_Parent_,_ParentName_,_NodeType_,_PBName0_,_PBName1_
0,0.0,0.0,-1.0,,1.0,,
1,1.0,1.0,0.0,DemGender,3.0,M,U
2,1.0,2.0,0.0,DemGender,3.0,F,


Adding more predictors

In [22]:
varlist = ['DemGender', 'DemHomeowner', 'DemAgeGroup', 'DemCluster', 
           'DemReg', 'DemTVReg', 'DemFlag1', 'DemFlag2', 'DemFlag3', 'DemFlag4', 
           'DemFlag5', 'DemFlag6', 'DemFlag7', 'DemFlag8', 'PromClass']

output2 = conn.CASTable('treeModel2')
output2.replace = True

tree2 = organics.dtreetrain
tree2.target = 'TargetBuy'
tree2.inputs = varlist
tree2.casout = output2
tree2()

Unnamed: 0,Descr,Value
0,Number of Tree Nodes,45.0
1,Max Number of Branches,2.0
2,Number of Levels,6.0
3,Number of Leaves,23.0
4,Number of Bins,20.0
5,Minimum Size of Leaves,76.0
6,Maximum Size of Leaves,597284.0
7,Number of Variables,15.0
8,Confidence Level for Pruning,0.25
9,Number of Observations Used,1688948.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),treeModel2,45,130,"CASTable('treeModel2', caslib='CASUSER(username)')"


Increase leaf size

In [23]:
organics.dtreetrain(
    target='TargetBuy',
    inputs = varlist,
    casout=output2, 
    leafSize=1000,
    maxLevel = 4,
)

Unnamed: 0,Descr,Value
0,Number of Tree Nodes,15.0
1,Max Number of Branches,2.0
2,Number of Levels,4.0
3,Number of Leaves,8.0
4,Number of Bins,20.0
5,Minimum Size of Leaves,1216.0
6,Maximum Size of Leaves,889124.0
7,Number of Variables,15.0
8,Confidence Level for Pruning,0.25
9,Number of Observations Used,1688948.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),treeModel2,15,130,"CASTable('treeModel2', caslib='CASUSER(username)')"


Scoring using the decision tree model

In [24]:
organics.dtreescore(modelTable=conn.CASTable('treeModel2'))

Unnamed: 0,Descr,Value
0,Number of Observations Read,1688948.0
1,Number of Observations Used,1688948.0
2,Misclassification Error (%),23.934662287


In [25]:
output3 = conn.CASTable('predicted', replace=True)
organics.dtreeScore(modelTable = output2, casout = output3)
output3.columns

Index(['_DT_PredName_', '_DT_PredP_', '_DT_PredLevel_', '_LeafID_', '_MissIt_',
       '_NumNodes_', '_NodeList0_', '_NodeList1_', '_NodeList2_',
       '_NodeList3_'],
      dtype='object')

In [26]:
output3.head(10)

Unnamed: 0,_DT_PredName_,_DT_PredP_,_DT_PredLevel_,_LeafID_,_MissIt_,_NumNodes_,_NodeList0_,_NodeList1_,_NodeList2_,_NodeList3_
0,No,0.667749,1.0,9.0,0.0,4.0,0.0,1.0,4.0,9.0
1,No,0.840345,1.0,8.0,0.0,4.0,0.0,1.0,3.0,8.0
2,No,0.667749,1.0,9.0,0.0,4.0,0.0,1.0,4.0,9.0
3,No,0.667749,1.0,9.0,0.0,4.0,0.0,1.0,4.0,9.0
4,No,0.840345,1.0,8.0,0.0,4.0,0.0,1.0,3.0,8.0
5,No,0.667749,1.0,9.0,0.0,4.0,0.0,1.0,4.0,9.0
6,No,0.840345,1.0,8.0,0.0,4.0,0.0,1.0,3.0,8.0
7,No,0.667749,1.0,9.0,0.0,4.0,0.0,1.0,4.0,9.0
8,No,0.667749,1.0,9.0,0.0,4.0,0.0,1.0,4.0,9.0
9,No,0.923686,1.0,7.0,0.0,4.0,0.0,1.0,3.0,7.0


## Gradient Boosting, Forests, and Neural Networks

In [27]:
varlist = ['DemGender', 'DemHomeowner', 'DemAgeGroup', 'DemCluster', 
           'DemReg', 'DemTVReg', 'DemFlag1', 'DemFlag2', 'DemFlag3', 'DemFlag4', 
           'DemFlag5', 'DemFlag6', 'DemFlag7', 'DemFlag8', 'PromClass']

output = conn.CASTable('forest1')
output.replace = True

forest1 = organics.foresttrain
forest1.target = 'TargetBuy'
forest1.inputs = varlist
forest1.casout = output
forest1()

Unnamed: 0,Descr,Value
0,Number of Trees,50.0
1,Number of Selected Variables (M),4.0
2,Random Number Seed,0.0
3,Bootstrap Percentage (%),63.212056
4,Number of Bins,20.0
5,Number of Variables,15.0
6,Confidence Level for Pruning,0.25
7,Max Number of Tree Nodes,59.0
8,Min Number of Tree Nodes,25.0
9,Max Number of Branches,2.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),forest1,2196,132,"CASTable('forest1', caslib='CASUSER(username)')"


In [28]:
forest1.varimp = True
result = forest1()
result['DTreeVarImpInfo']

Unnamed: 0,Variable,Importance,Std
0,DemGender,16342.355841,8087.767096
1,DemAgeGroup,5748.171795,2499.726833
2,PromClass,2517.516023,1307.450975
3,DemTVReg,282.827455,98.494842
4,DemFlag6,246.837783,278.700253
5,DemCluster,191.360944,53.261145
6,DemReg,132.455284,53.360679
7,DemFlag1,125.761133,149.488296
8,DemFlag2,109.324912,243.124991
9,DemFlag7,49.987114,40.416881


In [29]:
result['OutputCasTables']

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),forest1,2192,132,"CASTable('forest1', caslib='CASUSER(username)')"


In [30]:
varlist = ['DemGender', 'DemHomeowner', 'DemAgeGroup', 'DemCluster', 
           'DemReg', 'DemTVReg', 'DemFlag1', 'DemFlag2', 'DemFlag3', 'DemFlag4', 
           'DemFlag5', 'DemFlag6', 'DemFlag7', 'DemFlag8', 'PromClass']

output = conn.CASTable('gbtree1', replace=True)
gbtree1 = organics.GBTreeTrain()
gbtree1.target = 'TargetBuy'
gbtree1.inputs = varlist
gbtree1.casout = output
gbtree1()

Unnamed: 0,Descr,Value
0,Number of Trees,50.0
1,Distribution,2.0
2,Learning Rate,0.1
3,Subsampling Rate,0.5
4,Number of Selected Variables (M),15.0
5,Number of Bins,20.0
6,Number of Variables,15.0
7,Max Number of Tree Nodes,63.0
8,Min Number of Tree Nodes,61.0
9,Max Number of Branches,2.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),gbtree1,3148,121,"CASTable('gbtree1', caslib='CASUSER(username)')"


In [31]:
conn.loadactionset('neuralnet')
conn.help(actionset='neuralnet')

NOTE: Added action set 'neuralnet'.
NOTE: Information for action set 'neuralnet':
NOTE:    neuralNet
NOTE:       annTrain - Train an artificial neural network
NOTE:       annScore - Score a table using an artificial neural network model
NOTE:       annCode - Generate DATA step scoring code from an artificial neural network model


Unnamed: 0,name,description
0,annTrain,Train an artificial neural network
1,annScore,Score a table using an artificial neural netwo...
2,annCode,Generate DATA step scoring code from an artifi...


In [32]:
output = conn.CASTable('neural1')
output.replace = True

neural1 = organics.Anntrain()
neural1.target = 'TargetBuy'
neural1.inputs = ['DemAge','DemAffl','DemGender']
neural1.casout = output
neural1.hiddens = [4,2]
neural1.maxIter = 1000
result = neural1()
list(result.keys())

['OptIterHistory', 'ConvergenceStatus', 'ModelInfo', 'OutputCasTables']

In [33]:
result['ModelInfo']

Unnamed: 0,Descr,Value
0,Model,Neural Net
1,Number of Observations Used,1498264
2,Number of Observations Read,1688948
3,Target/Response Variable,TargetBuy
4,Number of Nodes,13
5,Number of Input Nodes,5
6,Number of Output Nodes,2
7,Number of Hidden Nodes,6
8,Number of Hidden Layers,2
9,Number of Weight Parameters,30


In [34]:
organics.annscore(modelTable=output)

Unnamed: 0,Descr,Value
0,Number of Observations Read,1688948.0
1,Number of Observations Used,1498264.0
2,Misclassification Error (%),18.306787055


In [35]:
conn.close()