# Chapter 8: Modeling Continuous Variables

In [1]:
import os

import swat

# Connection settings are taken from the environment so that real host names
# and credentials never have to be typed into (or committed with) the notebook.
# The fallbacks are the documentation placeholders only; set CAS_HOST, CAS_PORT,
# CAS_USERNAME and CAS_PASSWORD before running against a real server.
conn = swat.CAS(
    os.environ.get('CAS_HOST', 'server-name.mycompany.com'),
    int(os.environ.get('CAS_PORT', '5570')),
    os.environ.get('CAS_USERNAME', 'username'),
    os.environ.get('CAS_PASSWORD', 'password'),
)

In [2]:
# Source CSV for the cars data set used throughout this chapter.
cars_csv_url = 'https://raw.githubusercontent.com/sassoftware/sas-viya-programming/master/data/cars.csv'

# Upload the file to the server as table "cars", overwriting any existing copy.
cars = conn.upload_file(cars_csv_url, casout={'name': 'cars', 'replace': True})

NOTE: Cloud Analytic Services made the uploaded file available as table CARS in caslib CASUSER(username).
NOTE: The table CARS has been created in caslib CASUSER(username) from binary data uploaded to Cloud Analytic Services.


In [3]:
# Show table-level metadata for the uploaded table (row and column counts,
# encoding, creation/modification times).
cars.tableinfo()

Unnamed: 0,Name,Rows,Columns,Encoding,CreateTimeFormatted,ModTimeFormatted,JavaCharSet,CreateTime,ModTime,Global,Repeated,View,SourceName,SourceCaslib,Compressed,Creator,Modifier
0,CARS,428,15,utf-8,20Jan2017:10:37:27,20Jan2017:10:37:27,UTF8,1800528000.0,1800528000.0,0,0,0,,,0,username,


In [4]:
# Show per-column metadata (name, type, raw and formatted lengths).
cars.columninfo()

Unnamed: 0,Column,ID,Type,RawLength,FormattedLength,NFL,NFD
0,Make,1,varchar,13,13,0,0
1,Model,2,varchar,39,39,0,0
2,Type,3,varchar,6,6,0,0
3,Origin,4,varchar,6,6,0,0
4,DriveTrain,5,varchar,5,5,0,0
5,MSRP,6,double,8,12,0,0
6,Invoice,7,double,8,12,0,0
7,EngineSize,8,double,8,12,0,0
8,Cylinders,9,double,8,12,0,0
9,Horsepower,10,double,8,12,0,0


## Linear Regressions

In [5]:
# Load the regression action set into the session, then list the actions it
# provides (glm, genmod, logistic).
conn.loadactionset('regression')
conn.help(actionset='regression')

NOTE: Added action set 'regression'.
NOTE: Information for action set 'regression':
NOTE:    regression
NOTE:       glm - Fits linear regression models using the method of least squares
NOTE:       genmod - Fits generalized linear regression models
NOTE:       logistic - Fits logistic regression models


Unnamed: 0,name,description
0,glm,Fits linear regression models using the method...
1,genmod,Fits generalized linear regression models
2,logistic,Fits logistic regression models


Simple linear regression

In [6]:
# Least-squares fit of MSRP on a single predictor, MPG_City, in one call.
cars.glm(target='MSRP', inputs=['MPG_City'])

Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,MSRP

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,428.0
1,NUSED,Number of Observations Used,428.0

Unnamed: 0,RowId,Description,Value
0,NEFFECTS,Number of Effects,2
1,NPARMS,Number of Parameters,2

Unnamed: 0,RowId,Source,DF,SS,MS,FValue,ProbF
0,MODEL,Model,1.0,36380900000.0,36380900000.0,124.13436,1.783404e-25
1,ERROR,Error,426.0,124850700000.0,293076800.0,,
2,TOTAL,Corrected Total,427.0,161231600000.0,,,

Unnamed: 0,RowId,Description,Value
0,RMSE,Root MSE,17119.49
1,RSQUARE,R-Square,0.2256437
2,ADJRSQ,Adj R-Sq,0.223826
3,AIC,AIC,8776.26
4,AICC,AICC,8776.316
5,SBC,SBC,8354.378
6,TRAIN_ASE,ASE,291707300.0

Unnamed: 0,Effect,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,Intercept,1,68124.606698,3278.919093,20.776544,1.0061690000000001e-66
1,MPG_City,MPG_City,1,-1762.135298,158.158758,-11.14156,1.783404e-25

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.015097,0.604168
1,LEVELIZATION,Levelization,0.00138,0.055225
2,INITIALIZATION,Model Initialization,0.000286,0.01145
3,SSCP,SSCP Computation,0.002123,0.084966
4,FITTING,Model Fitting,0.000614,0.024569
5,CLEANUP,Cleanup,0.005266,0.21074
6,TOTAL,Total,0.024988,1.0


Another way to define a model 

In [7]:
# Same model as above, but built as a reusable action object whose parameters
# are set one attribute at a time before it is run.
linear1 = cars.Glm()
linear1.inputs = ['MPG_City']
linear1.target = 'MSRP'

linear1()

Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,MSRP

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,428.0
1,NUSED,Number of Observations Used,428.0

Unnamed: 0,RowId,Description,Value
0,NEFFECTS,Number of Effects,2
1,NPARMS,Number of Parameters,2

Unnamed: 0,RowId,Source,DF,SS,MS,FValue,ProbF
0,MODEL,Model,1.0,36380900000.0,36380900000.0,124.13436,1.783404e-25
1,ERROR,Error,426.0,124850700000.0,293076800.0,,
2,TOTAL,Corrected Total,427.0,161231600000.0,,,

Unnamed: 0,RowId,Description,Value
0,RMSE,Root MSE,17119.49
1,RSQUARE,R-Square,0.2256437
2,ADJRSQ,Adj R-Sq,0.223826
3,AIC,AIC,8776.26
4,AICC,AICC,8776.316
5,SBC,SBC,8354.378
6,TRAIN_ASE,ASE,291707300.0

Unnamed: 0,Effect,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,Intercept,1,68124.606698,3278.919093,20.776544,1.0061690000000001e-66
1,MPG_City,MPG_City,1,-1762.135298,158.158758,-11.14156,1.783404e-25

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.005691,0.424574
1,LEVELIZATION,Levelization,0.001746,0.130254
2,INITIALIZATION,Model Initialization,8.5e-05,0.00635
3,SSCP,SSCP Computation,0.000758,0.056545
4,FITTING,Model Fitting,0.000757,0.056474
5,CLEANUP,Cleanup,0.00411,0.306629
6,TOTAL,Total,0.013404,1.0


In [8]:
# Restrict the displayed results to the parameter-estimates table only,
# then re-run the model.
linear1.display.names = ['ParameterEstimates']
linear1()

Unnamed: 0,Effect,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,Intercept,1,68124.606698,3278.919093,20.776544,1.0061690000000001e-66
1,MPG_City,MPG_City,1,-1762.135298,158.158758,-11.14156,1.783404e-25


Scoring

In [9]:
# Drop the display filter set earlier so that all result tables are shown again.
del linear1.display.names

# Output table for the scored data; it is replaced if it already exists.
result1 = conn.CASTable('MSRPPrediction', replace=True)

# Copy every input column into the output table alongside the prediction.
linear1.output.copyVars = 'ALL'
linear1.output.casout = result1

linear1()

Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,MSRP

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,428.0
1,NUSED,Number of Observations Used,428.0

Unnamed: 0,RowId,Description,Value
0,NEFFECTS,Number of Effects,2
1,NPARMS,Number of Parameters,2

Unnamed: 0,RowId,Source,DF,SS,MS,FValue,ProbF
0,MODEL,Model,1.0,36380900000.0,36380900000.0,124.13436,1.783404e-25
1,ERROR,Error,426.0,124850700000.0,293076800.0,,
2,TOTAL,Corrected Total,427.0,161231600000.0,,,

Unnamed: 0,RowId,Description,Value
0,RMSE,Root MSE,17119.49
1,RSQUARE,R-Square,0.2256437
2,ADJRSQ,Adj R-Sq,0.223826
3,AIC,AIC,8776.26
4,AICC,AICC,8776.316
5,SBC,SBC,8354.378
6,TRAIN_ASE,ASE,291707300.0

Unnamed: 0,Effect,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,Intercept,1,68124.606698,3278.919093,20.776544,1.0061690000000001e-66
1,MPG_City,MPG_City,1,-1762.135298,158.158758,-11.14156,1.783404e-25

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.007779,0.07792
1,LEVELIZATION,Levelization,0.001535,0.015375
2,INITIALIZATION,Model Initialization,6.5e-05,0.00065
3,SSCP,SSCP Computation,0.000779,0.007805
4,FITTING,Model Fitting,0.000519,0.005199
5,OUTPUT,Creating Output Data,0.084094,0.842357
6,CLEANUP,Cleanup,0.004831,0.04839
7,TOTAL,Total,0.099832,1.0

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(username),MSRPPrediction,,428,16,"CASTable('MSRPPrediction', caslib='CASUSER(kes..."


In [10]:
# Summary statistics of the predicted values written to the score table.
result1[['pred']].summary()

Unnamed: 0,Column,Min,Max,N,NMiss,Mean,Sum,Std,StdErr,Var,USS,CSS,CV,TValue,ProbT
0,Pred,-37603.511169,50503.25372,428.0,0.0,32774.85514,14027638.0,9230.448198,446.170554,85201170.0,496134700000.0,36380900000.0,28.163201,73.458131,2.1822030000000003e-244


Output more information in the score table

In [11]:
# Score again into a second table, this time naming the output columns
# explicitly: prediction, residual, and lower/upper confidence limits.
result2 = conn.CASTable('MSRPPrediction3')
result2.replace = True
linear1.output.casout = result2
linear1.output.pred  = 'Predicted_MSRP'
# The residual column must be called 'Residual_MSRP': the scatter plot that
# follows reads it under that name.
linear1.output.resid = 'Residual_MSRP'
linear1.output.lcl = 'LCL_MSRP'
linear1.output.ucl = 'UCL_MSRP'
linear1()

Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,MSRP

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,428.0
1,NUSED,Number of Observations Used,428.0

Unnamed: 0,RowId,Description,Value
0,NEFFECTS,Number of Effects,2
1,NPARMS,Number of Parameters,2

Unnamed: 0,RowId,Source,DF,SS,MS,FValue,ProbF
0,MODEL,Model,1.0,36380900000.0,36380900000.0,124.13436,1.783404e-25
1,ERROR,Error,426.0,124850700000.0,293076800.0,,
2,TOTAL,Corrected Total,427.0,161231600000.0,,,

Unnamed: 0,RowId,Description,Value
0,RMSE,Root MSE,17119.49
1,RSQUARE,R-Square,0.2256437
2,ADJRSQ,Adj R-Sq,0.223826
3,AIC,AIC,8776.26
4,AICC,AICC,8776.316
5,SBC,SBC,8354.378
6,TRAIN_ASE,ASE,291707300.0

Unnamed: 0,Effect,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,Intercept,1,68124.606698,3278.919093,20.776544,1.0061690000000001e-66
1,MPG_City,MPG_City,1,-1762.135298,158.158758,-11.14156,1.783404e-25

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.007907,0.077416
1,LEVELIZATION,Levelization,0.001566,0.015332
2,INITIALIZATION,Model Initialization,7e-05,0.000686
3,SSCP,SSCP Computation,0.000683,0.006685
4,FITTING,Model Fitting,0.000517,0.005061
5,OUTPUT,Creating Output Data,0.084996,0.832168
6,CLEANUP,Cleanup,0.006178,0.060488
7,TOTAL,Total,0.102138,1.0

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(username),MSRPPrediction3,,428,19,"CASTable('MSRPPrediction3', caslib='CASUSER(ke..."


Use a scatter plot of predicted values against residuals to check the model fit

In [12]:
# NOTE(review): bokeh.charts was moved out of Bokeh into the separate
# `bkcharts` package in later releases -- confirm the installed Bokeh
# version still provides this module.
from bokeh.charts import Scatter, output_file, output_notebook, show

# Pull the scored table down to the client as a DataFrame for plotting.
out1 = result2.to_frame()

# Residuals against predictions, with colour and marker both keyed on Origin.
residual_plot_opts = dict(
    x='Predicted_MSRP',
    y='Residual_MSRP',
    color='Origin',
    marker='Origin',
)
p = Scatter(out1, **residual_plot_opts)

# Render inline in the notebook; to write a standalone file instead,
# call output_file('scatter.html') here.
output_notebook()
show(p)

Investigate which observations have negative predicted MSRP values

In [13]:
# Columns needed to identify the cars whose predicted price is negative.
negative_pred_columns = ['Predicted_MSRP', 'MSRP', 'MPG_City', 'Make', 'Model']

result2[negative_pred_columns].query('Predicted_MSRP < 0').to_frame()

Unnamed: 0,Predicted_MSRP,MSRP,MPG_City,Make,Model
0,-12933.617,20140.0,46.0,Honda,Civic Hybrid 4dr manual (gas/electric)
1,-37603.511169,19110.0,60.0,Honda,Insight 2dr (gas/electric)
2,-35841.375871,20510.0,59.0,Toyota,Prius 4dr (gas/electric)


In [14]:
# Raw relationship between the predictor and the target, by Origin.
mpg_msrp_plot_opts = dict(
    x='MPG_City',
    y='MSRP',
    color='Origin',
    marker='Origin',
)
p = Scatter(out1, **mpg_msrp_plot_opts)

# Render inline in the notebook; to write a standalone file instead,
# call output_file('scatter.html') here.
output_notebook()
show(p)

Remove outliers

In [15]:
# Refit the simple model with the outliers excluded.  The filter is applied
# through a derived table rather than by setting `cars.where`, so `cars`
# itself is not left filtered for later cells, and the action object is
# created exactly once.
cars_no_outliers = cars.query('MSRP < 100000 and MPG_City < 40')

# Output table for the scored, filtered data; replaced if it already exists.
result2 = conn.CASTable('cas.MSRPPrediction2')
result2.replace = True

linear2 = cars_no_outliers.Glm()
linear2.target = 'MSRP'
linear2.inputs = ['MPG_City']

# Write all input columns plus prediction, residual and confidence limits.
linear2.output.casout = result2
linear2.output.copyVars = 'ALL'
linear2.output.pred = 'Predicted_MSRP'
linear2.output.resid = 'Residual_MSRP'
linear2.output.lcl = 'LCL_MSRP'
linear2.output.ucl = 'UCL_MSRP'
linear2()

Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,MSRP

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,421.0
1,NUSED,Number of Observations Used,421.0

Unnamed: 0,RowId,Description,Value
0,NEFFECTS,Number of Effects,2
1,NPARMS,Number of Parameters,2

Unnamed: 0,RowId,Source,DF,SS,MS,FValue,ProbF
0,MODEL,Model,1.0,37217200000.0,37217200000.0,217.329426,6.373036e-40
1,ERROR,Error,419.0,71752850000.0,171247900.0,,
2,TOTAL,Corrected Total,420.0,108970100000.0,,,

Unnamed: 0,RowId,Description,Value
0,RMSE,Root MSE,13086.17
1,RSQUARE,R-Square,0.341536
2,ADJRSQ,Adj R-Sq,0.3399645
3,AIC,AIC,8406.575
4,AICC,AICC,8406.633
5,SBC,SBC,7991.661
6,TRAIN_ASE,ASE,170434300.0

Unnamed: 0,Effect,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,Intercept,1,75304.372444,3017.469237,24.956136,6.743006e-85
1,MPG_City,MPG_City,1,-2188.496867,148.452209,-14.742097,6.373036e-40

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.008729,0.086034
1,LEVELIZATION,Levelization,0.001676,0.016517
2,INITIALIZATION,Model Initialization,6.4e-05,0.00063
3,SSCP,SSCP Computation,0.000714,0.007038
4,FITTING,Model Fitting,0.000514,0.005066
5,OUTPUT,Creating Output Data,0.083871,0.826625
6,CLEANUP,Cleanup,0.005674,0.055924
7,TOTAL,Total,0.101462,1.0

Unnamed: 0,casLib,Name,Label,Rows,Columns,casTable
0,CASUSER(username),cas.MSRPPrediction2,,421,19,"CASTable('cas.MSRPPrediction2', caslib='CASUSE..."


Check the model fit again

In [16]:
# Pull the re-scored (outlier-free) table down to the client for plotting.
out2 = result2.to_frame()

# Residuals against predictions, with colour and marker both keyed on Origin.
refit_plot_opts = dict(
    x='Predicted_MSRP',
    y='Residual_MSRP',
    color='Origin',
    marker='Origin',
)
p = Scatter(out2, **refit_plot_opts)

# Render inline in the notebook; to write a standalone file instead,
# call output_file('scatter.html') here.
output_notebook()
show(p)

Adding more predictors

In [17]:
# Predictors, split into nominal (classification) and continuous effects.
nomList = ['Origin', 'Type', 'DriveTrain']
contList = ['MPG_City', 'Weight', 'Length']

# Start from a fresh handle on the cars table.
cars_unfiltered = conn.CASTable('cars')
linear3 = cars_unfiltered.Glm()

# Only the fit statistics and parameter estimates are displayed.
linear3.display.names = ['FitStatistics', 'ParameterEstimates']
linear3.nominals = nomList
linear3.inputs = nomList + contList
linear3.target = 'MSRP'

linear3()

Unnamed: 0,RowId,Description,Value
0,RMSE,Root MSE,12015.14
1,RSQUARE,R-Square,0.6284172
2,ADJRSQ,Adj R-Sq,0.6176727
3,AIC,AIC,8483.996
4,AICC,AICC,8485.013
5,SBC,SBC,8106.765
6,TRAIN_ASE,ASE,139978700.0

Unnamed: 0,Effect,Origin,Type,DriveTrain,Parameter,DF,Estimate,StdErr,tValue,Probt
0,Intercept,,,,Intercept,1,-23692.980669,16261.000069,-1.457043,0.1458607
1,Origin,Asia,,,Origin Asia,1,2191.206289,1479.7567,1.480788,0.1394218
2,Origin,Europe,,,Origin Europe,1,17100.937866,1779.533025,9.60979,7.112422999999999e-20
3,Origin,USA,,,Origin USA,0,0.0,,,
4,Type,,Hybrid,,Type Hybrid,1,26154.719438,10602.173003,2.466921,0.01403098
5,Type,,SUV,,Type SUV,1,-1016.065543,3083.503255,-0.329517,0.7419315
6,Type,,Sedan,,Type Sedan,1,2481.367175,2359.814614,1.051509,0.2936366
7,Type,,Sports,,Type Sports,1,21015.571095,3065.180416,6.856226,2.572647e-11
8,Type,,Truck,,Type Truck,1,-12891.562541,3592.436933,-3.588529,0.000372256
9,Type,,Wagon,,Type Wagon,0,0.0,,,


Group-by regression

In [18]:
cars = conn.CASTable('cars')

# Summarise MSRP separately for each Origin, then stack the per-group
# results into a single set of tables.
msrp_by_origin = cars.groupby('Origin')[['MSRP']]
out = msrp_by_origin.summary().concat_bygroups()

out['Summary'][['Column', 'Mean', 'Var', 'Std']]

Unnamed: 0_level_0,Column,Mean,Var,Std
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia,MSRP,24741.322785,128166600.0,11321.069675
Europe,MSRP,48349.796748,641031500.0,25318.600464
USA,MSRP,28377.442177,137170500.0,11711.982506


In [19]:
# Fresh table handle, grouped by Origin and with the outliers filtered out,
# so one model is fitted per Origin value.
cars = conn.CASTable('cars')
cars.groupby = ['Origin']
cars.where = 'MSRP < 100000 and MPG_City < 40'

# Origin is the grouping variable here, so it is not used as a predictor.
nomList = ['Type', 'DriveTrain']
contList = ['MPG_City', 'Weight', 'Length']

# Output table for the scored data.  replace=True matches the other output
# tables in this notebook and lets the cell be re-run in the same session.
groupBYResult = conn.CASTable('MSRPPredictionGroupBy', replace=True)

linear4 = cars.glm
linear4.target = 'MSRP'
linear4.inputs = nomList + contList
linear4.nominals = nomList

# Only the fit statistics and parameter estimates are displayed.
linear4.display.names = ['FitStatistics', 'ParameterEstimates']

# Write all input columns plus prediction, residual and confidence limits.
linear4.output.casout = groupBYResult
linear4.output.copyVars = 'ALL'
linear4.output.pred = 'Predicted_MSRP'
linear4.output.resid = 'Residual_MSRP'
linear4.output.lcl = 'LCL_MSRP'
linear4.output.ucl = 'UCL_MSRP'
linear4()

Unnamed: 0,Origin,Origin_f,_key_
0,Asia,Asia,Asia
1,Europe,Europe,Europe
2,USA,USA,USA

Unnamed: 0_level_0,RowId,Description,Value
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Asia,RMSE,Root MSE,7023.321
Asia,RSQUARE,R-Square,0.6432711
Asia,ADJRSQ,Adj R-Sq,0.6211293
Asia,AIC,AIC,2912.33
Asia,AICC,AICC,2914.176
Asia,SBC,SBC,2785.764
Asia,TRAIN_ASE,ASE,46144650.0

Unnamed: 0_level_0,Effect,Type,DriveTrain,Parameter,DF,Estimate,StdErr,tValue,Probt
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Asia,Intercept,,,Intercept,1,-7800.227372,19135.59784,-0.407629,0.6841472
Asia,Type,SUV,,Type SUV,1,-968.795973,2805.772482,-0.345287,0.7303792
Asia,Type,Sedan,,Type Sedan,1,3766.422921,2436.960318,1.545541,0.1243951
Asia,Type,Sports,,Type Sports,1,9154.062848,2929.463572,3.124826,0.002149295
Asia,Type,Truck,,Type Truck,1,-11550.233853,3982.945107,-2.899923,0.004313916
Asia,Type,Wagon,,Type Wagon,0,0.0,,,
Asia,DriveTrain,,All,DriveTrain All,1,-8003.325899,2150.711618,-3.721245,0.0002828697
Asia,DriveTrain,,Front,DriveTrain Front,1,-12340.999993,1974.140868,-6.251327,4.295785e-09
Asia,DriveTrain,,Rear,DriveTrain Rear,0,0.0,,,
Asia,MPG_City,,,MPG_City,1,17.219012,250.187094,0.068825,0.9452241

Unnamed: 0_level_0,RowId,Description,Value
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Europe,RMSE,Root MSE,8984.9
Europe,RSQUARE,R-Square,0.7682232
Europe,ADJRSQ,Adj R-Sq,0.7513667
Europe,AIC,AIC,2296.227
Europe,AICC,AICC,2298.264
Europe,SBC,SBC,2200.239
Europe,TRAIN_ASE,ASE,74622910.0

Unnamed: 0_level_0,Effect,Type,DriveTrain,Parameter,DF,Estimate,StdErr,tValue,Probt
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Europe,Intercept,,,Intercept,1,-97268.426796,23984.147398,-4.05553,9.371452e-05
Europe,Type,SUV,,Type SUV,1,-8680.048584,4832.887423,-1.796038,0.07523358
Europe,Type,Sedan,,Type Sedan,1,3983.43257,2826.381315,1.409375,0.161545
Europe,Type,Sports,,Type Sports,1,28690.13797,3766.149186,7.617897,9.639114e-12
Europe,Type,Wagon,,Type Wagon,0,0.0,,,
Europe,DriveTrain,,All,DriveTrain All,1,-5872.661455,2354.280432,-2.494461,0.01410149
Europe,DriveTrain,,Front,DriveTrain Front,1,-5977.536993,2282.743859,-2.618575,0.0100744
Europe,DriveTrain,,Rear,DriveTrain Rear,0,0.0,,,
Europe,MPG_City,,,MPG_City,1,-572.454386,438.201671,-1.306372,0.194151
Europe,Weight,,,Weight,1,15.461755,3.107484,4.975652,2.418969e-06

Unnamed: 0_level_0,RowId,Description,Value
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
USA,RMSE,Root MSE,7336.632
USA,RSQUARE,R-Square,0.631786
USA,ADJRSQ,Adj R-Sq,0.6075967
USA,AIC,AIC,2775.43
USA,AICC,AICC,2777.386
USA,SBC,SBC,2656.335
USA,TRAIN_ASE,ASE,50164520.0

Unnamed: 0_level_0,Effect,Type,DriveTrain,Parameter,DF,Estimate,StdErr,tValue,Probt
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
USA,Intercept,,,Intercept,1,27189.194732,17038.936722,1.59571,0.112858
USA,Type,SUV,,Type SUV,1,-2642.099747,3692.428467,-0.715545,0.47549
USA,Type,Sedan,,Type Sedan,1,1143.294987,2930.398473,0.39015,0.697032
USA,Type,Sports,,Type Sports,1,16249.501038,4286.648839,3.790724,0.000224
USA,Type,Truck,,Type Truck,1,-8503.479255,3650.165897,-2.329614,0.021288
USA,Type,Wagon,,Type Wagon,0,0.0,,,
USA,DriveTrain,,All,DriveTrain All,1,-2882.556364,2448.067975,-1.177482,0.241045
USA,DriveTrain,,Front,DriveTrain Front,1,-660.20584,1827.536082,-0.361255,0.718466
USA,DriveTrain,,Rear,DriveTrain Rear,0,0.0,,,
USA,MPG_City,,,MPG_City,1,-1069.867684,340.679897,-3.14039,0.002067


In [20]:
# Pull the by-group scored table down to the client for plotting.
out = groupBYResult.to_frame()

# Residuals against predictions, with colour and marker both keyed on Origin.
groupby_plot_opts = dict(
    x='Predicted_MSRP',
    y='Residual_MSRP',
    color='Origin',
    marker='Origin',
)
p = Scatter(out, **groupby_plot_opts)

# Render inline in the notebook; to write a standalone file instead,
# call output_file('scatter.html') here.
output_notebook()
show(p)

## Extensions of Ordinary Linear Regression

### Generalized Linear Models

Gamma Regression

In [21]:
cars = conn.CASTable('cars')

# Generalized linear model of MSRP on MPG_City, using a gamma distribution
# with a log link.
genmodModel1 = cars.Genmod()
genmodModel1.model.dist = 'gamma'
genmodModel1.model.link = 'log'
genmodModel1.model.effects = ['MPG_City']
genmodModel1.model.depvars = 'MSRP'

genmodModel1()

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,MSRP
2,DIST,Distribution,Gamma
3,LINK,Link Function,Log
4,TECH,Optimization Technique,Newton-Raphson with Ridging

Unnamed: 0,RowId,Description,Value
0,NREAD,Number of Observations Read,428.0
1,NUSED,Number of Observations Used,428.0

Unnamed: 0,Reason,Status,MaxGradient
0,Convergence criterion (GCONV=1E-8) satisfied.,0,1.068552e-09

Unnamed: 0,RowId,Description,Value
0,NDESIGNCOLS,Columns in Design,2
1,NEFFECTS,Number of Effects,2
2,MAXEFCOLS,Max Effect Columns,1
3,DESIGNRANK,Rank of Design,2
4,OPTPARM,Parameters in Optimization,3

Unnamed: 0,RowId,Description,Value
0,M2LL,-2 Log Likelihood,9270.853164
1,AIC,AIC (smaller is better),9276.853164
2,AICC,AICC (smaller is better),9276.909768
3,SBC,SBC (smaller is better),9289.030533

Unnamed: 0,Effect,Parameter,ParmName,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,Intercept,Intercept,1,11.30779,0.059611,35983.929066,0.0
1,MPG_City,MPG_City,MPG_City,1,-0.0474,0.002801,286.44537,2.9586549999999997e-64
2,Dispersion,Dispersion,Dispersion,1,5.886574,0.391526,,

Unnamed: 0,RowId,Task,Time,RelTime
0,SETUP,Setup and Parsing,0.017091,0.101825
1,LEVELIZATION,Levelization,0.00136,0.008102
2,INITIALIZATION,Model Initialization,0.01446,0.08615
3,SSCP,SSCP Computation,0.004074,0.024271
4,FITTING,Model Fitting,0.125968,0.750493
5,CLEANUP,Cleanup,0.004377,0.026077
6,TOTAL,Total,0.167847,1.0


Multinomial Regression

In [22]:
# Reuse the same action object, switching it to a multinomial model of
# Cylinders with a logit link.
genmodModel1.model.dist = 'multinomial'
genmodModel1.model.link = 'logit'
genmodModel1.model.effects = ['MPG_City']
genmodModel1.model.depvars = 'Cylinders'

# Only the model information and parameter estimates are displayed.
genmodModel1.display.names = ['ModelInfo', 'ParameterEstimates']

genmodModel1()

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,RowId,Description,Value
0,DATA,Data Source,CARS
1,RESPONSEVAR,Response Variable,Cylinders
2,NLEVELS,Number of Response Levels,7
3,DIST,Distribution,Multinomial
4,LINKTYPE,Link Type,Cumulative
5,LINK,Link Function,Logit
6,TECH,Optimization Technique,Newton-Raphson with Ridging

Unnamed: 0,Effect,Parameter,ParmName,Outcome,Cylinders,DF,Estimate,StdErr,ChiSq,ProbChiSq
0,Intercept,Intercept,Intercept_3,3.0,3.0,1,-60.329075,4.829533,156.042532,8.286542e-36
1,Intercept,Intercept,Intercept_4,4.0,4.0,1,-21.461149,1.584887,183.361936,8.941487999999999e-42
2,Intercept,Intercept,Intercept_5,5.0,5.0,1,-21.233691,1.575766,181.579751,2.190306e-41
3,Intercept,Intercept,Intercept_6,6.0,6.0,1,-16.632445,1.337275,154.693103,1.634032e-35
4,Intercept,Intercept,Intercept_8,8.0,8.0,1,-10.988487,1.13947,92.99719,5.236863e-22
5,Intercept,Intercept,Intercept_10,10.0,10.0,1,-10.31422,1.186541,75.562638,3.539969e-18
6,MPG_City,MPG_City,MPG_City,,,1,1.013934,0.077371,171.734698,3.092446e-39


Score the input table

In [23]:
# Output table for the scored data; replaced if it already exists.
genmodResult = conn.CASTable('CylinderPredicted')
genmodResult.replace = True

# Write all input columns plus the predicted probability.
genmodModel1.output.pred = 'Prob_Cylinders'
genmodModel1.output.copyVars = 'ALL'
genmodModel1.output.casout = genmodResult
genmodModel1()

# Show the first rows of the score table: the predicted probability for each
# response level, next to the observed Cylinders and MPG_City values.
scored_columns = ['Prob_Cylinders', '_level_', 'Cylinders', 'MPG_City']
genmodResult[scored_columns].head(24)

NOTE: Convergence criterion (GCONV=1E-8) satisfied.


Unnamed: 0,Prob_Cylinders,_LEVEL_,Cylinders,MPG_City
0,1.9288419999999997e-19,3.0,6.0,17.0
1,0.01442488,4.0,6.0,17.0
2,0.01804258,5.0,6.0,17.0
3,0.6466697,6.0,6.0,17.0
4,0.9980702,8.0,6.0,17.0
5,0.9990158,10.0,6.0,17.0
6,2.331945e-16,3.0,4.0,24.0
7,0.946509,4.0,4.0,24.0
8,0.9569226,5.0,4.0,24.0
9,0.9995483,6.0,4.0,24.0


### Regression Trees

In [24]:
# Load the decision tree action set into the session, then list the actions
# it provides (tree, forest and gradient-boosting training/scoring).
conn.loadactionset('decisiontree')
conn.help(actionset='decisiontree')

NOTE: Added action set 'decisiontree'.
NOTE: Information for action set 'decisiontree':
NOTE:    decisionTree
NOTE:       dtreeTrain - Train a decision tree
NOTE:       dtreeScore - Score a table using a decision tree model
NOTE:       dtreeSplit - Split decision tree nodes
NOTE:       dtreePrune - Prune a decision tree
NOTE:       dtreeMerge - Merge decision tree nodes
NOTE:       dtreeCode - Generate DATA step scoring code from a decision tree model
NOTE:       forestTrain - Train a forest
NOTE:       forestScore - Score a table using a forest model
NOTE:       forestCode - Generate DATA step scoring code from a forest model
NOTE:       gbtreeTrain - Train a gradient boosting tree
NOTE:       gbtreeScore - Score a table using a gradient boosting tree model
NOTE:       gbtreecode - Generate DATA step scoring code from a gradient boosting tree model


Unnamed: 0,name,description
0,dtreeTrain,Train a decision tree
1,dtreeScore,Score a table using a decision tree model
2,dtreeSplit,Split decision tree nodes
3,dtreePrune,Prune a decision tree
4,dtreeMerge,Merge decision tree nodes
5,dtreeCode,Generate DATA step scoring code from a decisio...
6,forestTrain,Train a forest
7,forestScore,Score a table using a forest model
8,forestCode,Generate DATA step scoring code from a forest ...
9,gbtreeTrain,Train a gradient boosting tree


In [25]:
cars = conn.CASTable('cars')

# Table that receives the trained tree model; replaced if it already exists.
output1 = conn.CASTable('treeModel1', replace=True)

# Regression tree of MSRP on MPG_City, limited to two levels.
tree1 = cars.dtreetrain
tree1.maxlevel = 2
tree1.inputs = ['MPG_City']
tree1.target = 'MSRP'
tree1.casout = output1

tree1()

Unnamed: 0,Descr,Value
0,Number of Tree Nodes,3.0
1,Max Number of Branches,2.0
2,Number of Levels,2.0
3,Number of Leaves,2.0
4,Number of Bins,20.0
5,Minimum Size of Leaves,202.0
6,Maximum Size of Leaves,226.0
7,Number of Variables,1.0
8,Alpha for Cost-Complexity Pruning,0.0
9,Number of Observations Used,428.0

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(username),treeModel1,3,23,"CASTable('treeModel1', caslib='CASUSER(username)')"


In [26]:
# Per-node columns of the stored tree model: node id, parent, node mean,
# node name and the lower/upper bounds of the split.
tree_node_columns = [
    '_NodeID_',
    '_Parent_',
    '_Mean_',
    '_NodeName_',
    '_PBLower0_',
    '_PBUpper0_',
]

output1[tree_node_columns].fetch()

Unnamed: 0,_NodeID_,_Parent_,_Mean_,_NodeName_,_PBLower0_,_PBUpper0_
0,0.0,-1.0,32774.85514,MPG_City,,
1,1.0,0.0,22875.341584,MSRP,20.0,60.0
2,2.0,0.0,41623.09292,MSRP,10.0,20.0


In [27]:
# Close the connection to the CAS server now that the chapter is finished.
conn.close()