In [2]:
import saspy
import pandas as pd

## Start a SAS session

You can configure the connection types you want in `~/.config/saspy/sascfg_personal.py`.

We'll start a local connection to Windows, but you can easily connect to SAS Viya, SAS Mainframe, or a remote SAS server. 

For more information on how to change how you connect to SAS, see:
https://sassoftware.github.io/saspy/getting-started.html#start-a-sas-session

Once you're connected, it's a full SAS session. You can do anything in SAS that you could do in another editor. We'll look at an example of this later. For now, we'll focus on some of the Pythonic aspects.

In [3]:
# Open a SAS session using the 'winlocal' configuration entry.
# The resulting `sas` handle is what every later cell uses to talk to SAS.
sas = saspy.SASsession(cfgname='winlocal')

SAS Connection established. Subprocess id is 5908



## Read Data Directly

Now that we have started a SAS session, we can read the data directly with sas.read_csv, and save the table
to the WORK library as "insurance." 

We'll reference this table in Python as a variable of the same name.

In [4]:
# Location of the raw insurance CSV on GitHub.
url = 'https://github.com/stu-code/sas-tips/raw/refs/heads/main/data/insurance.csv'
# Read the CSV through the SAS session into a table named 'insurance';
# the returned object is the Python-side handle used by the cells below.
insurance = sas.read_csv(url, table='insurance')

## Reading Data Using Pandas

But you don't have to use sas.read_csv. If you already have a pandas DataFrame, you can easily convert back and forth between a DataFrame and a SAS dataset with these functions:
- sas.df2sd(): Dataframe to SAS Dataset
- sas.sd2df(): SAS Dataset to Dataframe

As a bonus, you can even go to Parquet:
- sas.sd2pq(): SAS Dataset to Parquet

In [4]:
# Alternative route: load the same CSV into a pandas DataFrame first...
url = 'https://github.com/stu-code/sas-tips/raw/refs/heads/main/data/insurance.csv'
df_insurance = pd.read_csv(url)
# ...then send the DataFrame to SAS as the table 'insurance'.
# NOTE: this rebinds both `url` and `insurance`, replacing the values set by the sas.read_csv cell.
insurance = sas.df2sd(df_insurance, table='insurance')

## Explore the data

We can do things like look at the first 5 rows, get information about the columns, perform summary statistics, and even build graphs.

All of these features are pandas-like, so there's a very low learning curve.

In [14]:
# Preview the first rows of the SAS table.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,4449.462
3,33.0,male,22.705,0.0,no,northwest,21984.47061
4,32.0,male,28.88,0.0,no,northwest,3866.8552


In [18]:
# Per-variable overview: type (char/numeric) with the N and Nmiss counts.
insurance.info()

Unnamed: 0,Variable,type,N,Nmiss
0,sex,char,1338.0,0.0
1,smoker,char,1338.0,0.0
2,region,char,1338.0,0.0
3,age,numeric,1338.0,0.0
4,bmi,numeric,1338.0,0.0
5,children,numeric,1338.0,0.0
6,charges,numeric,1338.0,0.0


In [20]:
# Column metadata for the table: variable number, type, length, position, format and informat.
insurance.columnInfo()

Unnamed: 0,Member,Num,Variable,Type,Len,Pos,Format,Informat
0,WORK.INSURANCE,1.0,age,Num,8.0,0.0,BEST12.,BEST32.
1,WORK.INSURANCE,3.0,bmi,Num,8.0,8.0,BEST12.,BEST32.
2,WORK.INSURANCE,7.0,charges,Num,8.0,24.0,BEST12.,BEST32.
3,WORK.INSURANCE,4.0,children,Num,8.0,16.0,BEST12.,BEST32.
4,WORK.INSURANCE,6.0,region,Char,9.0,41.0,$9.,$9.
5,WORK.INSURANCE,2.0,sex,Char,6.0,32.0,$6.,$6.
6,WORK.INSURANCE,5.0,smoker,Char,3.0,38.0,$3.,$3.


In [21]:
# Summary statistics (counts, mean, standard deviation, quartiles, min/max) for the numeric variables.
insurance.describe()

Unnamed: 0,Variable,N,NMiss,Median,Mean,StdDev,Min,P25,P50,P75,Max
0,age,1338.0,0.0,39.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
1,bmi,1338.0,0.0,30.4,30.663397,6.098187,15.96,26.29,30.4,34.7,53.13
2,children,1338.0,0.0,1.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
3,charges,1338.0,0.0,9382.033,13270.422265,12110.011237,1121.8739,4738.2682,9382.033,16657.71745,63770.42801


## Build a few models

Let's build a few regression models and compare their results. We'll compare:
- Linear Regression
- Random Forest

In [5]:
# Create the modelling interfaces from the open session:
# `stat` is used for the GLM fit and `ml` for the forest fit in the cells that follow.
stat = sas.sasstat()
ml   = sas.sasml()

In [6]:
# Fit a linear model with `age` as the response and the remaining six columns as predictors.
reg = stat.glm(
    data=insurance,
    # The three character columns are declared as classification variables.
    cls=['region', 'sex', 'smoker'],
    model='age = bmi charges children region sex smoker',
    # Raw statement handed to the procedure: writes the table reg_preds,
    # with the prediction stored in the column p_age.
    stmtpassthrough='output out=reg_preds p=p_age'
)

In [7]:
# Get a Python handle on the reg_preds table written by the GLM cell, then preview it.
reg_preds = sas.sasdata('reg_preds')
reg_preds.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,p_age
0,19.0,female,27.9,0.0,yes,southwest,16884.924,24.495347
1,18.0,male,33.77,1.0,no,southeast,1725.5523,31.718566
2,28.0,male,33.0,3.0,no,southeast,4449.462,34.317552
3,33.0,male,22.705,0.0,no,northwest,21984.47061,54.096309
4,32.0,male,28.88,0.0,no,northwest,3866.8552,34.950952


In [8]:
# Fit a forest model for the same target (`age`) using the same six inputs as the regression.
forest = ml.hpforest(
    data=insurance,
    input=['bmi', 'charges', 'children', 'region', 'sex', 'smoker'],
    target='age',
    # Scored output is written to the table forest_preds.
    score='out=forest_preds'
)

In [9]:
# Get a Python handle on the forest_preds table written by the forest cell, then preview it.
forest_preds = sas.sasdata('forest_preds')
forest_preds.head()

Unnamed: 0,age,P_age,R_age,_WARN_
0,19.0,24.217934,-5.217934,
1,18.0,19.415152,-1.415152,
2,28.0,27.679111,0.320889,
3,33.0,39.900375,-6.900375,
4,32.0,30.056389,1.943611,


## Compare models using the assessModel method

We'll take a look at the root average squared error (RootAverageSquaredError) of each model and see which one is better; lower is better.

In [12]:
# Assess the regression predictions and show only the root average squared error.
( 
    # nominal=False because `age` is a numeric target; p_age is the prediction column in reg_preds.
    reg_preds.assessModel(target='age', prediction='p_age', nominal=False)
    .assessmentstatistics
    ['RootAverageSquaredError']
)

0    11.984571
Name: RootAverageSquaredError, dtype: float64

In [13]:
# Assess the forest predictions and show only the root average squared error.
( 
    # NOTE(review): forest_preds displays this column as P_age; the lowercase
    # spelling used here was accepted and produced the result shown for this cell.
    forest_preds.assessModel(target='age', prediction='p_age', nominal=False)
    .assessmentstatistics
    ['RootAverageSquaredError']
)

0    4.310121
Name: RootAverageSquaredError, dtype: float64