In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

In [2]:
# Load the first SAS transport (XPORT) table; the relative path means the file
# must be in the notebook's working directory.
bmx = pd.read_sas('BMX_I.XPT', format='xport')

In [3]:
# Load the second SAS transport (XPORT) table; it is joined to `bmx` on SEQN below.
demo = pd.read_sas('DEMO_I.XPT', format='xport')

In [4]:
# Join the two tables on the shared SEQN key (pandas' default inner join,
# so only SEQN values present in both tables are kept).
merged = pd.merge(bmx, demo, on='SEQN')

In [5]:
# Keep rows whose age in months (RIDAGEMN) is 12 or less, and only the three
# columns needed later: age in months, gender, and weight.
babies = merged.loc[
    merged['RIDAGEMN'].le(12),
    ['RIDAGEMN', 'RIAGENDR', 'BMXWT'],
]

In [6]:
# Rename by explicit mapping instead of positional assignment: each new label is tied
# to its source column, so changing the column selection order elsewhere cannot
# silently mislabel the data. rename() returns a new frame and ignores keys that are
# not present, so re-running this cell is harmless.
babies = babies.rename(columns={'RIDAGEMN': 'age', 'RIAGENDR': 'gender', 'BMXWT': 'weight'})

In [7]:
# Preview the first rows; ending the cell with the bare expression lets Jupyter
# use the DataFrame's rich display instead of plain printed text.
babies.head()

      age  gender  weight
32    2.0     2.0     5.2
63    7.0     2.0     8.2
105  10.0     2.0     7.3
115   5.0     2.0     7.2
121   5.0     1.0     7.5


In [8]:
# Check row count, dtypes and non-null counts before modelling
# (the recorded output shows 399 rows with no missing values in any column).
babies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399 entries, 32 to 9533
Data columns (total 3 columns):
age       399 non-null float64
gender    399 non-null float64
weight    399 non-null float64
dtypes: float64(3)
memory usage: 12.5 KB


## formula notation

In [9]:
# Formula interface: regress weight on age and gender, with gender entering
# as its raw numeric code.
model = smf.ols(formula='weight ~ age + gender', data=babies)

In [10]:
# Fit the model; `results` is reused below for the ANOVA table and for predictions.
results = model.fit()

In [11]:
# Print the regression summary (fit statistics and coefficient table) as plain text.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.707
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     477.0
Date:                Wed, 23 May 2018   Prob (F-statistic):          3.47e-106
Time:                        13:31:01   Log-Likelihood:                -582.83
No. Observations:                 399   AIC:                             1172.
Df Residuals:                     396   BIC:                             1184.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      6.4509      0.188     34.386      0.0

In [14]:
model2 = smf.ols('weight ~ age + C(gender)', data = babies)  # C() tells the formula parser to treat gender as categorical
# With only two gender codes (boys are 1, girls are 2) the fitted model is equivalent to the
# numeric version: R-squared, F-statistic and log-likelihood in the summary below match the first fit.
# The coefficient table is NOT identical, though: the intercept is 5.7181 here versus 6.4509
# before, because the gender term becomes an indicator for the non-reference level instead of
# the raw 1/2 code.
# NOTE(review): presumably treatment coding with gender == 1 as the reference level -- confirm
# against the full coefficient table.

In [15]:
# Fit the categorical-gender model so its summary can be compared with the first fit.
results2 = model2.fit()

In [16]:
# Print the summary of the C(gender) fit as plain text.
print(results2.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.707
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     477.0
Date:                Wed, 23 May 2018   Prob (F-statistic):          3.47e-106
Time:                        13:34:34   Log-Likelihood:                -582.83
No. Observations:                 399   AIC:                             1172.
Df Residuals:                     396   BIC:                             1184.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            5.7181      0.113  

## numpy array notation

In [21]:
# Split the frame into the response vector (y) and the predictor matrix (X)
# for the array-style interface.
y = babies['weight']
X = babies[['age', 'gender']]

In [22]:
# Rebinds X: from here on X carries an extra constant column (reported as 'const'
# in the summary below) in addition to 'age' and 'gender'.
X = sm.add_constant(X)  # need to add a constant for the intercept term

In [23]:
# Array interface: response first, then the design matrix (which already holds
# the constant column).
model3 = sm.OLS(endog=y, exog=X)

In [24]:
# Fit the array-interface model; its summary should match the first formula fit.
results3 = model3.fit()

In [25]:
# Print the summary as plain text; the intercept row is labelled 'const' here
# rather than 'Intercept'.
print(results3.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.707
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     477.0
Date:                Wed, 23 May 2018   Prob (F-statistic):          3.47e-106
Time:                        13:39:52   Log-Likelihood:                -582.83
No. Observations:                 399   AIC:                             1172.
Df Residuals:                     396   BIC:                             1184.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.4509      0.188     34.386      0.0

## the ANOVA table

In [27]:
# ANOVA table for the first formula fit. typ=1 is the default (sequential sums of
# squares), so each term is assessed in the order it appears in the formula.
table = sm.stats.anova_lm(results, typ=1)

In [28]:
# The ANOVA table is a DataFrame; ending the cell with the bare expression lets
# Jupyter use its rich display instead of plain printed text.
table

             df      sum_sq     mean_sq           F         PR(>F)
age         1.0  991.502398  991.502398  905.132967  2.449632e-104
gender      1.0   53.481241   53.481241   48.822508   1.193519e-11
Residual  396.0  433.787039    1.095422         NaN            NaN


## predictions

In [61]:
# One new observation for prediction: a 4 month old boy (gender code 1).
x_new = dict(age=4, gender=1)

In [62]:
# Show the dictionary that will be passed to predict().
print(x_new)

{'age': 4, 'gender': 1}


In [63]:
# Evaluate the first (numeric-gender) formula fit at x_new; as the last expression
# in the cell, the returned Series is displayed as the output.
results.predict(x_new)

0    7.463456
dtype: float64

In [66]:
# Rebind x_new to a 0 month old girl (gender code 2) and predict her weight
# with the first formula fit.
x_new = dict(age=0, gender=2)
results.predict(x_new)

0    4.985379
dtype: float64

In [68]:
# Predict for every row of X. X was passed through sm.add_constant above, so it also
# carries a 'const' column; `results` is the formula-based fit, which presumably looks
# up only the 'age' and 'gender' columns named in its formula -- confirm.
# NOTE(review): this displays the whole prediction Series (pandas truncates the middle).
results.predict(X)  # running the model on the entire X dataframe

32       5.858045
63       8.039710
105      9.348708
115      7.167044
121      7.899789
181     10.954119
197      7.167044
200      8.772454
213      8.476043
214      4.985379
295      4.985379
299      6.154457
311      9.785041
315      8.476043
318      5.858045
330      8.912376
372     10.221374
383      7.899789
437      9.348708
465     10.954119
482      5.858045
486      6.730711
491      9.208787
492      9.208787
499      6.590790
504      9.785041
544      7.603377
549     10.081453
570     10.517786
589      5.421712
          ...    
8749     8.912376
8766     7.603377
8790     6.730711
8793     8.476043
8805     9.645120
8820     7.027123
8857     9.785041
8893    10.954119
8930     7.603377
8935     7.167044
9012     6.154457
9050     7.027123
9056    10.517786
9065     7.603377
9079     7.899789
9086    10.081453
9156     5.718124
9187     7.899789
9229     9.785041
9230     7.463456
9249     9.348708
9259     6.294378
9275     7.899789
9293     7.027123
9344     6