In [1]:
import statsmodels.api as sm

# Fetch the "GSSvocab" data set from the R package "carData". The returned
# object is kept under the name GSSvocab_vocab because a later cell reads its
# .data attribute.
GSSvocab_vocab = sm.datasets.get_rdataset(
    dataname="GSSvocab",
    package="carData",
)

# Print the R documentation text attached to the returned data set object.
print(GSSvocab_vocab.__doc__)

+----------+-----------------+
| GSSvocab | R Documentation |
+----------+-----------------+

Data from the General Social Survey (GSS) from the National Opinion Research Center of the University of Chicago.
-----------------------------------------------------------------------------------------------------------------

Description
~~~~~~~~~~~

This data set illustrates analysis of a multifactor observational study,
with response given by subject's score on a vocabulary test, and factors
for age group, education level, natality status, gender and year of the
survey.

Usage
~~~~~

::

   data("GSSvocab")

Format
~~~~~~

A data frame with 28867 observations on the following 8 variables.

``year``
   a factor with levels ``1978`` ``1982`` ``1984`` ``1987`` ``1988``
   ``1989`` ``1990`` ``1991`` ``1993`` ``1994`` ``1996`` ``1998``
   ``2000`` ``2004`` ``2006`` ``2008`` ``2010`` ``2012`` ``2014``
   ``2016``. Data are included from the GSS for each of these years.

``gender``
   a factor wi

  return dataset_meta["Title"].item()


In [2]:
# Preview the first rows of the GSSvocab data frame (year, gender, nativeBorn,
# ageGroup, educGroup, vocab, age, educ).
GSSvocab_vocab.data.head()

Unnamed: 0,year,gender,nativeBorn,ageGroup,educGroup,vocab,age,educ
1978.1,1978,female,yes,50-59,12 yrs,10.0,52.0,12.0
1978.2,1978,female,yes,60+,<12 yrs,6.0,74.0,9.0
1978.3,1978,male,yes,30-39,<12 yrs,4.0,35.0,10.0
1978.4,1978,female,yes,50-59,12 yrs,9.0,50.0,12.0
1978.5,1978,female,yes,40-49,12 yrs,6.0,41.0,12.0


In [0]:
import statsmodels.api as sm

# Load the stackloss data set through its pandas loader. The cells below read
# .data, .endog, .exog, .endog_name, .exog_name and .names from this object.
data = sm.datasets.stackloss.load_pandas()

In [10]:
# Full stackloss table: the STACKLOSS column together with the AIRFLOW,
# WATERTEMP and ACIDCONC columns.
data.data

Unnamed: 0,STACKLOSS,AIRFLOW,WATERTEMP,ACIDCONC
0,42.0,80.0,27.0,89.0
1,37.0,80.0,27.0,88.0
2,37.0,75.0,25.0,90.0
3,28.0,62.0,24.0,87.0
4,18.0,62.0,22.0,87.0
5,18.0,62.0,23.0,87.0
6,19.0,62.0,24.0,93.0
7,20.0,62.0,24.0,93.0
8,15.0,58.0,23.0,87.0
9,14.0,58.0,18.0,80.0


In [11]:
# First five values of the response series (STACKLOSS).
data.endog.head(5)

0    42.0
1    37.0
2    37.0
3    28.0
4    18.0
Name: STACKLOSS, dtype: float64

In [16]:
# First five rows of the regressor table (AIRFLOW, WATERTEMP, ACIDCONC).
data.exog.head(5)

Unnamed: 0,AIRFLOW,WATERTEMP,ACIDCONC
0,80.0,27.0,89.0
1,80.0,27.0,88.0
2,75.0,25.0,90.0
3,62.0,24.0,87.0
4,62.0,22.0,87.0


In [17]:
# Name of the response (endog) column.
data.endog_name

'STACKLOSS'

In [18]:
# Names of the regressor (exog) columns, as a list.
data.exog_name

['AIRFLOW', 'WATERTEMP', 'ACIDCONC']

In [19]:
# Check the container type of the full table returned by load_pandas().
type(data.data)

pandas.core.frame.DataFrame

In [20]:
# All column names: the response name first, then the regressor names.
data.names

['STACKLOSS', 'AIRFLOW', 'WATERTEMP', 'ACIDCONC']

In [21]:
# Regressor table only (AIRFLOW, WATERTEMP, ACIDCONC), without the response.
data.exog

Unnamed: 0,AIRFLOW,WATERTEMP,ACIDCONC
0,80.0,27.0,89.0
1,80.0,27.0,88.0
2,75.0,25.0,90.0
3,62.0,24.0,87.0
4,62.0,22.0,87.0
5,62.0,23.0,87.0
6,62.0,24.0,93.0
7,62.0,24.0,93.0
8,58.0,23.0,87.0
9,58.0,18.0,80.0


In [22]:
# Response series only (STACKLOSS), float64.
data.endog

0     42.0
1     37.0
2     37.0
3     28.0
4     18.0
5     18.0
6     19.0
7     20.0
8     15.0
9     14.0
10    14.0
11    13.0
12    11.0
13    12.0
14     8.0
15     7.0
16     8.0
17     8.0
18     9.0
19    15.0
20    15.0
Name: STACKLOSS, dtype: float64

In [23]:
# Response (STACKLOSS) and regressors (AIRFLOW, WATERTEMP, ACIDCONC).
y = data.endog
x = data.exog

# Ordinary least squares of y on x. x is passed as-is, with no constant column
# added, so only the three regressor coefficients are estimated and the
# summary reports uncentered R-squared.
# NOTE(review): confirm that a no-intercept fit is intended; wrapping x in
# sm.add_constant(...) would add an intercept term.
res = sm.OLS(endog=y, exog=x).fit()

# Estimated coefficients, indexed by regressor name.
res.params

AIRFLOW      0.796765
WATERTEMP    1.111422
ACIDCONC    -0.624993
dtype: float64

In [24]:
# Full regression report for the fit above: goodness-of-fit statistics, the
# coefficient table with standard errors and confidence intervals, and
# residual diagnostics.
res.summary()

0,1,2,3
Dep. Variable:,STACKLOSS,R-squared (uncentered):,0.965
Model:,OLS,Adj. R-squared (uncentered):,0.959
Method:,Least Squares,F-statistic:,165.9
Date:,"Thu, 12 Dec 2019",Prob (F-statistic):,2.66e-13
Time:,05:36:42,Log-Likelihood:,-57.625
No. Observations:,21,AIC:,121.2
Df Residuals:,18,BIC:,124.4
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AIRFLOW,0.7968,0.166,4.793,0.000,0.448,1.146
WATERTEMP,1.1114,0.456,2.437,0.025,0.153,2.069
ACIDCONC,-0.6250,0.085,-7.378,0.000,-0.803,-0.447

0,1,2,3
Omnibus:,0.555,Durbin-Watson:,1.52
Prob(Omnibus):,0.758,Jarque-Bera (JB):,0.614
Skew:,-0.136,Prob(JB):,0.736
Kurtosis:,2.207,Cond. No.,57.5


In [28]:
# First six attribute names of the stackloss dataset module; these are its
# upper-case metadata constants (COPYRIGHT, DESCRLONG, DESCRSHORT, ...).
dir(sm.datasets.stackloss)[:6]

['COPYRIGHT', 'DESCRLONG', 'DESCRSHORT', 'NOTE', 'SOURCE', 'TITLE']