## StatsModels
#### statistical methods, tests, regression, time-series analysis, multivariate statistics

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [4]:
# Seed NumPy's global RNG so the simulated sample -- and the summary
# statistics printed in the following cells -- come out the same on every
# Restart & Run All.
np.random.seed(42)

# 1000 draws from a normal distribution with mean 0 and standard deviation 1.
normRandVars = np.random.normal(0, 1, 1000)

# Wrap the sample in DescrStatsW; the cells below read its descriptive
# statistics (mean, std, var) from this object.
x = sm.stats.DescrStatsW(normRandVars)

# Printing the wrapper only shows its default object repr; the statistics
# themselves are accessed through attributes.
print(x)

<statsmodels.stats.weightstats.DescrStatsW object at 0x1c19992f40>


In [5]:
# Mean of the simulated sample, as reported by the DescrStatsW wrapper.
sample_mean = x.mean
print(sample_mean)

-0.052865918677741694


In [6]:
# Standard deviation of the simulated sample.
# NOTE(review): DescrStatsW was built without a ddof argument, so this
# presumably uses its default degrees-of-freedom correction -- confirm.
sample_std = x.std
print(sample_std)

0.9948278918903624


In [7]:
# Variance of the simulated sample (the square of the std printed above).
sample_var = x.var
print(sample_var)

0.9896825344830225


### Confidence Intervals

In [9]:
## Generate a confidence interval for a population proportion ##

# Significance level: alpha = 0.05 yields a 95% interval. This replaces the
# hand-typed critical value (1.96) that was assigned but never passed to
# statsmodels; proportion_confint derives the critical value from alpha.
alpha = 0.05

# Observed sample proportion
p = .85

# Sample size (number of observations)
n = 659

# n * p is the number of "successes"; the interval is returned as a
# (lower, upper) tuple and shown as the cell's output.
sm.stats.proportion_confint(n * p, n, alpha=alpha)

(0.8227378265796143, 0.8772621734203857)

In [10]:
# Cartwheel dataset hosted with the UMStatsPy course materials on GitHub.
# (pandas is imported in the notebook's top import cell.)
CARTWHEEL_URL = "https://raw.githubusercontent.com/UMstatspy/UMStatsPy/master/Course_1/Cartwheeldata.csv"

df = pd.read_csv(CARTWHEEL_URL)

# Preview the first rows to confirm the columns loaded as expected.
print(df.head())

   ID  Age Gender  GenderGroup Glasses  GlassesGroup  Height  Wingspan  \
0   1   56      F            1       Y             1    62.0      61.0   
1   2   26      F            1       Y             1    62.0      60.0   
2   3   33      F            1       Y             1    66.0      64.0   
3   4   39      F            1       N             0    64.0      63.0   
4   5   27      M            2       N             0    73.0      75.0   

   CWDistance Complete  CompleteGroup  Score  
0          79        Y              1      7  
1          70        Y              1      8  
2          85        Y              1      7  
3          87        Y              1     10  
4          72        N              0      4  


In [11]:
## Confidence interval for the population mean of CWDistance ##

# Wrap the column once, then ask for the z-based interval for the mean at
# the method's default confidence level; the (lower, upper) tuple is the
# cell's output.
cw_stats = sm.stats.DescrStatsW(df['CWDistance'])
cw_stats.zconfint_mean()

(76.57715593233026, 88.38284406766975)

### Hypothesis Testing

In [12]:
## One-sample hypothesis test for a population proportion ##

# Sample size
n = 1018

# Population proportion claimed by the null hypothesis
pnull = .52

# Proportion observed in the sample
phat = .56

# Number of "successes" implied by the observed proportion.
successes = phat * n

# z-test of the observed proportion against pnull; the result is a
# (test statistic, p-value) tuple.
# NOTE(review): with the default prop_var=False, statsmodels estimates the
# standard error from the sample proportion rather than from pnull. Pass
# prop_var=pnull if the null-based (textbook) statistic is wanted.
sm.stats.proportions_ztest(successes, n, value=pnull)

(2.571067795759113, 0.010138547731721065)

In [13]:
# Hypothesis test for a population mean:
# H0: mean CWDistance == 80  vs.  H1: mean CWDistance > 80 ('larger').
cw_distance = df['CWDistance']
sm.stats.ztest(cw_distance, value=80, alternative='larger')

(0.8234523266982029, 0.20512540845395266)

### OLS, GLM (Generalized Linear Models), GEE (Generalized Estimating Equations), MIXEDLM (Mixed Linear Models / Multilevel Models) - the primary statsmodels model classes

#### OLS - method for estimating the unknown parameters in a linear regression model. Used when the target variable is continuous.

In [14]:
da = pd.read_csv('data/nhanes_2015_2016.csv')

# Drop unused columns, drop rows with any missing values.
# The list is called `keep_cols` (not `vars`) so the builtin vars() is not
# shadowed for the rest of the kernel session.
keep_cols = ["BPXSY1", "RIDAGEYR", "RIAGENDR", "RIDRETH1", "DMDEDUC2", "BMXBMI",
             "SMQ020", "SDMVSTRA", "SDMVPSU"]
da = da[keep_cols].dropna()

# Readable labels for the gender codes, so the formula treats the column
# as a categorical predictor.
da['RIAGENDRx'] = da.RIAGENDR.replace({1: 'Male', 2: 'Female'})

# BPXSY1 - target var; continuous var that represents BP
# RIDAGEYR, RIAGENDRx - predictors
model = sm.OLS.from_formula('BPXSY1 ~ RIDAGEYR + RIAGENDRx', data=da)
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 BPXSY1   R-squared:                       0.215
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     697.4
Date:                Sat, 14 Mar 2020   Prob (F-statistic):          1.87e-268
Time:                        12:01:09   Log-Likelihood:                -21505.
No. Observations:                5102   AIC:                         4.302e+04
Df Residuals:                    5099   BIC:                         4.304e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept           100.6305      0.71

#### GLM - Generalized Linear Model. Used here for logistic regression, which applies when the target variable is a binary outcome, i.e. a classification into two groups that can be denoted group 0 and group 1.

In [15]:
# Recode SMQ020 into the outcome column 'smq': code 2 becomes 0, codes 7
# and 9 become missing, and every other code is left as it is.
smq_recode = {2: 0, 7: np.nan, 9: np.nan}
da['smq'] = da['SMQ020'].replace(smq_recode)

# A GLM with the Binomial family models the binary outcome; the only
# predictor is the labelled gender column.
binomial_family = sm.families.Binomial()
model = sm.GLM.from_formula('smq ~ RIAGENDRx', data=da, family=binomial_family)
res = model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    smq   No. Observations:                 5094
Model:                            GLM   Df Residuals:                     5092
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3350.6
Date:                Sat, 14 Mar 2020   Deviance:                       6701.2
Time:                        15:54:04   Pearson chi2:                 5.09e+03
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.7547      0.04

#### Generalized Estimating Equations (GEE) - estimate generalized linear models for panel, cluster, or repeated-measures data when the observations are possibly correlated within a cluster but uncorrelated across clusters

In [20]:
# Build one cluster id per row by combining SDMVSTRA (scaled by 10) with
# SDMVPSU.
da['group'] = da.SDMVPSU + 10 * da.SDMVSTRA

# Intercept-only GEE for BPXSY1 with an exchangeable working correlation,
# clustered on the id built above.
exchangeable = sm.cov_struct.Exchangeable()
model = sm.GEE.from_formula('BPXSY1 ~ 1', groups='group', data=da,
                            cov_struct=exchangeable)
res = model.fit()

# Report the estimated within-cluster correlation.
print(res.cov_struct.summary())

The correlation between two observations in the same cluster is 0.030


#### Multilevel Models - Similarly to GEEs, we use multilevel models when there is potential for outcomes to be grouped together, which is not uncommon when using various sampling methods to collect data.

In [21]:
# Fit an intercept-only GEE with an exchangeable working correlation for
# each variable and print its estimated within-cluster correlation.
clustered_vars = ["BPXSY1", "RIDAGEYR", "BMXBMI", "smq", "SDMVSTRA"]
for column in clustered_vars:
    formula = column + '~ 1'
    # A fresh covariance structure is created for every model, as before.
    model = sm.GEE.from_formula(formula, groups='group', data=da,
                                cov_struct=sm.cov_struct.Exchangeable())
    result = model.fit()
    print(column, result.cov_struct.summary())

BPXSY1 The correlation between two observations in the same cluster is 0.030
RIDAGEYR The correlation between two observations in the same cluster is 0.035
BMXBMI The correlation between two observations in the same cluster is 0.039
smq The correlation between two observations in the same cluster is 0.026
SDMVSTRA The correlation between two observations in the same cluster is 0.959


In [None]:
# Peek at the cluster ids built for the GEE fits. The 'group' column lives
# on the NHANES frame `da`; the Cartwheel frame `df` has no 'gro' column, so
# the original lookup raised KeyError. head() keeps the output short.
print(da['group'].head())