# Statstical Modeling with statsmodels


In [8]:
import numpy as np
import pandas as pd
from patsy import dmatrices
import statsmodels.api as sm

df = pd.read_csv('Guerry.csv')

vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']
df = df[vars]
df[-5:]

Unnamed: 0,Department,Lottery,Literacy,Wealth,Region
81,Vienne,40,25,68,W
82,Haute-Vienne,55,13,67,C
83,Vosges,14,62,82,E
84,Yonne,51,47,30,C
85,Corse,83,49,37,


In [3]:
df = df.dropna()
df[-5:]

Unnamed: 0,Department,Lottery,Literacy,Wealth,Region
80,Vendee,68,28,56,W
81,Vienne,40,25,68,W
82,Haute-Vienne,55,13,67,C
83,Vosges,14,62,82,E
84,Yonne,51,47,30,C


In [4]:
y, X = dmatrices('Lottery ~ Lottery + Wealth + Region', data=df, return_type='dataframe')
print y[:3]
print X[:3]

   Lottery
0       41
1       38
2       66

[3 rows x 1 columns]
   Intercept  Region[T.E]  Region[T.N]  Region[T.S]  Region[T.W]  Lottery  \
0          1            1            0            0            0       41   
1          1            0            1            0            0       38   
2          1            0            0            0            0       66   

   Wealth  
0      73  
1      22  
2      61  

[3 rows x 7 columns]


In [5]:
mod = sm.OLS(y, X)
res = mod.fit()
print res.summary()

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 7.607e+31
Date:                Sat, 05 Apr 2014   Prob (F-statistic):               0.00
Time:                        23:22:01   Log-Likelihood:                 2618.1
No. Observations:                  85   AIC:                            -5222.
Df Residuals:                      78   BIC:                            -5205.
Df Model:                           6                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept    1.421e-14   3.96e-15      3.585      0.001      6.32e-15  2.21e-14
Region[T.E] -3.553e-15   3.85e-15     -0.923     

In [6]:
print res.params

Intercept      1.421085e-14
Region[T.E]   -3.552714e-15
Region[T.N]   -8.881784e-16
Region[T.S]   -3.552714e-15
Region[T.W]    1.509903e-14
Lottery        1.000000e+00
Wealth        -3.365364e-16
dtype: float64


## Design of Experiments
Montgomery, D.C., Design and Analysis of Experiments, 5ed., John Wiley & Sons, 2001



In [32]:
comp_str = np.array([[16.85, 17.50], [16.40, 17.63], [17.21, 18.25], [16.35, 18.00], [16.52, 17.86], [17.04, 17.75], [16.96, 18.22], [17.15, 17.90], [16.59, 17.96], [16.57, 18.15]])
print comp_str
print '        Mean:', np.mean(comp_str, axis=0)
print '    Variance:', np.var(comp_str, axis=0, ddof=1)
print '   Std. Dev.:', np.std(comp_str, axis=0, ddof=1)
print 'Corr. Coeff.:', np.corrcoef(comp_str, rowvar=0)
print comp_str.shape

[[ 16.85  17.5 ]
 [ 16.4   17.63]
 [ 17.21  18.25]
 [ 16.35  18.  ]
 [ 16.52  17.86]
 [ 17.04  17.75]
 [ 16.96  18.22]
 [ 17.15  17.9 ]
 [ 16.59  17.96]
 [ 16.57  18.15]]
        Mean: [ 16.764  17.922]
    Variance: [ 0.10013778  0.06146222]
   Std. Dev.: [ 0.31644554  0.24791576]
Corr. Coeff.: [[ 1.          0.21346431]
 [ 0.21346431  1.        ]]
(10L, 2L)


In [31]:
from scipy.stats import ttest_ind

t, prob = ttest_ind(comp_str[:,0], comp_str[:,1])
print t, prob

-9.04463154667 4.09131615959e-08
