"""
Weighted Least Squares
example is extended to look at the meaning of rsquared in WLS,
at outliers, compares with RLM and a short bootstrap
"""
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
data = sm.datasets.ccard.load()
data.exog = sm.add_constant(data.exog, prepend=False)
ols_fit = sm.OLS(data.endog, data.exog).fit()
# perhaps the residuals from this fit depend on the square of income
incomesq = data.exog[:,2]
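# Quick numeric sanity check (added; assumes column 1 of exog is income
# itself): if column 2 is income squared, as a note further below also
# suspects, the identity below should hold.
print(np.allclose(data.exog[:, 2], data.exog[:, 1]**2))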
plt.scatter(incomesq, ols_fit.resid)
#@savefig wls_resid_check.png
plt.grid()
# If we think that the variance is proportional to income**2,
# we would want to weight the regression by 1/income**2.
# The weights argument in WLS scales the data by the square root of the
# weights, here 1/income; since income enters the equation, income/income
# becomes the constant, so we would want to perform
# this type of regression without an explicit constant in the design.
#..data.exog = data.exog[:,:-1]
wls_fit = sm.WLS(data.endog, data.exog[:,:-1], weights=1/incomesq).fit()
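# A minimal cross-check of what the weights do (added, not in the original
# example): WLS with weights w is equivalent to OLS on the data scaled by
# sqrt(w), so the two parameter vectors should agree up to rounding.
w = 1.0/incomesq
check_fit = sm.OLS(np.sqrt(w)*data.endog,
                   np.sqrt(w)[:, None]*data.exog[:, :-1]).fit()
print(np.max(np.abs(check_fit.params - wls_fit.params)))  # should be ~0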
# Dropping the constant, however, leads to difficulties in interpreting the
# post-estimation statistics. Statsmodels does not yet handle this
# elegantly, but the following may be more appropriate:
# explained sum of squares
ess = wls_fit.uncentered_tss - wls_fit.ssr
# rsquared
rsquared = ess/wls_fit.uncentered_tss
# mean squared error of the model
mse_model = ess/(wls_fit.df_model + 1) # add back the dof of the constant
# f statistic
fvalue = mse_model/wls_fit.mse_resid
# adjusted r-squared
rsquared_adj = 1 - (wls_fit.nobs)/(wls_fit.df_resid) * (1 - rsquared)
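# For completeness, an added sketch of a p-value for the recomputed F
# statistic, using scipy.stats.f with the same degrees of freedom as above.
from scipy import stats
f_pvalue = stats.f.sf(fvalue, wls_fit.df_model + 1, wls_fit.df_resid)
print('F statistic %.4f with p-value %.4f' % (fvalue, f_pvalue))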
#Trying to figure out what's going on in this example
#----------------------------------------------------
#JP: I need to look at this again. Even if I exclude the weight variable
# from the regressors and keep the constant in, the reported rsquared
# stays small. Below, I also compare using the square or the square root
# of the weight variable.
# TODO: need to add a 45 degree line to the graphs
wls_fit3 = sm.WLS(data.endog, data.exog[:,(0,1,3,4)], weights=1/incomesq).fit()
print(wls_fit3.summary())
print('corrected rsquared',
      (wls_fit3.uncentered_tss - wls_fit3.ssr)/wls_fit3.uncentered_tss)
plt.figure();
plt.title('WLS dropping heteroscedasticity variable from regressors');
plt.plot(data.endog, wls_fit3.fittedvalues, 'o');
plt.xlim([0,2000]);
#@savefig wls_drop_het.png
plt.ylim([0,2000]);
print('raw correlation of endog and fittedvalues')
print(np.corrcoef(data.endog, wls_fit.fittedvalues))
print('raw correlation coefficient of endog and fittedvalues squared')
print(np.corrcoef(data.endog, wls_fit.fittedvalues)[0,1]**2)
# compare with robust regression;
# the heteroscedasticity correction downweights the outliers
rlm_fit = sm.RLM(data.endog, data.exog).fit()
plt.figure();
plt.title('using robust for comparison');
plt.plot(data.endog, rlm_fit.fittedvalues, 'o');
plt.xlim([0,2000]);
#@savefig wls_robust_compare.png
plt.ylim([0,2000]);
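# RLM downweights observations with large residuals; the final IRLS weights
# (rlm_fit.weights) show which observations are treated as outliers. This
# is an added inspection, assuming the default Huber norm.
print('observations with the smallest robust weights:',
      np.argsort(rlm_fit.weights)[:5])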
#What is going on? A more systematic look at the data
#----------------------------------------------------
# two helper functions
def getrsq(fitresult):
    '''calculates rsquared, residual, total and explained sums of squares

    Parameters
    ----------
    fitresult : instance of Regression Result class, or tuple of (resid, endog) arrays
        regression residuals and endogenous variable

    Returns
    -------
    rsquared
    residual sum of squares
    (centered) total sum of squares
    explained sum of squares (for centered)
    '''
    if hasattr(fitresult, 'resid') and hasattr(fitresult, 'model'):
        resid = fitresult.resid
        endog = fitresult.model.endog
        nobs = fitresult.nobs
    else:
        resid = fitresult[0]
        endog = fitresult[1]
        nobs = resid.shape[0]

    rss = np.dot(resid, resid)
    tss = np.var(endog)*nobs
    return 1-rss/tss, rss, tss, tss-rss
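# Quick check of the helper (added): for OLS with a constant, the centered
# rsquared from getrsq should match the rsquared attribute of the results.
print(getrsq(ols_fit)[0], ols_fit.rsquared)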
def index_trim_outlier(resid, k):
    '''returns indices to residual array with k outliers removed

    Parameters
    ----------
    resid : array_like, 1d
        data vector, usually residuals of a regression
    k : int
        number of outliers to remove

    Returns
    -------
    trimmed_index : array, 1d
        index array with k outliers removed
    outlier_index : array, 1d
        index array of k outliers

    Notes
    -----
    Outliers are defined as the k observations with the largest
    absolute values.
    '''
    sort_index = np.argsort(np.abs(resid))
    # index of non-outlier
    trimmed_index = np.sort(sort_index[:-k])
    outlier_index = np.sort(sort_index[-k:])
    return trimmed_index, outlier_index
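# Tiny added demonstration on a made-up residual vector: the two largest
# absolute values (positions 1 and 3) are flagged as outliers.
demo_keep, demo_outl = index_trim_outlier(np.array([0.1, -5.0, 0.2, 3.0, -0.3]), 2)
print(demo_keep, demo_outl)   # -> [0 2 4] [1 3]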
#Comparing estimation results for ols, rlm and wls with and without outliers
#---------------------------------------------------------------------------
#ols_test_fit = sm.OLS(data.endog, data.exog).fit()
olskeep, olsoutl = index_trim_outlier(ols_fit.resid, 2)
print('ols outliers', olsoutl, ols_fit.resid[olsoutl])
ols_fit_rm2 = sm.OLS(data.endog[olskeep], data.exog[olskeep,:]).fit()
rlm_fit_rm2 = sm.RLM(data.endog[olskeep], data.exog[olskeep,:]).fit()
#weights = 1/incomesq
results = [ols_fit, ols_fit_rm2, rlm_fit, rlm_fit_rm2]
#Note: I think incomesq is already square
for weights in [1/incomesq, 1/incomesq**2, np.sqrt(incomesq)]:
    print('\nComparison OLS and WLS with and without outliers')
    wls_fit0 = sm.WLS(data.endog, data.exog, weights=weights).fit()
    wls_fit_rm2 = sm.WLS(data.endog[olskeep], data.exog[olskeep,:],
                         weights=weights[olskeep]).fit()
    # note: trim on the residuals of the current WLS fit, so that the
    # comparison with the OLS outliers below is meaningful
    wlskeep, wlsoutl = index_trim_outlier(wls_fit0.resid, 2)
    print('2 outlier candidates and residuals')
    print(wlsoutl, wls_fit0.resid[wlsoutl])
    # redundant because ols and wls outliers are the same:
    ##wls_fit_rm2_ = sm.WLS(data.endog[wlskeep], data.exog[wlskeep,:],
    ##                      weights=1/incomesq[wlskeep]).fit()
    print('outliers ols, wls:', olsoutl, wlsoutl)
    print('rsquared')
    print('ols vs ols rm2', ols_fit.rsquared, ols_fit_rm2.rsquared)
    print('wls vs wls rm2', wls_fit0.rsquared, wls_fit_rm2.rsquared)  #, wls_fit_rm2_.rsquared
    print('compare R2_resid versus R2_wresid')
    print('ols minus 2', getrsq(ols_fit_rm2)[0],
          getrsq((ols_fit_rm2.wresid, ols_fit_rm2.model.wendog))[0])
    print('wls        ', getrsq(wls_fit)[0],
          getrsq((wls_fit.wresid, wls_fit.model.wendog))[0])
    # next is the same as wls_fit_rm2.rsquared, for cross checking
    print('wls minus 2', getrsq(wls_fit_rm2)[0],
          getrsq((wls_fit_rm2.wresid, wls_fit_rm2.model.wendog))[0])
    #print(getrsq(wls_fit_rm2_)[0],
    #      getrsq((wls_fit_rm2_.wresid, wls_fit_rm2_.model.wendog))[0])
    results.extend([wls_fit0, wls_fit_rm2])
print(' ols ols_rm2 rlm rlm_rm2 wls (lin) wls_rm2 (lin) wls (squ) wls_rm2 (squ) wls (sqrt) wls_rm2 (sqrt)')
print('Parameter estimates')
print(np.column_stack([r.params for r in results]))
print('R2 original data, next line R2 weighted data')
print(np.column_stack([getattr(r, 'rsquared', None) for r in results]))
print('Standard errors')
print(np.column_stack([getattr(r, 'bse', None) for r in results]))
print('Heteroscedasticity robust standard errors (with ols)')
print('with outliers')
print(np.column_stack([getattr(ols_fit, se, None)
                       for se in ['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se']]))
#..'''
#..
#.. ols ols_rm2 rlm rlm_rm2 wls (lin) wls_rm2 (lin) wls (squ) wls_rm2 (squ) wls (sqrt) wls_rm2 (sqrt)
#..Parameter estimates
#..[[ -3.08181404 -5.06103843 -4.98510966 -5.34410309 -2.69418516 -3.1305703 -1.43815462 -1.58893054 -3.57074829 -6.80053364]
#.. [ 234.34702702 115.08753715 129.85391456 109.01433492 158.42697752 128.38182357 60.95113284 100.25000841 254.82166855 103.75834726]
#.. [ -14.99684418 -5.77558429 -6.46204829 -4.77409191 -7.24928987 -7.41228893 6.84943071 -3.34972494 -16.40524256 -4.5924465 ]
#.. [ 27.94090839 85.46566835 89.91389709 95.85086459 60.44877369 79.7759146 55.9884469 60.97199734 -3.8085159 84.69170048]
#.. [-237.1465136 39.51639838 -15.50014814 31.39771833 -114.10886935 -40.04207242 -6.41976501 -38.83583228 -260.72084271 117.20540179]]
#..
#..R2 original data, next line R2 weighted data
#..[[ 0.24357792 0.31745994 0.19220308 0.30527648 0.22861236 0.3112333 0.06573949 0.29366904 0.24114325 0.31218669]]
#..[[ 0.24357791 0.31745994 None None 0.05936888 0.0679071 0.06661848 0.12769654 0.35326686 0.54681225]]
#..
#..-> R2 with weighted data is jumping all over
#..
#..standard errors
#..[[ 5.51471653 3.31028758 2.61580069 2.39537089 3.80730631 2.90027255 2.71141739 2.46959477 6.37593755 3.39477842]
#.. [ 80.36595035 49.35949263 38.12005692 35.71722666 76.39115431 58.35231328 87.18452039 80.30086861 86.99568216 47.58202096]
#.. [ 7.46933695 4.55366113 3.54293763 3.29509357 9.72433732 7.41259156 15.15205888 14.10674821 7.18302629 3.91640711]
#.. [ 82.92232357 50.54681754 39.33262384 36.57639175 58.55088753 44.82218676 43.11017757 39.31097542 96.4077482 52.57314209]
#.. [ 199.35166485 122.1287718 94.55866295 88.3741058 139.68749646 106.89445525 115.79258539 105.99258363 239.38105863 130.32619908]]
#..
#..robust standard errors (with ols)
#..with outliers
#.. HC0_se HC1_se HC2_se HC3_se
#..[[ 3.30166123 3.42264107 3.4477148 3.60462409]
#.. [ 88.86635165 92.12260235 92.08368378 95.48159869]
#.. [ 6.94456348 7.19902694 7.19953754 7.47634779]
#.. [ 92.18777672 95.56573144 95.67211143 99.31427277]
#.. [ 212.9905298 220.79495237 221.08892661 229.57434782]]
#..
#..removing 2 outliers
#..[[ 2.57840843 2.67574088 2.68958007 2.80968452]
#.. [ 36.21720995 37.58437497 37.69555106 39.51362437]
#.. [ 3.1156149 3.23322638 3.27353882 3.49104794]
#.. [ 50.09789409 51.98904166 51.89530067 53.79478834]
#.. [ 94.27094886 97.82958699 98.25588281 102.60375381]]
#..
#..
#..'''
# a quick bootstrap analysis
# --------------------------
#
#(I didn't check whether this is fully correct statistically)
#**With OLS on full sample**
nobs, nvar = data.exog.shape
niter = 2000
bootres = np.zeros((niter, nvar*2))
for it in range(niter):
    rind = np.random.randint(nobs, size=nobs)
    endog = data.endog[rind]
    exog = data.exog[rind,:]
    res = sm.OLS(endog, exog).fit()
    bootres[it, :nvar] = res.params
    bootres[it, nvar:] = res.bse
np.set_printoptions(linewidth=200)
print('Bootstrap Results of parameters and parameter standard deviation OLS')
print('Parameter estimates')
print('median', np.median(bootres[:, :5], 0))
print('mean  ', np.mean(bootres[:, :5], 0))
print('std   ', np.std(bootres[:, :5], 0))
print('Standard deviation of parameter estimates')
print('median', np.median(bootres[:, 5:], 0))
print('mean  ', np.mean(bootres[:, 5:], 0))
print('std   ', np.std(bootres[:, 5:], 0))
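# Bootstrap percentile confidence intervals are a natural by-product (an
# added sketch): 2.5% and 97.5% quantiles of the bootstrap estimates.
print('95% bootstrap percentile intervals for the parameters')
print(np.percentile(bootres[:, :nvar], [2.5, 97.5], axis=0))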
plt.figure()
for i in range(4):
    plt.subplot(2, 2, i+1)
    plt.hist(bootres[:, i], 50)
    plt.title('var%d' % i)
#@savefig wls_bootstrap.png
plt.figtext(0.5, 0.935, 'OLS Bootstrap',
ha='center', color='black', weight='bold', size='large')
#**With WLS on sample with outliers removed**
data_endog = data.endog[olskeep]
data_exog = data.exog[olskeep,:]
incomesq_rm2 = incomesq[olskeep]
nobs, nvar = data_exog.shape
niter = 500 # a bit slow
bootreswls = np.zeros((niter, nvar*2))
for it in range(niter):
    rind = np.random.randint(nobs, size=nobs)
    endog = data_endog[rind]
    exog = data_exog[rind,:]
    # resample the weight variable with the same indices, using the
    # outlier-trimmed incomesq_rm2 (the original indexed the full incomesq
    # with a 2-d index, which raises an error)
    res = sm.WLS(endog, exog, weights=1/incomesq_rm2[rind]).fit()
    bootreswls[it, :nvar] = res.params
    bootreswls[it, nvar:] = res.bse
print('Bootstrap Results of parameters and parameter standard deviation',
      'WLS removed 2 outliers from sample')
print('Parameter estimates')
print('median', np.median(bootreswls[:, :5], 0))
print('mean  ', np.mean(bootreswls[:, :5], 0))
print('std   ', np.std(bootreswls[:, :5], 0))
print('Standard deviation of parameter estimates')
print('median', np.median(bootreswls[:, 5:], 0))
print('mean  ', np.mean(bootreswls[:, 5:], 0))
print('std   ', np.std(bootreswls[:, 5:], 0))
plt.figure()
for i in range(4):
    plt.subplot(2, 2, i+1)
    plt.hist(bootreswls[:, i], 50)
    plt.title('var%d' % i)
#@savefig wls_bootstrap_rm2.png
plt.figtext(0.5, 0.935, 'WLS rm2 Bootstrap',
ha='center', color='black', weight='bold', size='large')
#..plt.show()
#..plt.close('all')
#::
#
# The following results are random draws, not fixed by a seed
#
# Bootstrap Results of parameters and parameter standard deviation
# OLS
#
# Parameter estimates
# median [ -3.26216383 228.52546429 -14.57239967 34.27155426 -227.02816597]
# mean [ -2.89855173 234.37139359 -14.98726881 27.96375666 -243.18361746]
# std [ 3.78704907 97.35797802 9.16316538 94.65031973 221.79444244]
#
# Standard deviation of parameter estimates
# median [ 5.44701033 81.96921398 7.58642431 80.64906783 200.19167735]
# mean [ 5.44840542 86.02554883 8.56750041 80.41864084 201.81196849]
# std [ 1.43425083 29.74806562 4.22063268 19.14973277 55.34848348]
#
# Bootstrap Results of parameters and parameter standard deviation
# WLS removed 2 outliers from sample
#
# Parameter estimates
# median [ -3.95876112 137.10419042 -9.29131131 88.40265447 -44.21091869]
# mean [ -3.67485724 135.42681207 -8.7499235 89.74703443 -46.38622848]
# std [ 2.96908679 56.36648967 7.03870751 48.51201918 106.92466097]
#
# Standard deviation of parameter estimates
# median [ 2.89349748 59.19454402 6.70583332 45.40987953 119.05241283]
# mean [ 2.97600894 60.14540249 6.92102065 45.66077486 121.35519673]
# std [ 0.55378808 11.77831934 1.69289179 7.4911526 23.72821085]
#
#
#
#Conclusion: problem with outliers and possibly heteroscedasticity
#-----------------------------------------------------------------
#
#In the bootstrap results:
#
#* bse in OLS underestimates the standard deviation of the parameters,
#  compared to the standard deviation in the bootstrap (see the numeric
#  check after this list)
#* OLS heteroscedasticity corrected standard errors for the original
#  data (above) are close to the bootstrap std
#* using WLS with 2 outliers removed gives a relatively good match between
#  the mean or median bse and the std of the parameter estimates in the
#  bootstrap
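# A minimal numeric check of the first bullet point (added; uses the bootres
# array from the OLS bootstrap above): ratios below 1 mean that the OLS bse
# understate the bootstrap spread of the parameter estimates.
print(ols_fit.bse / np.std(bootres[:, :nvar], 0))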
#
#We could also include rsquared in the bootstrap, and do the same for RLM.
#The problems could also mean that the linearity assumption is violated,
#e.g. try a non-linear transformation of the exog variables that is still
#linear in the parameters (a small sketch follows).
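# An added illustration of "non-linear in the variables, linear in the
# parameters", with a hypothetical alternative design: replace income**2
# by log(income) as a regressor.
exog_log = np.column_stack([data.exog[:, (0, 1, 3)],
                            np.log(data.exog[:, 1])])
exog_log = sm.add_constant(exog_log, prepend=False)
ols_log_fit = sm.OLS(data.endog, exog_log).fit()
print('rsquared with log(income) instead of income**2:', ols_log_fit.rsquared)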
#
#
#For statsmodels:
#
# * In this case, rsquared for the original data looks less random/arbitrary.
# * Don't change the definition of rsquared from centered tss to uncentered
#   tss when calculating rsquared in WLS if the original exog contains
#   a constant. The increase in rsquared because of a change in definition
#   would be very misleading.
# * Whether or not there is a constant in the transformed exog, wexog,
#   might also affect the degrees of freedom calculation, but I haven't
#   checked this. I would guess that df_model should stay the same,
#   but this needs to be verified with a textbook.
# * df_model has to be adjusted if the original data does not have a
#   constant, e.g. when regressing an endog on a single exog variable
#   without a constant. This case might also require a redefinition of
#   the rsquared and f statistic for the regression anova to use the
#   uncentered tss.
#   This can be done through a keyword parameter to model.__init__ or
#   through autodetection with hasconst = (exog.var(0)<1e-10).any().
#   I'm not sure about fixed effects with a full dummy set but
#   without a constant; in this case the autodetection wouldn't work this
#   way (see the small illustration after these notes). Also, I'm not sure
#   whether a ddof keyword parameter could also handle the hasconst case.
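# An added toy illustration of the dummy-set caveat above: a full dummy set
# has no constant column, so the column-variance autodetection misses the
# implicit constant (the dummy columns sum to a constant).
d = np.array([[1., 0.], [1., 0.], [0., 1.], [0., 1.]])
print((d.var(0) < 1e-10).any())   # False, although d.sum(1) is constant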