-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] Add factor analysis #3294
Changes from 1 commit
08cb2d1
eab1b6b
1df21ff
fdc6695
1648c00
0c35793
7327de7
efbe717
dd72bac
224aae5
9689409
c7c4503
1a09d8c
bd4aae8
471abd2
e78e4f0
82411e5
596a5c2
a0275bb
b4ecb6b
5d8d555
e5b4f00
626a6b0
eedb869
4fa1309
47ef9d2
653a43f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,18 +48,26 @@ class Factor(Model): | |
endog_names: str | ||
Names of endogenous variables. | ||
If specified, it will be used instead of the column names in endog | ||
n_obs: int | ||
The number of observations. To be used together with `corr` | ||
Should be equal to the number of rows in `endog`. | ||
|
||
""" | ||
def __init__(self, endog, n_factor, corr=None, method='pa', smc=True, | ||
missing='drop', endog_names=None): | ||
# CHeck validity of n_factor | ||
missing='drop', endog_names=None, n_obs=None): | ||
if endog is not None: | ||
k_endog = endog.shape[1] | ||
elif corr is not None: | ||
k_endog = corr.shape[0] | ||
|
||
# Check validity of n_factor | ||
if n_factor <= 0: | ||
raise ValueError('n_factor must be larger than 0! %d < 0' % | ||
(n_factor)) | ||
if endog is not None and n_factor > endog.shape[1]: | ||
if endog is not None and n_factor > k_endog: | ||
raise ValueError('n_factor must be smaller or equal to the number' | ||
' of columns of endog! %d > %d' % | ||
(n_factor, endog.shape[1])) | ||
(n_factor, k_endog)) | ||
self.n_factor = n_factor | ||
|
||
if corr is None and endog is None: | ||
|
@@ -74,13 +82,29 @@ def __init__(self, endog, n_factor, corr=None, method='pa', smc=True, | |
# Check validity of corr | ||
if corr is not None: | ||
if corr.shape[0] != corr.shape[1]: | ||
raise ValueError('Correlation matrix corr must be a square') | ||
if endog is not None and endog.shape[1] != corr.shape[0]: | ||
raise ValueError('The number of columns in endog must be ' | ||
'equal to the number of columns and rows corr') | ||
self.corr = corr | ||
raise ValueError('Correlation matrix corr must be a square ' | ||
'(rows %d != cols %d)' % corr.shape) | ||
if endog is not None and k_endog != corr.shape[0]: | ||
raise ValueError('The number of columns in endog (=%d) must be ' | ||
'equal to the number of columns and rows corr (=%d)' | ||
% (k_endog, corr.shape[0])) | ||
if endog_names is None: | ||
if hasattr(corr, 'index'): | ||
endog_names = corr.index | ||
if hasattr(corr, 'columns'): | ||
endog_names = corr.columns | ||
self.endog_names = endog_names | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. corr could be a pandas DataFrame |
||
|
||
if corr is not None: | ||
self.corr = np.asarray(corr) | ||
else: | ||
self.corr = None | ||
|
||
# Check validity of n_obs | ||
if n_obs is not None: | ||
if endog is not None and endog.shape[0] != n_obs: | ||
raise ValueError('n_obs must be equal to the number of rows in endog') | ||
|
||
# Do not preprocess endog if None | ||
if endog is not None: | ||
super(Factor, self).__init__(endog, exog=None, missing=missing) | ||
|
@@ -96,7 +120,12 @@ def endog_names(self): | |
if self.endog is not None: | ||
return self.data.ynames | ||
else: | ||
return None | ||
d = 0 | ||
n = self.corr.shape[0] - 1 | ||
while n > 0: | ||
d += 1 | ||
n //= 10 | ||
return [('var%0' + str(d) + 'd') % i for i in range(self.corr.shape[0])] | ||
|
||
@endog_names.setter | ||
def endog_names(self, value): | ||
|
@@ -112,24 +141,32 @@ def endog_names(self, value): | |
else: | ||
self._endog_names = None | ||
|
||
def fit(self, n_max_iter=50, tolerance=1e-6): | ||
def fit(self, maxiter=50, tol=1e-8): | ||
""" | ||
Extract factors | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. fit is the public function and needs the full docstring. maxiter and tol need renaming. |
||
|
||
Parameters | ||
---------- | ||
maxiter : int | ||
Maximum number of iterations for iterative estimation algorithms | ||
tol : float | ||
Stopping criterion (error tolerance) for iterative estimation algorithms | ||
|
||
""" | ||
if self.method == 'pa': | ||
return self._fit_pa(n_max_iter=n_max_iter, tolerance=tolerance) | ||
return self._fit_pa(maxiter=maxiter, tol=tol) | ||
else: | ||
raise ValueError("Unknown factor extraction approach '%s'" % self.method) | ||
|
||
def _fit_pa(self, n_max_iter=50, tolerance=1e-6): | ||
def _fit_pa(self, maxiter=50, tol=1e-8): | ||
""" | ||
Extract factors using the iterative principal axis method | ||
|
||
Parameters | ||
---------- | ||
n_max_iter : int | ||
maxiter : int | ||
Maximum number of iterations for communality estimation | ||
tolerance : float | ||
tol : float | ||
If `norm(communality - last_communality) < tol`, | ||
estimation stops | ||
|
||
|
@@ -146,12 +183,12 @@ def _fit_pa(self, n_max_iter=50, tolerance=1e-6): | |
raise ValueError('n_factor must be smaller or equal to the rank' | ||
' of endog! %d > %d' % | ||
(self.n_factor, self.n_comp)) | ||
if n_max_iter <= 0: | ||
if maxiter <= 0: | ||
raise ValueError('n_max_iter must be larger than 0! %d < 0' % | ||
(n_max_iter)) | ||
if tolerance <= 0 or tolerance > 0.01: | ||
(maxiter)) | ||
if tol <= 0 or tol > 0.01: | ||
raise ValueError('tolerance must be larger than 0 and smaller than' | ||
' 0.01! Got %f instead' % (tolerance)) | ||
' 0.01! Got %f instead' % (tol)) | ||
|
||
# Initial communality estimation | ||
if self.smc: | ||
|
@@ -161,7 +198,7 @@ def _fit_pa(self, n_max_iter=50, tolerance=1e-6): | |
|
||
# Iterative communality estimation | ||
eigenvals = None | ||
for i in range(n_max_iter): | ||
for i in range(maxiter): | ||
# Get eigenvalues/eigenvectors of R with diag replaced by | ||
# communality | ||
for j in range(len(R)): | ||
|
@@ -183,7 +220,7 @@ def _fit_pa(self, n_max_iter=50, tolerance=1e-6): | |
# Calculate new loadings and communality | ||
A = V.dot(sL) | ||
c = np.power(A, 2).sum(axis=1) | ||
if norm(c_last - c) < tolerance: | ||
if norm(c_last - c) < tol: | ||
break | ||
|
||
self.eigenvals = eigenvals | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,17 +32,25 @@ | |
columns=['Loc', 'Basal', 'Occ', 'Max', 'id', 'alt']) | ||
|
||
|
||
def test_auto_col_name(): | ||
# Test auto generated variable names when endog_names is None | ||
mod = Factor(None, 2, corr=np.zeros([11, 11]),endog_names=None, | ||
smc=False) | ||
assert_array_equal(mod.endog_names, | ||
['var00', 'var01', 'var02', 'var03', 'var04', 'var05', | ||
'var06', 'var07', 'var08', 'var09', 'var10',]) | ||
|
||
|
||
def test_direct_corr_matrix(): | ||
# Test specifying the correlation matrix directly | ||
mod = Factor(None, 2, corr=np.corrcoef(X.iloc[:, 1:-1], rowvar=0), | ||
smc=False) | ||
results = mod.fit(tolerance=1e-10) | ||
results = mod.fit(tol=1e-10) | ||
a = np.array([[0.965392158864, 0.225880658666255], | ||
[0.967587154301, 0.212758741910989], | ||
[0.929891035996, -0.000603217967568], | ||
[0.486822656362, -0.869649573289374]]) | ||
assert_array_almost_equal(results.loadings, a, decimal=8) | ||
|
||
# Test set and get endog_names | ||
mod.endog_names = X.iloc[:, 1:-1].columns | ||
assert_array_equal(mod.endog_names, ['Basal', 'Occ', 'Max', 'id']) | ||
|
@@ -62,7 +70,7 @@ def test_example_compare_to_R_output(): | |
# No rotation without squared multiple correlations prior | ||
# produce same results as in R `fa` | ||
mod = Factor(X.iloc[:, 1:-1], 2, smc=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, the unit test is here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess I need better reading glasses. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No problem, just let me know if there is anything else that needs to be done. |
||
results = mod.fit(tolerance=1e-10) | ||
results = mod.fit(tol=1e-10) | ||
a = np.array([[0.965392158864, 0.225880658666255], | ||
[0.967587154301, 0.212758741910989], | ||
[0.929891035996, -0.000603217967568], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The naming is `nobs`, which we use in all models.