Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Add factor analysis #3294

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
08cb2d1
Add factor analysis
yl565 Nov 26, 2016
eab1b6b
Add factor_rotation package by @mvds314
yl565 Nov 26, 2016
1df21ff
Add factor_rotation package by mvds314, modify to fit statsmodel
yl565 Nov 27, 2016
fdc6695
Add factor rotation
yl565 Nov 27, 2016
1648c00
Add scree and variance explained plots
yl565 Nov 27, 2016
0c35793
Add loading pattern plot
yl565 Nov 27, 2016
7327de7
Add FactorResults
yl565 Nov 27, 2016
efbe717
Add docstrings for Factor and FactorResults class
yl565 Nov 27, 2016
dd72bac
Input validation
yl565 Nov 27, 2016
224aae5
Move factor_rotation unit test to /tests folder
yl565 Nov 27, 2016
9689409
Fix pep8 problems
yl565 Nov 27, 2016
c7c4503
Move plots to plots.py for re-usability
yl565 Nov 27, 2016
1a09d8c
Fix factor_rotation flakes problem
yl565 Nov 27, 2016
bd4aae8
Modify comments
yl565 Nov 27, 2016
471abd2
Fix py2 compatibility
yl565 Nov 16, 2017
e78e4f0
Fix py2 compatibility in test_rotation.py
yl565 Nov 16, 2017
82411e5
Use np.corrcoef instead of pd.DataFrame.corr
yl565 Nov 17, 2017
596a5c2
Use np.linalg.eigh instead of eig for symmetric matrix R
yl565 Nov 17, 2017
a0275bb
Structure change. Do plot and rotate in the results class.
yl565 Nov 17, 2017
b4ecb6b
Add reference Hofacker 2004
yl565 Dec 10, 2017
5d8d555
Add support to switch factor extraction approach
yl565 Dec 10, 2017
e5b4f00
Add support to use correlation matrix directly
yl565 Dec 11, 2017
626a6b0
Add tests
yl565 Dec 11, 2017
eedb869
Add n_obs, add docstrings, add variable auto naming
yl565 Dec 11, 2017
4fa1309
Improve docstrings
yl565 Dec 11, 2017
47ef9d2
change n_obs -> nobs, add a test for nobs
yl565 Dec 11, 2017
653a43f
Add "Status: experimental"
yl565 Dec 12, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 58 additions & 21 deletions statsmodels/multivariate/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,26 @@ class Factor(Model):
endog_names: str
Names of endogeous variables.
If specified, it will be used instead of the column names in endog
n_obs: int
The number of observations. To be used together with `corr`
Should be equals to the number of rows in `endog`.

"""
def __init__(self, endog, n_factor, corr=None, method='pa', smc=True,
missing='drop', endog_names=None):
# CHeck validity of n_factor
missing='drop', endog_names=None, n_obs=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

naming is nobs, which we use in all model

if endog is not None:
k_endog = endog.shape[1]
elif corr is not None:
k_endog = corr.shape[0]

# Check validity of n_factor
if n_factor <= 0:
raise ValueError('n_factor must be larger than 0! %d < 0' %
(n_factor))
if endog is not None and n_factor > endog.shape[1]:
if endog is not None and n_factor > k_endog:
raise ValueError('n_factor must be smaller or equal to the number'
' of columns of endog! %d > %d' %
(n_factor, endog.shape[1]))
(n_factor, k_endog))
self.n_factor = n_factor

if corr is None and endog is None:
Expand All @@ -74,13 +82,29 @@ def __init__(self, endog, n_factor, corr=None, method='pa', smc=True,
# Check validity of corr
if corr is not None:
if corr.shape[0] != corr.shape[1]:
raise ValueError('Correlation matrix corr must be a square')
if endog is not None and endog.shape[1] != corr.shape[0]:
raise ValueError('The number of columns in endog must be '
'equal to the number of columns and rows corr')
self.corr = corr
raise ValueError('Correlation matrix corr must be a square '
'(rows %d != cols %d)' % corr.shape)
if endog is not None and k_endog != corr.shape[0]:
raise ValueError('The number of columns in endog (=%d) must be '
'equal to the number of columns and rows corr (=%d)'
% (k_endog, corr.shape[0]))
if endog_names is None:
if hasattr(corr, 'index'):
endog_names = corr.index
if hasattr(corr, 'columns'):
endog_names = corr.columns
self.endog_names = endog_names
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

corr could be a pandas DataFrame
We need self.corr = np.asarray(corr) to make sure that we have an array for further processing.
endog_names could be taken from the index or column of the DataFrame,
e.g. if hasattr(corr, index): endog_names=...


if corr is not None:
self.corr = np.asarray(corr)
else:
self.corr = None

# Check validity of n_obs
if n_obs is not None:
if endog is not None and endog.shape[0] != n_obs:
raise ValueError('n_obs must be equal to the number of rows in endog')

# Do not preprocess endog if None
if endog is not None:
super(Factor, self).__init__(endog, exog=None, missing=missing)
Expand All @@ -96,7 +120,12 @@ def endog_names(self):
if self.endog is not None:
return self.data.ynames
else:
return None
d = 0
n = self.corr.shape[0] - 1
while n > 0:
d += 1
n //= 10
return [('var%0' + str(d) + 'd') % i for i in range(self.corr.shape[0])]

@endog_names.setter
def endog_names(self, value):
Expand All @@ -112,24 +141,32 @@ def endog_names(self, value):
else:
self._endog_names = None

def fit(self, n_max_iter=50, tolerance=1e-6):
def fit(self, maxiter=50, tol=1e-8):
"""
Extract factors
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fit is the public function and needs the full docstring

maxiter and tol need renaming


Parameters
----------
maxiter : int
Maximum number of iterations for iterative estimation algorithms
tol : float
Stopping critera (error tolerance) for iterative estimation algorithms

"""
if self.method == 'pa':
return self._fit_pa(n_max_iter=n_max_iter, tolerance=tolerance)
return self._fit_pa(maxiter=maxiter, tol=tol)
else:
raise ValueError("Unknown factor extraction approach '%s'" % self.method)

def _fit_pa(self, n_max_iter=50, tolerance=1e-6):
def _fit_pa(self, maxiter=50, tol=1e-8):
"""
Extract factors using the iterative principal axis method

Parameters
----------
n_max_iter : int
maxiter : int
Maximum number of iterations for communality estimation
tolerance : float
tol : float
If `norm(communality - last_communality) < tolerance`,
estimation stops

Expand All @@ -146,12 +183,12 @@ def _fit_pa(self, n_max_iter=50, tolerance=1e-6):
raise ValueError('n_factor must be smaller or equal to the rank'
' of endog! %d > %d' %
(self.n_factor, self.n_comp))
if n_max_iter <= 0:
if maxiter <= 0:
raise ValueError('n_max_iter must be larger than 0! %d < 0' %
(n_max_iter))
if tolerance <= 0 or tolerance > 0.01:
(maxiter))
if tol <= 0 or tol > 0.01:
raise ValueError('tolerance must be larger than 0 and smaller than'
' 0.01! Got %f instead' % (tolerance))
' 0.01! Got %f instead' % (tol))

# Initial communality estimation
if self.smc:
Expand All @@ -161,7 +198,7 @@ def _fit_pa(self, n_max_iter=50, tolerance=1e-6):

# Iterative communality estimation
eigenvals = None
for i in range(n_max_iter):
for i in range(maxiter):
# Get eigenvalues/eigenvectors of R with diag replaced by
# communality
for j in range(len(R)):
Expand All @@ -183,7 +220,7 @@ def _fit_pa(self, n_max_iter=50, tolerance=1e-6):
# Calculate new loadings and communality
A = V.dot(sL)
c = np.power(A, 2).sum(axis=1)
if norm(c_last - c) < tolerance:
if norm(c_last - c) < tol:
break

self.eigenvals = eigenvals
Expand Down
14 changes: 11 additions & 3 deletions statsmodels/multivariate/tests/test_factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,25 @@
columns=['Loc', 'Basal', 'Occ', 'Max', 'id', 'alt'])


def test_auto_col_name():
# Test auto generated variable names when endog_names is None
mod = Factor(None, 2, corr=np.zeros([11, 11]),endog_names=None,
smc=False)
assert_array_equal(mod.endog_names,
['var00', 'var01', 'var02', 'var03', 'var04', 'var05',
'var06', 'var07', 'var08', 'var09', 'var10',])


def test_direct_corr_matrix():
# Test specifying the correlation matrix directly
mod = Factor(None, 2, corr=np.corrcoef(X.iloc[:, 1:-1], rowvar=0),
smc=False)
results = mod.fit(tolerance=1e-10)
results = mod.fit(tol=1e-10)
a = np.array([[0.965392158864, 0.225880658666255],
[0.967587154301, 0.212758741910989],
[0.929891035996, -0.000603217967568],
[0.486822656362, -0.869649573289374]])
assert_array_almost_equal(results.loadings, a, decimal=8)

# Test set and get endog_names
mod.endog_names = X.iloc[:, 1:-1].columns
assert_array_equal(mod.endog_names, ['Basal', 'Occ', 'Max', 'id'])
Expand All @@ -62,7 +70,7 @@ def test_example_compare_to_R_output():
# No rotation without squared multiple correlations prior
# produce same results as in R `fa`
mod = Factor(X.iloc[:, 1:-1], 2, smc=False)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, the unit test is here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I need better reading glasses.
Thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem, just let me know if there is anything else need to be done

results = mod.fit(tolerance=1e-10)
results = mod.fit(tol=1e-10)
a = np.array([[0.965392158864, 0.225880658666255],
[0.967587154301, 0.212758741910989],
[0.929891035996, -0.000603217967568],
Expand Down