Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

index exception when ANALYZING table with one column #31

Closed
fsaad opened this issue Jun 5, 2015 · 2 comments
Closed

index exception when ANALYZING table with one column #31

fsaad opened this issue Jun 5, 2015 · 2 comments

Comments

@fsaad
Copy link
Collaborator

fsaad commented Jun 5, 2015

I create a CSV file with one column of data (named "c0") and 20 values. When I run ANALYZE, an exception is thrown.

Here is a reproducible example of the procedure:

import bayeslite
import bayeslite.crosscat
import numpy as np
import math
import random
import sys

from crosscat.MultiprocessingEngine import MultiprocessingEngine
from bayeslite.shell.pretty import pp_cursor

def pprint(cursor):
    """Pretty-print the rows of *cursor* to standard output.

    Thin convenience wrapper around ``pp_cursor``; whatever ``pp_cursor``
    returns is passed back to the caller.
    """
    out = sys.stdout
    return pp_cursor(out, cursor)

# Minimal reproduction: build a one-column table, create a crosscat generator
# for it, then INITIALIZE, ANALYZE and SIMULATE.
if __name__ == '__main__':
    # create one column of data, save to data.csv, with header c0
    t = 20  # number of rows; also used to name the table and the generator
    data = np.random.rand(t)
    # Reshape the flat array into a (t, 1) column.
    data = data.reshape(len(data),1)
    # comments='' keeps savetxt from prefixing the header line with '# ',
    # so the first line of the file is just the column name.
    np.savetxt('data.csv', data, header='c0', comments='')

    btable = "table{}".format(t)
    generator = "table{}_cc".format(t)

    # Open a bayesdb handle and register a crosscat metamodel backed by
    # MultiprocessingEngine.
    bdb = bayeslite.bayesdb_open()
    engine = bayeslite.crosscat.CrosscatMetamodel(
        MultiprocessingEngine())
    bayeslite.bayesdb_register_metamodel(bdb, engine)
    # Load the CSV into a newly created table, taking column names from
    # the header row.
    bayeslite.bayesdb_read_csv_file(bdb, btable, "data.csv",
                                    header=True, create=True)

    # NOTE(review): identifiers and counts below are substituted into the
    # BQL text with str.format. They are fixed values defined in this
    # script, not external input.

    # Print the loaded table.
    bql = '''
    SELECT * FROM {}
    '''.format(btable)
    c = bdb.execute(bql)
    pprint(c)


    # Declare a generator over the table with its single numerical column.
    bql = '''
    CREATE GENERATOR {} FOR {}
        USING crosscat (
           c0 NUMERICAL
        );
    '''.format(generator, btable)
    bdb.execute(bql)

    # Create 10 models for the generator.
    bql = '''
    INITIALIZE {} MODELS FOR {};
    '''.format(10, generator)
    bdb.execute(bql)

    # Run 10 analysis iterations. The reported IndexError is raised by the
    # execute() call for this ANALYZE statement.
    bql = '''
    ANALYZE {} for {} ITERATIONS WAIT;
    '''.format(generator, 10)
    bdb.execute(bql)

    # Simulate 15 values of c0 from the generator into a temporary table.
    bql = '''
    CREATE TEMP TABLE simres AS
        SIMULATE c0 FROM {}
        LIMIT {};
    '''.format(generator, 15)
    bdb.execute(bql)

    # Read the simulated rows back, inside a savepoint, as a numpy array.
    bql = 'SELECT * FROM simres;'
    simdata = None
    with bdb.savepoint():
        c = bdb.execute(bql)
        simdata = np.array(c.fetchall())

And here is the stack trace:

In [19]: run one_col.py
             c0
---------------
  0.21819395493
 0.930373567089
 0.725379439808
 0.691447842751
 0.261562572085
 0.948943970262
  0.46605176487
0.0151432877238
 0.441854759811
 0.665655889346
0.0765081395686
 0.447978645136
 0.825578309208
 0.500403070452
 0.658746843184
 0.843358329166
 0.248048357726
  0.79623218477
 0.526216988005
 0.875729646947
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
/home/fsaad/Documents/pcp/bayeslite/experiments/one_col.py in <module>()
     55     ANALYZE {} for {} ITERATIONS WAIT;
     56     '''.format(generator, 10)
---> 57     bdb.execute(bql)
     58 
     59     bql = '''

/usr/local/lib/python2.7/dist-packages/bayeslite-0.1.dev-py2.7.egg/bayeslite/bayesdb.pyc in execute(self, string, bindings)
    149         if more:
    150             raise ValueError('>1 phrase in string')
--> 151         return bql.execute_phrase(self, phrase, bindings)
    152 
    153     def sql_execute(self, string, bindings=None):

/usr/local/lib/python2.7/dist-packages/bayeslite-0.1.dev-py2.7.egg/bayeslite/bql.pyc in execute_phrase(bdb, phrase, bindings)
    553             max_seconds=phrase.seconds,
    554             ckpt_iterations=phrase.ckpt_iterations,
--> 555             ckpt_seconds=phrase.ckpt_seconds)
    556         return empty_cursor(bdb)
    557 

/usr/local/lib/python2.7/dist-packages/bayeslite-0.1.dev-py2.7.egg/bayeslite/crosscat.pyc in analyze_models(self, bdb, generator_id, modelnos, iterations, max_seconds, ckpt_iterations, ckpt_seconds)
    588                         X_L=X_L_list,
    589                         X_D=X_D_list,
--> 590                         n_steps=n_steps,
    591                     )
    592                     if iterations is not None:

/usr/local/lib/python2.7/dist-packages/CrossCat-0.1.8-py2.7-linux-x86_64.egg/crosscat/LocalEngine.pyc in analyze(self, M_c, T, X_L, X_D, kernel_list, n_steps, c, r, max_iterations, max_time, do_diagnostics, diagnostics_every_N, ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID, N_GRID, do_timing, CT_KERNEL)
    267             diagnostics_dict = munge_diagnostics(diagnostics_dict_list)
    268             if reprocess_diagnostics_func is not None:
--> 269                 diagnostics_dict = reprocess_diagnostics_func(diagnostics_dict)
    270             ret_tuple = ret_tuple + (diagnostics_dict, )
    271         if do_timing:

/usr/local/lib/python2.7/dist-packages/CrossCat-0.1.8-py2.7-linux-x86_64.egg/crosscat/utils/diagnostic_utils.pyc in default_reprocess_diagnostics_func(diagnostics_arr_dict)
     50     # column_paritition_assignments are column, iter, chain
     51     D = column_partition_assignments.shape[0] - 1
---> 52     f_z_statistic_0_1 = column_partition_assignments_to_f_z_statistic(column_partition_assignments, 1, 0)
     53     f_z_statistic_0_D = column_partition_assignments_to_f_z_statistic(column_partition_assignments, D, 0)
     54     diagnostics_arr_dict['f_z[0, 1]'] = f_z_statistic_0_1

/usr/local/lib/python2.7/dist-packages/CrossCat-0.1.8-py2.7-linux-x86_64.egg/crosscat/utils/diagnostic_utils.pyc in column_partition_assignments_to_f_z_statistic(column_partition_assignments, j, i)
     43     iter_column_chain_arr = column_partition_assignments.transpose((1, 0, 2))
     44     helper = lambda column_chain_arr: column_chain_to_ratio(column_chain_arr, j, i)
---> 45     as_list = map(helper, iter_column_chain_arr)
     46     return numpy.array(as_list)[:, numpy.newaxis]
     47 

/usr/local/lib/python2.7/dist-packages/CrossCat-0.1.8-py2.7-linux-x86_64.egg/crosscat/utils/diagnostic_utils.pyc in <lambda>(column_chain_arr)
     42         j, i=0):
     43     iter_column_chain_arr = column_partition_assignments.transpose((1, 0, 2))
---> 44     helper = lambda column_chain_arr: column_chain_to_ratio(column_chain_arr, j, i)
     45     as_list = map(helper, iter_column_chain_arr)
     46     return numpy.array(as_list)[:, numpy.newaxis]

/usr/local/lib/python2.7/dist-packages/CrossCat-0.1.8-py2.7-linux-x86_64.egg/crosscat/utils/diagnostic_utils.pyc in column_chain_to_ratio(column_chain_arr, j, i)
     32 
     33 def column_chain_to_ratio(column_chain_arr, j, i=0):
---> 34     chain_i_j = column_chain_arr[[i, j], :]
     35     is_same = numpy.diff(chain_i_j, axis=0)[0] == 0
     36     n_chains = len(is_same)

IndexError: index 1 is out of bounds for axis 0 with size 1
@riastradh-probcomp
Copy link
Contributor

Little aside: Please don't substitute strings directly into SQL/BQL queries! In most cases, you should be using query parameters:

# Values are supplied as bindings for the `?` placeholders instead of being
# formatted into the query string.
cursor = bdb.execute('SIMULATE x, y, z FROM t GIVEN w = ? LIMIT ?', ('zot', 42))

If you absolutely must do string substitution, e.g. because you need to substitute a table name, use sqlite3_quote_name from bayeslite.sqlite3_util:

# Quote each identifier before substituting it into the query text.
qt = sqlite3_quote_name(table_name)
qcn = sqlite3_quote_name(column_name)
# The column goes in the SELECT list and the table after FROM.
bql = 'SELECT %s FROM %s' % (qcn, qt)

(This should be exposed by the bayeslite module -- that it is not is an API mistake.)

riastradh-probcomp added a commit that referenced this issue Jun 5, 2015
Fixes Github issue #31.  Requires Crosscat 0.1.9.

Dynamically ascertaining whether these tests should fail based on the
Crosscat version was too much trouble to implement.
@riastradh-probcomp
Copy link
Contributor

Fixed in 9e87fc5.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants