# Notebook for testing and exploring objectives in tesser

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from scipy import optimize
from tesser import network
from tesser import util
from tesser import sr
from tesser import fit

In [2]:
data_dir = "/home/rodrigo/Dropbox/tesser_successor/Data/"

In [3]:
x = util.load_struct_subject(data_dir, 101)

In [4]:
%timeit sr.clearn_sr(x, 0.5, 0.5,21)

131 µs ± 1.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [5]:
%load_ext Cython

convert function to cython

In [47]:
induct_all = util.load_induct(data_dir)
struct_all = util.load_struct(data_dir)

subj_filter = f'SubjNum == {101}'
induct_df = induct_all.query(subj_filter)
struct_df = struct_all.query(subj_filter)

n_states = len(np.unique(struct_df.objnum))

# get community matrix
net = network.temp_node_info()
comm = 1 - distance.squareform(distance.pdist(net['comm'][:, None], 'hamming'))

now to attempt fixing fitting times

In [48]:
def param_bounds(var_bounds, var_names):
    """Pack group-level parameters."""

    group_lb = [var_bounds[k][0] for k in var_names]
    group_ub = [var_bounds[k][1] for k in var_names]
    bounds = optimize.Bounds(group_lb, group_ub)
    return bounds

    
def fit_induct(struct_df, induct_df, fixed, var_names, var_bounds, n_states,
               f_optim=optimize.differential_evolution,
               verbose=False, options=None, comm=[]):
    """Fit induction data for one subject.

    For a given set of parameters, the structure learning task is used
    to generate a simulated SR matrix. Then this matrix is used to 
    simulate responses in the induction task. Parameters are optimized
    to obtain the set that maximizes the probability of the responses
    observed in the induction task.

    Parameters
    ----------
    struct_df : DataFrame
        Structure learning data.

    induct_df : DataFrame
        Induction test data.

    fixed : dict
        Parameter values for all fixed parameters.

    var_names : list
        String name for each variable parameter.

    var_bounds : dict
        Bounds (in low, high order) for each variable parameter.

    f_optim : function
        Function to use for parameter optimization.

    verbose : Boolean
        If true, more information about the search will be printed.

    options : dict
        Options to pass to f_optim.

    use_run : tuple
        Run to take the SR matrix from for predicting induction data,
        specified as (part_number, run_number).

    Returns
    -------
    param : dict
        Best-fitting parameters.

    logl : float
        Maximum log likelihood.
    """

    if options is None:
        options = {}

    param = fixed.copy()
    subjects = struct_df.SubjNum.unique()

    def f_fit(x):
        param.update(dict(zip(var_names, x)))
        # draft code to fit at the group level:
        logl = 0
        for subject in subjects:
            subj_filter = f'SubjNum == {subject}'
            subj_struct = struct_df.query(subj_filter)
            subj_induct = induct_df.query(subj_filter)
            subj_logl = fit.cmain(subj_struct,subj_induct, **param, n_states=n_states,
                                                     return_trial=False, comm=comm)
            logl += subj_logl
        # logl = get_induction_log_likelihood(struct_df, induct_df, **param,
        #                                     return_trial=False, use_run=use_run)
        return -logl

    bounds = param_bounds(var_bounds, var_names)
    res = f_optim(f_fit, bounds, disp=verbose, **options)

    # fitted parameters
    param = fixed.copy()
    param.update(dict(zip(var_names, res['x'])))

    logl = -res['fun']
    return param, logl

In [53]:
fixed = {'w': 1.0}
var_names = ['gamma', 'alpha', 'tau']
var_bounds = {'alpha': [0, 1], 'gamma': [0, 1], 'tau': [0, 10]}
fit_induct(struct_all, induct_all, fixed, var_names, var_bounds, 
           n_states=n_states, verbose=False, comm=comm)

({'w': 1.0,
  'gamma': 0.9915182972903186,
  'alpha': 0.04573502900941932,
  'tau': 1.0664064743190262},
 -1053.598065597301)

In [54]:
print(fit.cmain(struct_df, induct_df, 0.99, 0.02, 1.0, 1.0,n_states, True, comm=comm))

(-28.89193189195182, <MemoryView of 'ndarray' at 0x7f1cdad67890>)


In [55]:
import time
start_time =time.time()
fit.cmain(struct_df, induct_df, 0.5, 0.5, 0.5, 1.0, n_states, True, comm=comm)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.0020296573638916016 seconds ---


In [56]:
import time
start_time =time.time()
fit.get_induction_log_likelihood_hybrid(struct_df, induct_df, 0.5, 0.5, 0.5, 1.0)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.04805946350097656 seconds ---


In [36]:
%timeit fit.cmain(struct_df, induct_df, 0.5, 0.5, 0.5, 1.0, n_states, True, comm=comm)

1.39 ms ± 55.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [57]:
%timeit fit_induct(struct_all, induct_all, fixed, var_names, var_bounds,n_states=n_states, verbose=False, comm=comm)

1min 2s ± 8.57 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
