From 05234c26368d5f70c266edc33de9a5cbf093e34f Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 3 Aug 2015 12:36:49 +0200 Subject: [PATCH] new methods for preparing and fitting --- pathtools/markovchain.py | 179 +++++++++++++++------------------------ setup.py | 2 +- tests/test_pathsim.py | 7 +- 3 files changed, 72 insertions(+), 116 deletions(-) diff --git a/pathtools/markovchain.py b/pathtools/markovchain.py index 4d65353..910f95d 100644 --- a/pathtools/markovchain.py +++ b/pathtools/markovchain.py @@ -6,24 +6,19 @@ from __future__ import division -#import PathSim -#import csv + from collections import defaultdict, OrderedDict import random import collections import operator -#import scipy.sparse as sp + import numpy as np import sys import math -#import operator -#from scipy import stats + from scipy.special import gammaln from scipy.sparse import csr_matrix, coo_matrix -#from scipy.special import gamma -#import copy -#from random import choice -import itertools + import copy import tables as tb import warnings @@ -63,7 +58,6 @@ def __init__(self, k=1, reverse=False, use_prior=False, reset=True, prior=1., s self.reset_ = reset self.state_count_ = state_count - self.states_initial_ = [] self.parameter_count_ = 0 self.observation_count_ = 0 @@ -73,6 +67,7 @@ def __init__(self, k=1, reverse=False, use_prior=False, reset=True, prior=1., s #probabilities self.transition_dict_ = defaultdict(lambda : defaultdict(float)) + self.transition_dict_norm_ = None self.prediction_position_dict_ = dict() #self.states_ = dict() @@ -97,17 +92,13 @@ def __init__(self, k=1, reverse=False, use_prior=False, reset=True, prior=1., s raise Exception("Can't work with a specific alpha without vocabulary information!") if self.specific_prior_ is not None and self.modus_ != "bayes": raise Exception("Specific alpha only works mit Bayes modus!") - if self.specific_prior_ is not None and isinstance(self.specific_prior_, csr_matrix): - if self.specific_prior_.shape[0] != self.specific_prior_.shape[1]: - warnings.warn("Specific alpha dimensions are not the same. Only appropriate if one the matrix is 1xN for setting each row the same! Only works for csr_matrix!") - self.proba_from_unknown_ = 0 self.proba_to_unknown_ = dict() def _dict_divider(self, d): ''' - Internal function for dict divider and smoothing + Internal function for dict dividing and smoothing ''' if self.use_prior_ == True: @@ -134,8 +125,6 @@ def _dict_divider(self, d): for i, j in v.iteritems(): v[i] = j / s - ##print "row sum: ", float(sum(v.values())) - def _dict_ranker(self, d): ''' Apply ranks to a dict according to the values @@ -161,39 +150,12 @@ def _dict_ranker(self, d): return ranked_key_dict - def _distr_chips_row(self, matrix, chips): - ''' - Helper class! - Do not use outside. - See: https://github.com/psinger/HypTrails - ''' - - matrix = (matrix / matrix.sum()) * chips - - floored = matrix.floor() - rest_sum = int(chips - floored.sum()) - - matrix = matrix - floored - - idx = matrix.data.argpartition(-rest_sum)[-rest_sum:] - - i, j = matrix.nonzero() - - i_idx = i[idx] - j_idx = j[idx] - - if len(i_idx) > 0: - floored[i_idx, j_idx] += 1 - - floored.eliminate_zeros() - - del matrix - return floored def prepare_data(self, paths): ''' - preparing data + Function for preparing the data ALWAYS CALL FIRST + :param paths: List of lists containing the individual paths ''' states = set() if self.reset_: @@ -204,12 +166,7 @@ def prepare_data(self, paths): for line in paths: for ele in line: states.add(ele) - ##print self.state_distr_ - self.states_initial_ = frozenset(states) - - - #self.state_count_ = math.pow(float(len(states)), self.k_) if self.state_count_ is None: self.state_count_ = float(len(states)) @@ -217,17 +174,12 @@ def prepare_data(self, paths): raise Exception("You set the state_count too low!") self.parameter_count_ = pow(self.state_count_, self.k_) * (self.state_count_ - 1) - #print "initial state count", self.state_count_ - ##print self.states_initial_ - def fit(self, paths, ret=False): + def fit(self, paths): ''' - fitting the data and constructing MLE - ret = flag for returning the transition matrix + Fitting the data and constructing MLE + :param paths: List of lists containing the individual paths ''' - #print "=====================" - #print "K: ", self.k_ - #print "prior: ", self.alpha_ for line in paths: if self.reset_: @@ -246,14 +198,53 @@ def fit(self, paths, ret=False): else: self.transition_dict_[elemA][elemB] += 1 - ##print self.transition_dict_ - if self.modus_ == "mle": - self._dict_divider(self.transition_dict_) + self.transition_dict_norm_ = copy.deepcopy(self.transition_dict_) + self._dict_divider(self.transition_dict_norm_) - if ret: - return self.transition_dict_ + + def prepare_data_transitions(self, transitions): + ''' + Alternative method for preparing the data if transitions are directly available + :param transitions: dict of dicts containing the individual transition counts + Note that the keys need to be tuples + ALWAYS CALL FIRST + ''' + states = set() + if self.reset_: + states.add(RESET_STATE) + if self.state_count_ is not None: + self.state_count_ += 1 + + for k,v in transitions.iteritems(): + states.add(k[0]) + for k2 in v.keys(): + states.add(k2) + + self.states_initial_ = frozenset(states) + + if self.state_count_ is None: + self.state_count_ = float(len(states)) + + if self.state_count_ < float(len(states)): + raise Exception("You set the state_count too low!") + + self.parameter_count_ = pow(self.state_count_, self.k_) * (self.state_count_ - 1) + + + def fit_transitions(self, transitions, ret=False): + ''' + Alternative method for fitting the data if transitions are directly available + :param transitions: dict of dicts containing the individual transition counts + Note that the keys need to be tuples + ''' + + self.transition_dict_ = transitions + + if self.modus_ == "mle": + self.transition_dict_norm_ = copy.deepcopy(self.transition_dict_) + self._dict_divider(self.transition_dict_norm_) def loglikelihood(self): ''' @@ -264,27 +255,17 @@ def loglikelihood(self): raise Exception("Loglikelihood calculation does not work with modus='bayes'") likelihood = 0 - prop_counter = 0 - for path in self.paths_: - i = 0 - for j in xrange(self.k_, len(path)): - elemA = tuple(path[i:j]) - i += 1 - elemB = path[j] + for k,v in self.transition_dict_.iteritems(): + for x,c in v.iteritems(): if self.k_ == 0: - prop = self.transition_dict_[FAKE_ELEM][elemB] + prop = self.transition_dict_norm_[FAKE_ELEM][x] else: - prop = self.transition_dict_[elemA][elemB] - likelihood += math.log(prop) - prop_counter += 1 + prop = self.transition_dict_norm_[k][x] + likelihood += c * math.log(prop) - #print "likelihood", likelihood - #print "prop_counter", prop_counter return likelihood - - #@profile def bayesian_evidence(self): ''' Calculating the bayesian evidence @@ -302,6 +283,8 @@ def bayesian_evidence(self): if self.specific_prior_ is not None: if isinstance(self.specific_prior_, csr_matrix): is_csr = True + if self.specific_prior_.shape[0] != self.specific_prior_.shape[1]: + warnings.warn("Specific alpha dimensions are not the same. Only appropriate if one the matrix is 1xN for setting each row the same! Only works for csr_matrix!") if self.specific_prior_.shape[0] == 1: single_row = True if self.reset_: @@ -310,24 +293,16 @@ def bayesian_evidence(self): else: if self.specific_prior_.shape[1] < self.state_count_: raise Exception("your specific prior needs to at least cover all states in the trails, shape mismatch") + elif isinstance(self.specific_prior_, tb.group.RootGroup): is_hdf5 = True else: raise Exception("wrong specific prior format") - - evidence = 0 counter = 0 i = 0 - #only works for order 1 atm - # if self.reset_ == False: - # allkeys = frozenset(self.transition_dict_.keys()) - # for s in self.states_initial_: - # if (s,) not in allkeys: - # self.transition_dict_[(s,)] = {} - tmp = 0 for k,v in self.transition_dict_.iteritems(): @@ -354,28 +329,24 @@ def bayesian_evidence(self): indices = self.specific_prior_.indices[indptr_first:indptr_second] indptr = np.array([0,indices.shape[0]]) if self.reset_: - shape = (1, self.state_count_-1) + shape = (1, self.state_count_+1) else: shape = (1, self.state_count_) cx = csr_matrix((data, indices, indptr), shape=shape) - n_sum = sum(v.values()) if n_sum == 0.: raise Exception("The row sum should not be zero, something went wrong here!") prior_sum = 0 - if cx is not None: prior_sum += cx.sum() - prior_sum += int(self.state_count_) * self.alpha_ + for x, c in v.iteritems(): prior = self.alpha_ - # if empirical_prior > 0: - # prior += empirical_prior if cx is not None and x != RESET_STATE: idx = self.specific_prior_vocab_[x] prior += cx[0, idx] @@ -397,12 +368,8 @@ def bayesian_evidence(self): evidence += (first_term + second_term) - #print "evidence", evidence - ##print self.alpha_, empirical_prior, wrong_prior - ##print "pseudo counts: ", counter return evidence - def predict_eval(self, test, eval="rank"): ''' Evaluating via predicting sequencies using MLE @@ -417,11 +384,11 @@ def predict_eval(self, test, eval="rank"): raise Exception("Prediction only works with smoothing on!") if eval == "rank": - for k,v in self.transition_dict_.iteritems(): + for k,v in self.transition_dict_norm_.iteritems(): #print v self.prediction_position_dict_[k] = self._dict_ranker(v) - known_states = frozenset(self.transition_dict_.keys()) + known_states = frozenset(self.transition_dict_norm_.keys()) for line in test: #if self.k @@ -445,7 +412,7 @@ def predict_eval(self, test, eval="rank"): self.prediction_position_dict_[FAKE_ELEM][ FAKE_ELEM]) elif eval == "top": - row = self.transition_dict_[FAKE_ELEM] + row = self.transition_dict_norm_[FAKE_ELEM] items = row.items() random.shuffle(items) row = OrderedDict(items) @@ -474,7 +441,7 @@ def predict_eval(self, test, eval="rank"): self.prediction_position_dict_[elem][ FAKE_ELEM]) elif eval == "top": - row = self.transition_dict_[elem] + row = self.transition_dict_norm_[elem] items = row.items() random.shuffle(items) row = OrderedDict(items) @@ -486,17 +453,9 @@ def predict_eval(self, test, eval="rank"): position += p counter += 1 - average_pos = position / counter ##print "unknown elem counter", unknown_elem_counter #print "counter", counter #print "average position", average_pos return average_pos - - - - - - - \ No newline at end of file diff --git a/setup.py b/setup.py index 6bba538..8c9b4d0 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pathtools', - version='0.5', + version='0.6', author='Philipp Singer', author_email='philipp.singer@gesis.org', packages=['pathtools'], diff --git a/tests/test_pathsim.py b/tests/test_pathsim.py index 2fbcf16..45a7bd0 100644 --- a/tests/test_pathsim.py +++ b/tests/test_pathsim.py @@ -13,12 +13,9 @@ print "===========" sim = PathSim(window_size=window_size, sim_func="cosine", delimiter=" ") - -<<<<<<< HEAD:tests/test_pathsim.py + sim.fit("../data/test_case_1") -======= - sim.fit("data/test_case_4") ->>>>>>> 937dda7df92974f40735aaa253c172267e17a7f0:test_pathsim.py + print sim.sim("1","1") print sim.sim("1","2")