
Commit

new methods for preparing and fitting
unknown authored and unknown committed Aug 3, 2015
1 parent 1b9c30c commit 05234c2
Showing 3 changed files with 72 additions and 116 deletions.
179 changes: 69 additions & 110 deletions pathtools/markovchain.py
@@ -6,24 +6,19 @@

from __future__ import division

#import PathSim
#import csv

from collections import defaultdict, OrderedDict
import random
import collections
import operator
#import scipy.sparse as sp

import numpy as np
import sys
import math
#import operator
#from scipy import stats

from scipy.special import gammaln
from scipy.sparse import csr_matrix, coo_matrix
#from scipy.special import gamma
#import copy
#from random import choice
import itertools

import copy
import tables as tb
import warnings
@@ -63,7 +58,6 @@ def __init__(self, k=1, reverse=False, use_prior=False, reset=True, prior=1., s
self.reset_ = reset

self.state_count_ = state_count
self.states_initial_ = []
self.parameter_count_ = 0
self.observation_count_ = 0

@@ -73,6 +67,7 @@ def __init__(self, k=1, reverse=False, use_prior=False, reset=True, prior=1., s

#probabilities
self.transition_dict_ = defaultdict(lambda : defaultdict(float))
self.transition_dict_norm_ = None

self.prediction_position_dict_ = dict()
#self.states_ = dict()
@@ -97,17 +92,13 @@ def __init__(self, k=1, reverse=False, use_prior=False, reset=True, prior=1., s
raise Exception("Can't work with a specific alpha without vocabulary information!")
if self.specific_prior_ is not None and self.modus_ != "bayes":
raise Exception("Specific alpha only works mit Bayes modus!")
if self.specific_prior_ is not None and isinstance(self.specific_prior_, csr_matrix):
if self.specific_prior_.shape[0] != self.specific_prior_.shape[1]:
warnings.warn("Specific alpha dimensions are not the same. Only appropriate if one the matrix is 1xN for setting each row the same! Only works for csr_matrix!")


self.proba_from_unknown_ = 0
self.proba_to_unknown_ = dict()
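# Usage sketch (hedged): the class name is not visible in this diff, so
# `MarkovChain` below is only an assumed placeholder for whatever this
# module exports; only keyword arguments visible in the truncated
# signature above are passed.
#
#     from pathtools.markovchain import MarkovChain  # assumed name
#     mc = MarkovChain(k=1, reset=True, use_prior=False, prior=1.)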

def _dict_divider(self, d):
'''
Internal function for dict divider and smoothing
Internal function for dict dividing and smoothing
'''

if self.use_prior_ == True:
@@ -134,8 +125,6 @@ def _dict_divider(self, d):
for i, j in v.iteritems():
v[i] = j / s

##print "row sum: ", float(sum(v.values()))

def _dict_ranker(self, d):
'''
Apply ranks to a dict according to the values
@@ -161,39 +150,12 @@ def _dict_ranker(self, d):

return ranked_key_dict

def _distr_chips_row(self, matrix, chips):
'''
Helper function!
Do not use outside.
See: https://github.com/psinger/HypTrails
'''

matrix = (matrix / matrix.sum()) * chips

floored = matrix.floor()
rest_sum = int(chips - floored.sum())

matrix = matrix - floored

idx = matrix.data.argpartition(-rest_sum)[-rest_sum:]

i, j = matrix.nonzero()

i_idx = i[idx]
j_idx = j[idx]

if len(i_idx) > 0:
floored[i_idx, j_idx] += 1

floored.eliminate_zeros()

del matrix
return floored

def prepare_data(self, paths):
'''
preparing data
Function for preparing the data
ALWAYS CALL FIRST
:param paths: List of lists containing the individual paths
'''
states = set()
if self.reset_:
@@ -204,30 +166,20 @@ def prepare_data(self, paths):
for line in paths:
for ele in line:
states.add(ele)
##print self.state_distr_

self.states_initial_ = frozenset(states)


#self.state_count_ = math.pow(float(len(states)), self.k_)
if self.state_count_ is None:
self.state_count_ = float(len(states))

if self.state_count_ < float(len(states)):
raise Exception("You set the state_count too low!")

self.parameter_count_ = pow(self.state_count_, self.k_) * (self.state_count_ - 1)
#print "initial state count", self.state_count_
##print self.states_initial_

def fit(self, paths, ret=False):
def fit(self, paths):
'''
fitting the data and constructing MLE
ret = flag for returning the transition matrix
Fitting the data and constructing MLE
:param paths: List of lists containing the individual paths
'''
#print "====================="
#print "K: ", self.k_
#print "prior: ", self.alpha_

for line in paths:
if self.reset_:
@@ -246,14 +198,53 @@ def fit(self, paths, ret=False):
else:
self.transition_dict_[elemA][elemB] += 1

##print self.transition_dict_


if self.modus_ == "mle":
self._dict_divider(self.transition_dict_)
self.transition_dict_norm_ = copy.deepcopy(self.transition_dict_)
self._dict_divider(self.transition_dict_norm_)

if ret:
return self.transition_dict_
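# Usage sketch for the path-based pipeline (hedged: `MarkovChain` is an
# assumed class name, and the paths below are made up). prepare_data()
# has to run before fit(), as the docstrings above demand.
#
#     paths = [["a", "b", "c"], ["a", "c", "b"]]
#     mc = MarkovChain(k=1, reset=True)
#     mc.prepare_data(paths)
#     mc.fit(paths)
#     # with modus "mle" the normalized probabilities now live in
#     # mc.transition_dict_norm_, while raw counts stay in mc.transition_dict_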

def prepare_data_transitions(self, transitions):
'''
Alternative method for preparing the data if transitions are directly available
:param transitions: dict of dicts containing the individual transition counts
Note that the keys need to be tuples
ALWAYS CALL FIRST
'''
states = set()
if self.reset_:
states.add(RESET_STATE)
if self.state_count_ is not None:
self.state_count_ += 1

for k,v in transitions.iteritems():
states.add(k[0])
for k2 in v.keys():
states.add(k2)

self.states_initial_ = frozenset(states)

if self.state_count_ is None:
self.state_count_ = float(len(states))

if self.state_count_ < float(len(states)):
raise Exception("You set the state_count too low!")

self.parameter_count_ = pow(self.state_count_, self.k_) * (self.state_count_ - 1)


def fit_transitions(self, transitions, ret=False):
'''
Alternative method for fitting the data if transitions are directly available
:param transitions: dict of dicts containing the individual transition counts
Note that the keys need to be tuples
'''

self.transition_dict_ = transitions

if self.modus_ == "mle":
self.transition_dict_norm_ = copy.deepcopy(self.transition_dict_)
self._dict_divider(self.transition_dict_norm_)
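# Usage sketch for the new transition-count pipeline added in this commit
# (hedged: class name assumed). Outer keys must be tuples of length k,
# values map each target state to its observed count.
#
#     transitions = {("a",): {"b": 3, "c": 1}, ("b",): {"a": 2}}
#     mc = MarkovChain(k=1, reset=False)
#     mc.prepare_data_transitions(transitions)
#     mc.fit_transitions(transitions)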

def loglikelihood(self):
'''
@@ -264,27 +255,17 @@ def loglikelihood(self):
raise Exception("Loglikelihood calculation does not work with modus='bayes'")

likelihood = 0
prop_counter = 0

for path in self.paths_:
i = 0
for j in xrange(self.k_, len(path)):
elemA = tuple(path[i:j])
i += 1
elemB = path[j]
for k,v in self.transition_dict_.iteritems():
for x,c in v.iteritems():
if self.k_ == 0:
prop = self.transition_dict_[FAKE_ELEM][elemB]
prop = self.transition_dict_norm_[FAKE_ELEM][x]
else:
prop = self.transition_dict_[elemA][elemB]
likelihood += math.log(prop)
prop_counter += 1
prop = self.transition_dict_norm_[k][x]
likelihood += c * math.log(prop)

#print "likelihood", likelihood
#print "prop_counter", prop_counter
return likelihood
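# For reference: the sum above is the multinomial log-likelihood over the
# observed transition counts, roughly log L = sum over rows s and targets s'
# of n(s, s') * log p(s' | s), with p taken from transition_dict_norm_.
# Tiny worked sketch (hedged, smoothing off): counts {("a",): {"b": 3, "c": 1}}
# give p(b|a) = 0.75, p(c|a) = 0.25, so log L = 3*log(0.75) + 1*log(0.25).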


#@profile
def bayesian_evidence(self):
'''
Calculating the bayesian evidence
@@ -302,6 +283,8 @@ def bayesian_evidence(self):
if self.specific_prior_ is not None:
if isinstance(self.specific_prior_, csr_matrix):
is_csr = True
if self.specific_prior_.shape[0] != self.specific_prior_.shape[1]:
warnings.warn("Specific alpha dimensions are not the same. Only appropriate if one the matrix is 1xN for setting each row the same! Only works for csr_matrix!")
if self.specific_prior_.shape[0] == 1:
single_row = True
if self.reset_:
@@ -310,24 +293,16 @@ def bayesian_evidence(self):
else:
if self.specific_prior_.shape[1] < self.state_count_:
raise Exception("your specific prior needs to at least cover all states in the trails, shape mismatch")

elif isinstance(self.specific_prior_, tb.group.RootGroup):
is_hdf5 = True
else:
raise Exception("wrong specific prior format")



evidence = 0
counter = 0
i = 0

#only works for order 1 atm
# if self.reset_ == False:
# allkeys = frozenset(self.transition_dict_.keys())
# for s in self.states_initial_:
# if (s,) not in allkeys:
# self.transition_dict_[(s,)] = {}

tmp = 0

for k,v in self.transition_dict_.iteritems():
@@ -354,28 +329,24 @@ def bayesian_evidence(self):
indices = self.specific_prior_.indices[indptr_first:indptr_second]
indptr = np.array([0,indices.shape[0]])
if self.reset_:
shape = (1, self.state_count_-1)
shape = (1, self.state_count_+1)
else:
shape = (1, self.state_count_)
cx = csr_matrix((data, indices, indptr), shape=shape)


n_sum = sum(v.values())

if n_sum == 0.:
raise Exception("The row sum should not be zero, something went wrong here!")

prior_sum = 0

if cx is not None:
prior_sum += cx.sum()

prior_sum += int(self.state_count_) * self.alpha_

for x, c in v.iteritems():
prior = self.alpha_

# if empirical_prior > 0:
# prior += empirical_prior
if cx is not None and x != RESET_STATE:
idx = self.specific_prior_vocab_[x]
prior += cx[0, idx]
@@ -397,12 +368,8 @@ def bayesian_evidence(self):

evidence += (first_term + second_term)

#print "evidence", evidence
##print self.alpha_, empirical_prior, wrong_prior
##print "pseudo counts: ", counter
return evidence
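# For reference (hedged; the first_term/second_term computation is not fully
# expanded in this view): with a Dirichlet prior alpha over each row, the
# per-row marginal likelihood that gammaln is used for is typically
#   gammaln(sum_x alpha_x) - gammaln(sum_x (alpha_x + n_x))
#     + sum_x [ gammaln(alpha_x + n_x) - gammaln(alpha_x) ]
# summed over all rows of the transition counts. A specific prior can be
# supplied as a scipy.sparse.csr_matrix (optionally 1xN to broadcast the same
# pseudo counts to every row) together with specific_prior_vocab mapping each
# state to its column index.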


def predict_eval(self, test, eval="rank"):
'''
Evaluating via predicting sequences using MLE
@@ -417,11 +384,11 @@ def predict_eval(self, test, eval="rank"):
raise Exception("Prediction only works with smoothing on!")

if eval == "rank":
for k,v in self.transition_dict_.iteritems():
for k,v in self.transition_dict_norm_.iteritems():
#print v
self.prediction_position_dict_[k] = self._dict_ranker(v)

known_states = frozenset(self.transition_dict_.keys())
known_states = frozenset(self.transition_dict_norm_.keys())

for line in test:
#if self.k
@@ -445,7 +412,7 @@ def predict_eval(self, test, eval="rank"):
self.prediction_position_dict_[FAKE_ELEM][
FAKE_ELEM])
elif eval == "top":
row = self.transition_dict_[FAKE_ELEM]
row = self.transition_dict_norm_[FAKE_ELEM]
items = row.items()
random.shuffle(items)
row = OrderedDict(items)
@@ -474,7 +441,7 @@ def predict_eval(self, test, eval="rank"):
self.prediction_position_dict_[elem][
FAKE_ELEM])
elif eval == "top":
row = self.transition_dict_[elem]
row = self.transition_dict_norm_[elem]
items = row.items()
random.shuffle(items)
row = OrderedDict(items)
@@ -486,17 +453,9 @@ def predict_eval(self, test, eval="rank"):

position += p
counter += 1


average_pos = position / counter
##print "unknown elem counter", unknown_elem_counter
#print "counter", counter
#print "average position", average_pos
return average_pos
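# Usage sketch (hedged: class name and the train/test variables are assumed;
# smoothing must be enabled, as the check at the top of predict_eval enforces):
#
#     mc = MarkovChain(k=1, reset=True, use_prior=True, prior=1.)
#     mc.prepare_data(train_paths)
#     mc.fit(train_paths)
#     avg_rank = mc.predict_eval(test_paths, eval="rank")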







2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name='pathtools',
version='0.5',
version='0.6',
author='Philipp Singer',
author_email='philipp.singer@gesis.org',
packages=['pathtools'],
7 changes: 2 additions & 5 deletions tests/test_pathsim.py
@@ -13,12 +13,9 @@
print "==========="

sim = PathSim(window_size=window_size, sim_func="cosine", delimiter=" ")

<<<<<<< HEAD:tests/test_pathsim.py

sim.fit("../data/test_case_1")
=======
sim.fit("data/test_case_4")
>>>>>>> 937dda7df92974f40735aaa253c172267e17a7f0:test_pathsim.py


print sim.sim("1","1")
print sim.sim("1","2")
