In [2]:
%doctest_mode

Exception reporting mode: Plain
Doctest mode is: ON


In [5]:
from time import gmtime, strftime

In [None]:
import doctest
doctest.testmod()

In [6]:
from pyspark import StorageLevel
import pydoop.hdfs as hdfs

In [4]:
from math import log, exp
from functools import reduce

"""
sources: 
http://windowoffice.tumblr.com/post/33548509/logsum-underflow-trick-re-discovery
https://facwiki.cs.byu.edu/nlp/index.php/Log_Domain_Computations
https://mikelove.wordpress.com/2011/06/06/log-probabilities-trick/
"""
def log_add(logx,logy):

    if (logy > logx):
        logy,logx = logx,logy
        
    negdiff = logy - logx

    if negdiff < -20:
        return logx

    return logx + log(1.0 + exp(negdiff))

def sum_logs(logs):
    """
    return the log of total values
    >>> exp(sum_logs([log(0.1),log(0.4),log(0.6)]))
    1.1
    """
    return reduce(log_add,logs) 

def normalize_logs(logs):
    """
    convert the logs to normalized probablities
    >>>normalize_logs(array([-1000,-1000,-990]))
    array([  4.53958078e-05,   4.53958078e-05,   9.99909208e-01])
    """
    log_total = sum_logs(logs)
    logprobs = logs-log_total
    probs = np.exp(logprobs)
    return probs

def pick_random_points(logprobs,num_points):
    """
    picks random entries according to the logprobs vector
    >>>np.random.seed(24)
    >>>pick_random_points(array([log(0.1),log(0.1),log(0.5),log(0.3)]),num_points=10)
    array([3, 2, 3, 2, 2, 3, 3, 2, 1, 2])
    """
    probs = normalize_logs(logprobs)
    xk = np.arange(len(probs))
    custm = stats.rv_discrete(name='custm', values=(xk, probs))
    return custm.rvs(size=num_points)

def logprobs_to_normprobs(logprobs):
    probs = np.exp(logprobs)
    norm_prbs = probs/np.sum(probs)
    return norm_prbs

def logprobs_to_normprobs_safe(logprobs):
#     this version uses log addition
    logprob_total = sum_logs(logprobs)
    norm_logprbs = logprobs-logprob_total
    norm_prbs = np.exp(norm_logprbs)
    return norm_prbs

def calc_exp_log_prob(probs,logprobs):
    return np.sum([0 if prob<=0 else prob*logprob for prob,logprob in zip(probs,logprobs)])

In [1]:
from pyspark import SparkContext, SparkConf
sc = SparkContext(conf = SparkConf().setMaster("yarn-client").setAppName("app").set("spark.executor.memory", "512"))

In [2]:
END_STATE = sc.broadcast(100)
END_STATE_NAME = sc.broadcast("End")
START_STATE = sc.broadcast(0)
STATE_NAMES = sc.broadcast(['Start','frontpage','news','tech','local','opinion','on-air','misc','weather','msn-news','health','living','business','msn-sports','sports','summary','bbs','travel'])
num_partitions = 10
# num_clusters = sc.broadcast(10)

In [10]:
def pad_traj(traj):
    return [START_STATE.value]+list(traj)+[END_STATE.value,END_STATE.value]
def unpad_traj(traj):
    return traj[1:-2]
def state_to_name(state):
    if state==END_STATE:
        return END_STATE_NAME
    else:
        return STATE_NAMES[state]
def traj_to_namedstate(traj):
    return map(state_to_name,traj)

In [9]:
EA_SMOOTH_TRANSITION = 0.1

def create_smooth_transitions(states=[]):
    return {(i,j):EA_SMOOTH_TRANSITION for i in states for j in states}

def create_smooth_transitions_list(states=[]):
    return [((i,j),EA_SMOOTH_TRANSITION) for i in states for j in states]

In [8]:
def emit_ind_trans_prop(ind__traj__clsp_rob):
    '''
    >>> ind__traj__clsp_rob = (0, ([0, 1, 1, 100, 100], array([ 0.50891031,  0.26625345,  0.22483623]))))
    >>> emit_ind_trans_prop(ind__traj__clsp_rob)
    [((0, (0, 1)), 0.50891030999999998), ((0, (1, 1)), 0.50891030999999998), ((0, (1, 100)), 0.50891030999999998), ((0, (100, 100)), 0.50891030999999998), ((1, (0, 1)), 0.26625345), ((1, (1, 1)), 0.26625345), ((1, (1, 100)), 0.26625345), ((1, (100, 100)), 0.26625345), ((2, (0, 1)), 0.22483623), ((2, (1, 1)), 0.22483623), ((2, (1, 100)), 0.22483623), ((2, (100, 100)), 0.22483623)]
    '''
    index = ind__traj__clsp_rob[0]
    traj = ind__traj__clsp_rob[1][0]
    clusters_probs = ind__traj__clsp_rob[1][1]
    
    cl_step_propbs = []
    
    trans = zip(traj[:-1],traj[1:])
    for c_ind, cls_p in enumerate(clusters_probs):
        for step in trans:        
            cl_step_propbs.append(((c_ind, step), (cls_p, index)))
#             ini_p = smooth_trans_mtrx[step]
#             cl_step_propbs.append(((c_ind, step), (ini_p + cls_p, index)))
            #             yield ((c_ind, step), cls_p)
        
    return cl_step_propbs
    

In [7]:
def sort_list_based_on_another(base, dependent):
    '''
    >>> sort_list_based_on_another([0, 2, 1], [0.22, 0.55, 0.66])
    ([0, 1, 2], [0.22, 0.66, 0.55])
    '''
    z = zip(base, dependent)
    sz = sorted(z)
    return [t[0] for t in sz], [t[1] for t in sz]
    

In [72]:
msnbc_no_header = sc.textFile("/ea/msnbc_no_header.seq", num_partitions)

In [73]:
#read trajectories
list_traj = msnbc_no_header.map(lambda line: [int(i) for i in line.split()]) \
                           .filter(lambda seq: len(seq) <= 500) \
                           .map(pad_traj) \
                           .zipWithIndex() \
                           .map(lambda t: (t[1], t[0]))


In [156]:
list_traj.take(2)

[(0, [0, 1, 1, 100, 100]), (1, [0, 2, 100, 100])]

In [49]:
num_traj = 500
list_traj_probs = sc.parallelize(xrange(num_traj), num_partitions) \
                                .map(lambda v: np.random.rand(1,num_clusters.value)[0]) 
    
list_traj_probs.take(4)

[array([ 0.33029398,  0.4153317 ,  0.24605769,  0.17777304,  0.53953642,
        0.51564893,  0.91912959,  0.54545018,  0.01338867,  0.04689129]), array([ 0.00617019,  0.04893082,  0.12358357,  0.81829436,  0.32779294,
        0.54885038,  0.79694105,  0.40638564,  0.74990192,  0.75861914]), array([ 0.34184011,  0.41096995,  0.96619596,  0.72706043,  0.60755683,
        0.28279102,  0.50355959,  0.75314604,  0.49912015,  0.14816885]), array([ 0.0922865 ,  0.07525316,  0.83075079,  0.91919112,  0.15721844,
        0.09215071,  0.82578609,  0.96152506,  0.7221524 ,  0.73515981])]

In [2]:
def em_train2(list_traj, num_iterations=1):
    
    num_traj = list_traj.count()
    
    # measure_relative_state_size = states prior probs
    states = list_traj.flatMap(lambda states: states[1])
    states_counts = states.countByValue()
    states_counts_sum = sum(states_counts.values())
    states_dict = {k:float(v)/float(states_counts_sum)  for k,v in states_counts.iteritems()}
#     states = sc.parallelize([k:float(v)/float(states_counts_sum)  for k,v in states_counts.iteritems()])
#   [(step, EA_SMOOTH_TRANSITION), ]
    smooth_mtrx_list = sc.parallelize(create_smooth_transitions_list(states_dict)).flatMap(lambda t:[((c,t[0]), t[1]) for c in range(num_clusters.value)])
    
    #init trajectories probs (init_traj_probs
    # list_traj_probs = sc.parallelize(np.random.rand(num_traj,3),num_partitions).map(lambda vec: vec/np.sum(vec))
    #sasha: probably need to make in list_traj, so no need to join
    list_traj_probs = sc.parallelize(xrange(num_traj), num_partitions) \
                                .map(lambda v: np.random.rand(1,num_clusters.value)[0]) \
                                .map(lambda vec: vec/np.sum(vec)) \
                                .zipWithIndex() \
                                .map(lambda t: (t[1], t[0]))
        
    
    
#     global mikita_markov_models
    for i in range(num_iterations):
        
        print str(i) + ' iteration: ' + strftime("%Y-%m-%d %H:%M:%S", gmtime())
        
        # join trajectories and clusters initial random probs
        # ((index), ([traj], [cluster_prob]))
        list_traj__traj_probs = list_traj.join(list_traj_probs) 
        
        # ((cluster, trans), (p, index))
        # [((0, (0, 1)), (0.24919889991572008, 0)), ((0, (1, 1)), (0.24919889991572008, 0))]
        list_cls_trans__p_index = list_traj__traj_probs.flatMap(emit_ind_trans_prop)
        # ((cluster, trans), p)
        # [((0, (0, 1)), 0.24919889991572008), ((0, (1, 1)), 0.24919889991572008)]
        list_cls_trans__p = list_cls_trans__p_index.map(lambda t: (t[0],t[1][0]))
        list_cls_trans__p.persist(StorageLevel.MEMORY_AND_DISK)        
        # [((cluster, trans), p), ]  
        # [((0, (0, 1)), 0.15297377614141211), ((0, (1, 1)), 0.15297377614141211)
        list_cls_trans__p = list_cls_trans__p.union(smooth_mtrx_list)
        
        #################################  CALC MARKOV   ##############################
        
        # ((cluster, from), (trans, p))
        # [((0, 0), ((0, 3), 0.45565240865632561)), ((0, 3), ((3, 100), 0.45565240865632561))]
#         list_cls_from__trans_p = list_cls_trans__p.map(lambda t: ((t[0][0],t[0][1][0]), (t[0][1],t[1][0])))
        list_cls_from__trans_p = list_cls_trans__p.map(lambda t: ((t[0][0],t[0][1][0]), (t[0][1],t[1])))
        
        # ((cluster, from), p)
        # [((0, 0), 0.45565240865632561), ((0, 3), 0.45565240865632561)]
        list_cls_from__p = list_cls_from__trans_p.map(lambda t: (t[0], t[1][1]))
        # ((claster, from), summ)
        # [((1, 3), 88045.381738379481), ((1, 13), 93509.65969107172), ((0, 100), 428820.63873856084), ((2, 10), 53400.141021192758)]
        cls_from__summ = list_cls_from__p.reduceByKey(lambda x,y: x+y)
    
        # ((cluster,trans), sum)
        # [((2, (16, 100)), 452.38469331913268), ((0, (15, 5)), 1081.6800474165561), ((0, (8, 8)), 142241.31289751496)]
#         cls_trans__summ = list_cls_trans__p.map(lambda t: ((t[0][0],t[0][1]), t[1][0])) \
        cls_trans__summ = list_cls_trans__p.map(lambda t: ((t[0][0],t[0][1]), t[1])) \
                                                .reduceByKey(lambda x,y: x+y)

#         print cls_trans__summ.take(3)
        
        #  ((cluster, from), ((trans, trans_sum), from_summ))
        # [((2, 12), (((12, 10), 1019.2083959882245), 113267.37822605985)), ((2, 12), (((12, 2), 3666.330047976217), 113267.37822605985))
        cls_from__transsum_summ = cls_trans__summ.map(lambda t: ((t[0][0],t[0][1][0]), (t[0][1], t[1]))) \
                                                .join(cls_from__summ)

#         print cls_from__transsum_summ.take(3)
            
        # [((2, (0, 4)), 0.05168807562712003), ((2, (0, 16)), 0.00042905633895048338), ((2, (0, 6)), 0.16451598030626441)]
        markov_models = cls_from__transsum_summ.map(lambda t: ((t[0][0], t[1][0][0]), t[1][0][1]/t[1][1]))
        
#         markov_models_TF = cls_from__transsum_summ.map(lambda t: ((t[0][0], t[1][0][0]), t[1][0][1] > t[1][1]))
        
#         print markov_models_TF.filter(lambda t: t[1] == False).collect()
        
        #################################  CALC NEW PROBS   ##############################
        
        
        # [((2, (16, 100)), ((0.42948178892760447, 264), 0.043625160028027611)), ((2, (16, 100)), ((0.52153298549282001, 262602), 0.043625160028027611))]
        list_cls_trans__p_index_mrkp = list_cls_trans__p_index.join(markov_models)
        list_cls_trans__p_index_mrkp.persist(StorageLevel.MEMORY_ONLY)
        
        # ((cluster, index), (trans, logP))
        # [((0, 789061), ((11, 7), -5.004076355127463)), ((0, 789061), ((11, 7), -5.004076355127463))]
        list_cls_index__trans_logmrkp = list_cls_trans__p_index_mrkp.map(lambda t: ((t[0][0],t[1][0][1]), (t[0][1], math.log(t[1][1]))))

        # walk_logprobs
        # ((cluster, index), [(trans1, logP1),((trans2, logP2))])
        # [ ((0, 784470), [((9, 100), -1.3163148315938333), ((0, 9), -2.717692054383914), ((100, 100), 0.0)])]
        list_cls_index__grp_trans_logmrkp = list_cls_index__trans_logmrkp.groupByKey()
        
        # calc_walk_probs
        # (index, (cluster, walk_logprob))
        # [(784470, (0, -4.0340068859777469)), (950019, (1, -2.8974930003028816))
        cls_index__sumlogmrkp = list_cls_index__grp_trans_logmrkp.map(lambda t: (t[0][1], (t[0][0], sum([tr_logp[1] for tr_logp in t[1]]))))
        
        # [(index, [(cluster1, walk_logprob1), (cluster2, walk_logprob2), (cluster3, walk_logprob3)], ]      
        # [(0, [(0, -3.7289041683043873), (2, -3.7284874541593469), (1, -3.7306682864068987)], (655362, <pyspark.resultiterable.ResultIterable object at 0x7f01bf534790>)]
        index__cls_sumlogmrkp = cls_index__sumlogmrkp.groupByKey()
        
        # cluster_probs = logprobs_to_normprobs_safe
        # (index, [p1,p2,p3])
        # [(655362, [0.3332682408091523, 0.3329057626978422, 0.333825996493006]), ]
        list_traj_probs = index__cls_sumlogmrkp.map(lambda t: (t[0], \
                                    ([c_logp[0] for c_logp in t[1]], \
                                     logprobs_to_normprobs_safe(array([c_logp[1] for c_logp in t[1]]))))) \
                                  .map(lambda t: (t[0], (sort_list_based_on_another(t[1][0], t[1][1]))[1] ))
        
        
        
        list_cls_trans__p.unpersist()
        
    list_assignment = list_traj_probs.map(lambda t: (t[0], np.argmax(t[1])))
    return markov_models, list_assignment

In [135]:
print strftime("%Y-%m-%d %H:%M:%S", gmtime())
mk = em_train2(list_traj, 2)
print mk.collect()

2015-07-23 07:09:17
0 iteration: 2015-07-23 07:09:32
1 iteration: 2015-07-23 07:09:32


KeyboardInterrupt: 

#Random Data test

In [5]:
from pandas import DataFrame

In [3]:
def calc_next_step(trans_vec):
    next_step_vec = np.random.multinomial(1, trans_vec, size=1)[0]
    return np.sum(next_step_vec*range(len(trans_vec)))

def generate_test_trajs(trans_mtrx,num_trajs=100,traj_len=10):
    trajs = []
    for i in range(num_trajs):
        test_traj = generate_test_traj(trans_mtrx,traj_len)
        trajs.append(test_traj)
    return trajs
def generate_test_traj(trans_mtrx,traj_len=10):
    cur_pos = 0
    test_traj = [cur_pos]
    for j in range(traj_len):
        next_pos = calc_next_step(trans_mtrx[cur_pos])
        test_traj.append(next_pos)
        cur_pos = next_pos
    return array(test_traj)

In [4]:
# trans_mtrx = [[0.1,0.8,0.1],[0.4,0.2,0.4],[0.1,0.2,0.7]]
trans_mtrx = [[0.8,0.1,0.1],[1,0,0],[0.9,0.1,0]]
def generate_tarjs_per_cluster(list_trans_mtrx,num_trajs=100,traj_len=20):
    trajs = []
    labels = []
    for index,trans_mtrx in enumerate(list_trans_mtrx):
        cur_test_trajs = generate_test_trajs(trans_mtrx,num_trajs,traj_len)
        trajs += cur_test_trajs
        labels += [index]*num_trajs
    
    return DataFrame(data={'label':labels,'traj':trajs})

In [15]:
trans_mtrx0 = [[0.8,0.1,0.05,0.05],[0.95,0,0,0.05],[0.9,0.05,0,0.05],[0.9,0.05,0,0.05]]
trans_mtrx1 = [[0.1,0.8,0.05,0.05],[0,0.7,0.25,0.05],[0,0.75,0.1,0.15],[0,0.7,0.25,0.05]]
trans_mtrx2 = [[0.1,0.15,0.7,0.05],[0.05,0.1,0.8,0.05],[0.2,0.15,0.6,0.05],[0.1,0.15,0.7,0.05]]
list_trans_mtrx = [trans_mtrx0,trans_mtrx1,trans_mtrx2]
num_trajs=100
df_traj = generate_tarjs_per_cluster(list_trans_mtrx,num_trajs=num_trajs,traj_len=40)
df_traj = df_traj.iloc[np.random.permutation(len(df_traj))]
df_traj = df_traj.reset_index(drop=True)
df_traj['is_train']=True
df_traj.ix[df_traj.index<int(len(df_traj)*0.2),'is_train']=False

In [6]:
num_clusters=sc.broadcast(3)
train_trajs = df_traj[df_traj.is_train==True].traj
test_trajs = df_traj[df_traj.is_train==True].traj
sc.
# train_walks = [traj_to_walk(traj) for traj in train_trajs]
# test_walks = [traj_to_walk(traj) for traj in test_trajs]

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.readBroadcastFromFile.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:102)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1044)
	at org.apache.spark.api.java.JavaSparkContext.broadcast(JavaSparkContext.scala:648)
	at org.apache.spark.api.python.PythonRDD$.readBroadcastFromFile(PythonRDD.scala:399)
	at org.apache.spark.api.python.PythonRDD.readBroadcastFromFile(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:207)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
states = [0,1,2,3]
train_walks = trajs_to_walks(train_trajs,states)
test_walks = trajs_to_walks(test_trajs,states)

In [207]:
train_trajsRDD = sc.parallelize(train_trajs.tolist(),num_partitions) \
                        .map(lambda t: t.tolist()) \
                        .zipWithIndex() \
                        .map(lambda t: (t[1], t[0]))
                        
    
# train_trajsRDD.take(3)
# train_trajsRDD.collect()

In [232]:
markovsRDD, assignmentsRDD = em_train2(train_trajsRDD,5)
assignments = assignmentsRDD.collect()
markovs_by_cls = markovsRDD.map(lambda t: (t[0][0],(t[0][1],t[1]))).groupByKey().collect()

0 iteration: 2015-07-23 09:06:35
1 iteration: 2015-07-23 09:06:35
2 iteration: 2015-07-23 09:06:35
3 iteration: 2015-07-23 09:06:35
4 iteration: 2015-07-23 09:06:35


KeyboardInterrupt: 

In [None]:
markovs_dict = {}
for c in markovs_by_cls:    
    m = {tr_p[0]:tr_p[1] for tr_p in c[1]}
    markovs_dict[c[0]] = m
markovs_dict

In [229]:
assignments.sort(key=lambda t: t[0])
list_assignment = [ass[1] for ass in assigments]


In [226]:
df_train = df_traj[df_traj.is_train]

In [230]:
real_assign = range(3)
df = DataFrame({'real':array(df_traj[df_traj.is_train==True].label),'assign':list_assignment})
real_assign = range(3)
df['compare'] = df.apply(lambda x: real_assign[x['real']]==x['assign'],axis=1)
len(df[df.compare])

93

In [231]:
# all_combs = itertools.chain(*(itertools.combinations(filter_vars, i) for i in range(1,3)))
import itertools
assigns = {}
for real_assign in itertools.permutations(range(3)):
    df['compare'] = df.apply(lambda x: real_assign[x['real']]==x['assign'],axis=1)
    num_eq = len(df[df.compare])
    assigns[real_assign] = num_eq

import operator
print max(assigns.iteritems(), key=operator.itemgetter(1))

((0, 1, 2), 93)


In [50]:
# arr = range(100)


init = sc.parallelize(xrange(10000), 3)

for i in range(5):
    print i
    c = sc.broadcast(1)
    init = init.map(lambda n: n+c.value)
    
# init.collect()


0
1
2
3
4


In [60]:
rdd = sc.parallelize(xrange(100), 3)
rdd1 = rdd.map(lambda t: ({'from':t, 'to':t+1}, {t}))
rdd2 = rdd1.map(lambda t:  (t[0]['from'], {t[1]*10}))
rdd2.take(4)

[(0, 0), (1, 10), (2, 20), (3, 30)]

In [65]:
rdd = sc.parallelize(xrange(10), 3)
rdd1 = rdd.map(lambda t: ({'from':t, 'to':t+1}, {t}))

rdd2 = rdd1.map(lambda t: (t[0]['from'], {t[1]*10}))
rdd3 = rdd2.union([({'from':222}, 9999)])
rdd3.collect()

AttributeError: 'list' object has no attribute '_jrdd_deserializer'

In [61]:
print sys.getsizeof((1,2))
print sys.getsizeof({'from':1, 'to':2})

72
280
