In [1]:
%doctest_mode

Exception reporting mode: Plain
Doctest mode is: ON


In [2]:
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import string
from datetime import datetime, date, time

import re
%pylab inline
import numpy as np
import scipy as sci
import scipy.stats as stats
from scipy.optimize import leastsq

from collections import Counter

import operator

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [3]:
from time import gmtime, strftime

In [None]:
import doctest
# Run the doctests found in the docstrings of whatever is currently defined in the
# interactive namespace. NOTE(review): this cell sits above the function cells, so on a
# fresh top-to-bottom run there is presumably nothing to test yet -- confirm the intended position.
doctest.testmod()

In [4]:
from pyspark import StorageLevel
import pydoop.hdfs as hdfs

In [5]:
from math import log, exp
from functools import reduce

"""
sources: 
http://windowoffice.tumblr.com/post/33548509/logsum-underflow-trick-re-discovery
https://facwiki.cs.byu.edu/nlp/index.php/Log_Domain_Computations
https://mikelove.wordpress.com/2011/06/06/log-probabilities-trick/
"""
def log_add(logx,logy):
    """
    Return log(exp(logx) + exp(logy)) while staying in the log domain.

    The larger operand is factored out, so exp() is only ever evaluated on a
    non-positive number and cannot overflow.

    logx, logy: natural logs of two non-negative quantities (-inf encodes zero).
    returns: natural log of the sum of the two quantities.

    >>> round(exp(log_add(log(0.1), log(0.4))), 12)
    0.5
    >>> log_add(float('-inf'), float('-inf'))
    -inf
    """
    from math import isinf  # local import: the notebook only imports log and exp from math

    # make logx the larger of the two operands
    if (logy > logx):
        logy,logx = logx,logy

    # If the larger operand is infinite the result is that operand. Without this guard
    # log(0) + log(0) computes (-inf) - (-inf) = nan below and returns nan.
    if isinf(logx):
        return logx

    negdiff = logy - logx

    # the smaller term is more than exp(20) times smaller than the larger one: ignore it
    if negdiff < -20:
        return logx

    return logx + log(1.0 + exp(negdiff))

def sum_logs(logs):
    """
    Fold a sequence of log-values into the log of the sum of the underlying values,
    combining the entries pairwise with log_add from left to right.

    >>> exp(sum_logs([log(0.1),log(0.4),log(0.6)]))
    1.1
    """
    log_of_total = reduce(lambda running, term: log_add(running, term), logs)
    return log_of_total

def normalize_logs(logs):
    """
    Convert unnormalized log-weights into normalized probabilities.

    The normalizer is computed in the log domain (sum_logs), so log-weights far
    below the underflow threshold of exp() are still handled.

    logs: sequence or numpy array of log-weights.
    returns: numpy array holding exp(logs - log(sum(exp(logs)))).

    >>> normalize_logs(np.array([-1000,-1000,-990]))
    array([  4.53958078e-05,   4.53958078e-05,   9.99909208e-01])
    """
    # accept plain lists/tuples as well: the subtraction below needs an ndarray
    logs = np.asarray(logs, dtype=float)
    log_total = sum_logs(logs)
    logprobs = logs-log_total
    probs = np.exp(logprobs)
    return probs

def pick_random_points(logprobs,num_points):
    """
    Draw random indices into `logprobs`, index i being drawn with probability
    proportional to exp(logprobs[i]).

    logprobs: array of (possibly unnormalized) log-probabilities.
    num_points: number of indices to draw.
    returns: numpy array of `num_points` integer indices in [0, len(logprobs)).

    >>> np.random.seed(24)
    >>> pick_random_points(np.array([log(0.1),log(0.1),log(0.5),log(0.3)]),num_points=10)
    array([3, 2, 3, 2, 2, 3, 3, 2, 1, 2])
    """
    probs = normalize_logs(logprobs)
    # support of the discrete distribution: the positions 0..len(probs)-1
    xk = np.arange(len(probs))
    custm = stats.rv_discrete(name='custm', values=(xk, probs))
    return custm.rvs(size=num_points)

def logprobs_to_normprobs(logprobs):
    """
    Exponentiate the log-probabilities and rescale them to sum to one.

    Naive variant: exp() is applied directly, so very negative inputs underflow
    to zero (see logprobs_to_normprobs_safe for the log-domain version).
    """
    unnormalized = np.exp(logprobs)
    return unnormalized / np.sum(unnormalized)

def logprobs_to_normprobs_safe(logprobs):
    """
    Normalize log-probabilities to probabilities, computing the normalizer with
    log-domain addition (sum_logs) before exponentiating.
    """
    shifted = logprobs - sum_logs(logprobs)
    return np.exp(shifted)

def calc_exp_log_prob(probs,logprobs):
    """
    Return sum_i probs[i] * logprobs[i], where entries with probs[i] <= 0
    contribute exactly 0 (so 0 * -inf never produces nan).
    """
    terms = []
    for weight, logp in zip(probs, logprobs):
        if weight <= 0:
            terms.append(0)
        else:
            terms.append(weight * logp)
    return np.sum(terms)

In [56]:
from pyspark import SparkContext, SparkConf
# Create the driver's SparkContext: master "yarn-client", application name "app",
# spark.executor.memory set to 1500M. Every later cell uses this `sc`.
sc = SparkContext(conf = SparkConf().setMaster("yarn-client").setAppName("app").set("spark.executor.memory", "1500M"))

In [53]:
# NOTE(review): on a top-to-bottom run this stops the context created in the cell above,
# while the following cells call sc.broadcast / sc.textFile. The execution counts suggest
# it was run before the context was (re)created -- move it to the end of the notebook or remove it.
sc.stop()


In [57]:
# Constants shipped to the executors. Every name below is a Broadcast handle;
# the payload is read with `.value`.
# State id marking the end of a trajectory (pad_traj appends it twice).
END_STATE = sc.broadcast(100)
# Display name for END_STATE.
END_STATE_NAME = sc.broadcast("End")
# State id that pad_traj prepends to every trajectory.
START_STATE = sc.broadcast(0)
# Display names, presumably indexed by state id (position 0 is 'Start') -- TODO confirm against the data set.
STATE_NAMES = sc.broadcast(['Start','frontpage','news','tech','local','opinion','on-air','misc','weather','msn-news','health','living','business','msn-sports','sports','summary','bbs','travel'])
# num_partitions = 3
# num_clusters = sc.broadcast(3)

In [8]:
def pad_traj(traj):
    """Return traj as a list with one START_STATE id in front and two END_STATE ids appended."""
    padded = [START_STATE.value]
    padded.extend(traj)
    padded.extend([END_STATE.value, END_STATE.value])
    return padded
def unpad_traj(traj):
    """Strip the padding added by pad_traj: drop the first element and the last two."""
    interior = slice(1, -2)
    return traj[interior]
def state_to_name(state):
    """
    Translate a numeric state id into its display name.

    state: integer state id.
    returns: END_STATE_NAME's value for the end state, otherwise the entry of
             STATE_NAMES at position `state`.
    """
    # END_STATE, END_STATE_NAME and STATE_NAMES are Broadcast handles: compare against,
    # return and index their `.value`, not the handle itself.
    if state == END_STATE.value:
        return END_STATE_NAME.value
    return STATE_NAMES.value[state]
def traj_to_namedstate(traj):
    """Return the list of display names for the state ids in traj (see state_to_name)."""
    return [state_to_name(state) for state in traj]

In [9]:
# Smoothing weight assigned to every (from, to) state pair by the
# create_smooth_transitions* helpers below.
EA_SMOOTH_TRANSITION = 0.1

def create_smooth_transitions(states=[]):
    """Return a dict mapping every ordered (from, to) pair of `states` to EA_SMOOTH_TRANSITION."""
    smooth = {}
    for src in states:
        for dst in states:
            smooth[(src, dst)] = EA_SMOOTH_TRANSITION
    return smooth

def create_smooth_transitions_list(states=[]):
    """
    Return a list of ((from, to), EA_SMOOTH_TRANSITION) tuples, one per ordered
    pair of `states`, in from-major order.
    """
    smooth = []
    for src in states:
        for dst in states:
            smooth.append(((src, dst), EA_SMOOTH_TRANSITION))
    return smooth

In [10]:
def emit_ind_trans_prop(ind__traj__clsp_rob):
    '''
    Expand one trajectory record into one entry per (cluster, transition) pair.

    ind__traj__clsp_rob: (index, (traj, clusters_probs)) where `traj` is the sequence of
        visited states and `clusters_probs` holds one membership weight per cluster.
    returns: list of ((cluster_index, (from_state, to_state)), (cluster_prob, index)),
        ordered by cluster first, then by position of the transition in the trajectory.

    >>> emit_ind_trans_prop((7, ([0, 1, 100], [0.25, 0.75])))
    [((0, (0, 1)), (0.25, 7)), ((0, (1, 100)), (0.25, 7)), ((1, (0, 1)), (0.75, 7)), ((1, (1, 100)), (0.75, 7))]
    '''
    index = ind__traj__clsp_rob[0]
    traj = ind__traj__clsp_rob[1][0]
    clusters_probs = ind__traj__clsp_rob[1][1]

    # Consecutive (from, to) pairs. Materialized as a list because it is walked once per
    # cluster; a one-shot zip iterator would be exhausted after the first cluster.
    trans = list(zip(traj[:-1], traj[1:]))

    return [((c_ind, step), (cls_p, index))
            for c_ind, cls_p in enumerate(clusters_probs)
            for step in trans]
    

In [11]:
def sort_list_based_on_another(base, dependent):
    '''
    Reorder two parallel sequences by ascending `base` value (ties fall back to
    comparing the `dependent` values) and return them as two lists.

    >>> sort_list_based_on_another([0, 2, 1], [0.22, 0.55, 0.66])
    ([0, 1, 2], [0.22, 0.66, 0.55])
    '''
    pairs = list(zip(base, dependent))
    pairs.sort()
    sorted_base = []
    sorted_dependent = []
    for key, value in pairs:
        sorted_base.append(key)
        sorted_dependent.append(value)
    return sorted_base, sorted_dependent
    

In [55]:
def em_train2(list_traj, num_iterations=1, num_clusters=3,num_partitions=3):
    """
    Cluster trajectories with a mixture of first-order Markov chains, trained by EM on Spark.

    Every iteration (1) turns the current per-trajectory cluster weights into weighted,
    smoothed transition counts and normalizes them per (cluster, from_state) into
    transition probabilities, then (2) re-scores every trajectory under every cluster
    and renormalizes the scores into new cluster weights.

    Relies on the notebook globals `sc`, `StorageLevel`, `TimeHelper` and the helper
    functions defined in the cells above.

    list_traj: pair RDD of (index, trajectory); the indices must match the 0..n-1 ids
        produced by zipWithIndex, because the initial weights are joined on them.
    num_iterations: number of EM iterations, at least 1.
    num_clusters: number of mixture components.
    num_partitions: partition count used by the partitionBy / join / coalesce steps.

    returns: (markov_models, list_assignment, persistedRDDs)
        markov_models   -- RDD of ((cluster, (from, to)), probability) from the last iteration
        list_assignment -- RDD of (index, index of the cluster with the largest weight)
        persistedRDDs   -- RDDs that are still persisted; the caller should unpersist them
    raises: ValueError if num_iterations < 1 (previously this surfaced as an unbound
        `markov_models` at the return statement, after the Spark jobs had already run).
    """
    # Local import: the notebook only does `from math import log, exp`; the `math`
    # module name itself was available only as a side effect of %pylab.
    import math

    if num_iterations < 1:
        raise ValueError("num_iterations must be >= 1")

    persistedRDDs = []
    num_part = num_partitions

    list_traj = list_traj.partitionBy(num_part).persist(StorageLevel.DISK_ONLY).setName('list_traj')
    persistedRDDs.append(list_traj)

    num_traj = list_traj.count()

    # relative frequency of every state over all trajectories (state priors)
    states = list_traj.flatMap(lambda states: states[1])
    states_counts = states.countByValue()
    states_counts_sum = sum(states_counts.values())
    states_dict = {k:float(v)/float(states_counts_sum)  for k,v in states_counts.iteritems()}

    # [((cluster, (from, to)), EA_SMOOTH_TRANSITION), ...] for every cluster and every pair of observed states
    smooth_mtrx_list = sc.parallelize(create_smooth_transitions_list(states_dict)).flatMap(lambda t:[((c,t[0]), t[1]) for c in range(num_clusters)]) \
                                                                                  .persist(StorageLevel.MEMORY_AND_DISK) \
                                                                                  .setName('smooth_mtrx_list')

    # initial cluster weights: one random vector per trajectory, normalized to sum to 1
    # (index, [p_cluster0, p_cluster1, ...])
    #sasha: probably need to make in list_traj, so no need to join
    list_traj_probs = sc.parallelize(xrange(num_traj), num_partitions) \
                                .map(lambda v: np.random.rand(1,num_clusters)[0]) \
                                .map(lambda vec: vec/np.sum(vec)) \
                                .zipWithIndex() \
                                .map(lambda t: (t[1], t[0])) \
                                .partitionBy(num_part).persist(StorageLevel.DISK_ONLY) \
                                .setName('list_traj_probs')

    persistedRDDs.append(list_traj_probs)

    t = TimeHelper()

    for i in range(num_iterations):

        # join trajectories with their current cluster weights
        # (index, ([traj], [cluster_prob]))
        list_traj__traj_probs = list_traj.join(list_traj_probs, num_part)

        # ((cluster, trans), (p, index))
        # [((0, (0, 1)), (0.24919889991572008, 0)), ((0, (1, 1)), (0.24919889991572008, 0))]
        list_cls_trans__p_index = list_traj__traj_probs.flatMap(emit_ind_trans_prop)

        # ((cluster, trans), p)
        # [((0, (0, 1)), 0.24919889991572008), ((0, (1, 1)), 0.24919889991572008)]
        list_cls_trans__p = list_cls_trans__p_index.map(lambda t: (t[0],t[1][0]))

        # add the smoothing entries
        # [((cluster, trans), p), ]
        # [((0, (0, 1)), 0.15297377614141211), ((0, (1, 1)), 0.15297377614141211)
        list_cls_trans__p = list_cls_trans__p.union(smooth_mtrx_list) \
                                            .partitionBy(num_part*2) \
                                            .persist(StorageLevel.DISK_ONLY) \
                                            .setName('list_cls_trans__p'+str(i))
        persistedRDDs.append(list_cls_trans__p)

        # NOTE(review): nothing of this iteration has been materialized yet (the first
        # action is the count() below), so the previous weights are presumably
        # recomputed from their lineage -- confirm this early unpersist is intended.
        list_traj_probs.unpersist()
        #################################  CALC MARKOV   ##############################

        # ((cluster, from), (trans, p))
        # [((0, 0), ((0, 3), 0.45565240865632561)), ((0, 3), ((3, 100), 0.45565240865632561))]
        list_cls_from__trans_p = list_cls_trans__p.map(lambda t: ((t[0][0],t[0][1][0]), (t[0][1],t[1])))

        # ((cluster, from), p)
        # [((0, 0), 0.45565240865632561), ((0, 3), 0.45565240865632561)]
        list_cls_from__p = list_cls_from__trans_p.map(lambda t: (t[0], t[1][1]))
        # ((cluster, from), sum)
        # [((1, 3), 88045.381738379481), ((1, 13), 93509.65969107172), ((0, 100), 428820.63873856084), ((2, 10), 53400.141021192758)]
        cls_from__summ = list_cls_from__p.reduceByKey(lambda x,y: x+y) \
                                         .coalesce(num_part)

        # ((cluster, trans), sum)
        # [((2, (16, 100)), 452.38469331913268), ((0, (15, 5)), 1081.6800474165561), ((0, (8, 8)), 142241.31289751496)]
        cls_trans__summ = list_cls_trans__p.map(lambda t: ((t[0][0],t[0][1]), t[1])) \
                                           .reduceByKey(lambda x,y: x+y) \
                                           .coalesce(num_part)

        # ((cluster, from), ((trans, trans_sum), from_sum))
        # [((2, 12), (((12, 10), 1019.2083959882245), 113267.37822605985)), ((2, 12), (((12, 2), 3666.330047976217), 113267.37822605985))
        cls_from__transsum_summ = cls_trans__summ.map(lambda t: ((t[0][0],t[0][1][0]), (t[0][1], t[1]))) \
                                                .join(cls_from__summ, num_part)

        # ((cluster, trans), trans_sum / from_sum)
        # [((2, (0, 4)), 0.05168807562712003), ((2, (0, 16)), 0.00042905633895048338), ((2, (0, 6)), 0.16451598030626441)]
        markov_models = cls_from__transsum_summ.map(lambda t: ((t[0][0], t[1][0][0]), t[1][0][1]/t[1][1]))


        #################################  CALC NEW PROBS   ##############################

        # ((cluster, trans), ((p, index), markov_p))
        # [((2, (16, 100)), ((0.42948178892760447, 264), 0.043625160028027611)), ((2, (16, 100)), ((0.52153298549282001, 262602), 0.043625160028027611))]
        list_cls_trans__p_index_mrkp = list_cls_trans__p_index.join(markov_models, num_part)

        # ((cluster, index), (trans, logP))
        # [((0, 789061), ((11, 7), -5.004076355127463)), ((0, 789061), ((11, 7), -5.004076355127463))]
        list_cls_index__trans_logmrkp = list_cls_trans__p_index_mrkp.map(lambda t: ((t[0][0],t[1][0][1]), (t[0][1], math.log(t[1][1]))))

        # walk_logprobs
        # ((cluster, index), [(trans1, logP1),((trans2, logP2))])
        # [ ((0, 784470), [((9, 100), -1.3163148315938333), ((0, 9), -2.717692054383914), ((100, 100), 0.0)])]
        list_cls_index__grp_trans_logmrkp = list_cls_index__trans_logmrkp.groupByKey()

        # calc_walk_probs: log-probability of the whole trajectory under each cluster
        # (index, (cluster, walk_logprob))
        # [(784470, (0, -4.0340068859777469)), (950019, (1, -2.8974930003028816))
        cls_index__sumlogmrkp = list_cls_index__grp_trans_logmrkp.map(lambda t: (t[0][1], (t[0][0], sum([tr_logp[1] for tr_logp in t[1]]))))

        # [(index, [(cluster1, walk_logprob1), (cluster2, walk_logprob2), (cluster3, walk_logprob3)], ]
        # [(0, [(0, -3.7289041683043873), (2, -3.7284874541593469), (1, -3.7306682864068987)], (655362, <pyspark.resultiterable.ResultIterable object at 0x7f01bf534790>)]
        index__cls_sumlogmrkp = cls_index__sumlogmrkp.groupByKey()

        # cluster_probs = logprobs_to_normprobs_safe, then put the weights back into cluster order
        # (index, [p1,p2,p3])
        # [(655362, [0.3332682408091523, 0.3329057626978422, 0.333825996493006]), ]
        list_traj_probs = index__cls_sumlogmrkp.map(lambda t: (t[0], \
                                    ([c_logp[0] for c_logp in t[1]], \
                                     logprobs_to_normprobs_safe(np.array([c_logp[1] for c_logp in t[1]]))))) \
                                  .map(lambda t: (t[0], (sort_list_based_on_another(t[1][0], t[1][1]))[1] )) \
                                  .partitionBy(num_part).persist(StorageLevel.DISK_ONLY) \
                                  .setName('list_traj_probs'+str(i))

        # first action of the iteration: materializes the new weights
        print(list_traj_probs.count())

        # NOTE(review): on the first pass this list also contains the input `list_traj`,
        # so later iterations run with it unpersisted -- confirm that is intended.
        for prdd in persistedRDDs:
            prdd.unpersist()
        del persistedRDDs[:]

        t.print_full_time_from_last()

    # still persisted at this point; handed back so the caller can release them
    persistedRDDs.append(list_traj_probs)
    persistedRDDs.append(smooth_mtrx_list)

    # hard assignment: position of the largest cluster weight
    list_assignment = list_traj_probs.map(lambda t: (t[0], np.argmax(t[1])))
    return markov_models, list_assignment, persistedRDDs

In [13]:
def part_pers(rdd, partitions=3, persist=StorageLevel.MEMORY_AND_DISK, name=''):
    """
    Repartition a pair RDD, persist it and optionally name it.

    rdd: pair RDD (partitionBy is applied to it).
    partitions: number of partitions passed to partitionBy.
    persist: storage level passed to persist().
    name: RDD name; left unset when empty.
    returns: the partitioned, persisted RDD. The original fell off the end of the
             function and returned None, so the prepared RDD was lost to the caller.
    """
    rdd = rdd.partitionBy(partitions)
    rdd = rdd.persist(persist)
    if name != '':
        rdd = rdd.setName(name)
    return rdd
    
    
    

In [23]:
class TimeHelper(object):
    """
    Small stopwatch for timing notebook steps: remembers the last checkpoint and
    reports the wall-clock time elapsed since it.
    """

    def __init__(self, prefix_str="time"):
        # prefix_str: label printed in front of every report line
        self.prefix_str = prefix_str
        self.reset()

    @staticmethod
    def _now():
        """Return the current local time as a datetime.datetime."""
        # Imported locally so the class works whether the notebook's global `datetime`
        # name is bound to the module or to the class (`from datetime import datetime`).
        import datetime as _datetime
        return _datetime.datetime.now()

    def get_time_from_last(self):
        """Return the timedelta since the previous checkpoint and start a new checkpoint."""
        now = self._now()
        delta = now - self.last
        self.last = now
        return delta

    def print_time_from_last(self):
        """Print "<prefix>: <elapsed>" and start a new checkpoint."""
        # the call must go through self; a bare get_time_from_last() raises NameError
        print(self.prefix_str + ": " + str(self.get_time_from_last()))

    def print_full_time_from_last(self):
        """Print "<prefix>: <UTC timestamp> <elapsed>" and start a new checkpoint."""
        print(self.prefix_str + ": " + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " " + str(self.get_time_from_last()))

    def reset(self):
        """Restart timing from the current moment."""
        self.last = self._now()
    
def print_count(rdd, name):
    """Persist `rdd` with DISK_ONLY, count its elements and print "<name>: <count>"."""
    rdd.persist(StorageLevel.DISK_ONLY)
    label = name + ": "
    print(label + str(rdd.count()))

#MSNBC Data

In [58]:
# Read the MSNBC sequence file as an RDD of text lines, requesting at least 6 partitions.
msnbc_no_header = sc.textFile("/ea/msnbc_no_header.seq", 6)

In [59]:
# Read trajectories: parse each line into a list of integer state ids, drop sequences
# longer than 500 states, pad them with start/end markers (pad_traj) and key each
# trajectory by its zipWithIndex position -> (index, padded_trajectory).
list_trajRDD = msnbc_no_header.map(lambda line: [int(i) for i in line.split()]) \
                           .filter(lambda seq: len(seq) <= 500) \
                           .map(pad_traj) \
                           .zipWithIndex() \
                           .map(lambda t: (t[1], t[0]))


In [None]:
# Time the full MSNBC run: 4 EM iterations, 6 clusters, 6 partitions.
th = TimeHelper()
th.print_full_time_from_last()
markovsRDD, assignmentsRDD, persistedRDDs = em_train2(list_trajRDD, num_iterations=4,num_clusters=6, num_partitions=6)
# Pull the per-trajectory cluster assignments to the driver.
assignments = assignmentsRDD.collect()
# Regroup ((cluster, trans), p) as cluster -> iterable of (trans, p) and collect.
markovs_by_cls = markovsRDD.map(lambda t: (t[0][0],(t[0][1],t[1]))).groupByKey().collect()

# Release the RDDs em_train2 reported as still persisted.
for prdd in persistedRDDs:
    prdd.unpersist()
    
th.print_full_time_from_last()

In [51]:
# assignments3 = assignments
# markovs_by_cls3 = markovs_by_cls

#Random Data test

In [33]:
from pandas import DataFrame

In [34]:
def calc_next_step(trans_vec):
    """
    Sample the index of the next state from the categorical distribution `trans_vec`.

    A single multinomial draw yields a one-hot vector; its dot product with the
    positions 0..n-1 is the index of the drawn state.
    """
    one_hot = np.random.multinomial(1, trans_vec, size=1)[0]
    positions = np.arange(len(trans_vec))
    return np.dot(one_hot, positions)

def generate_test_trajs(trans_mtrx,num_trajs=100,traj_len=10):
    """Return a list of `num_trajs` random trajectories generated from `trans_mtrx` (see generate_test_traj)."""
    return [generate_test_traj(trans_mtrx, traj_len) for _traj_no in range(num_trajs)]
def generate_test_traj(trans_mtrx,traj_len=10):
    """
    Simulate one random walk of `traj_len` steps starting in state 0.

    Row `trans_mtrx[s]` is used as the distribution of the state following `s`.
    Returns a numpy array of traj_len + 1 visited states (the start state included).
    """
    visited = [0]
    for _step in range(traj_len):
        visited.append(calc_next_step(trans_mtrx[visited[-1]]))
    return np.array(visited)

In [35]:
# trans_mtrx = [[0.1,0.8,0.1],[0.4,0.2,0.4],[0.1,0.2,0.7]]
# Example 3-state transition matrix (each row sums to 1).
# NOTE(review): no visible cell reads this global -- candidate for removal, confirm.
trans_mtrx = [[0.8,0.1,0.1],[1,0,0],[0.9,0.1,0]]
def generate_tarjs_per_cluster(list_trans_mtrx,num_trajs=100,traj_len=20):
    """
    Generate `num_trajs` random trajectories for every transition matrix in
    `list_trans_mtrx` and label each one with the position of its matrix.

    Returns a DataFrame with the columns 'label' and 'traj'.
    """
    all_trajs = []
    all_labels = []
    for cluster_id, cluster_mtrx in enumerate(list_trans_mtrx):
        all_trajs.extend(generate_test_trajs(cluster_mtrx, num_trajs, traj_len))
        all_labels.extend([cluster_id] * num_trajs)

    return DataFrame(data={'label': all_labels, 'traj': all_trajs})

In [36]:
# Ground-truth 4-state transition matrices, one per synthetic cluster
# (row = current state, column = next state).
trans_mtrx0 = [[0.8,0.1,0.05,0.05],[0.95,0,0,0.05],[0.9,0.05,0,0.05],[0.9,0.05,0,0.05]]
trans_mtrx1 = [[0.1,0.8,0.05,0.05],[0,0.7,0.25,0.05],[0,0.75,0.1,0.15],[0,0.7,0.25,0.05]]
trans_mtrx2 = [[0.1,0.15,0.7,0.05],[0.05,0.1,0.8,0.05],[0.2,0.15,0.6,0.05],[0.1,0.15,0.7,0.05]]
list_trans_mtrx = [trans_mtrx0,trans_mtrx1,trans_mtrx2]
num_trajs=100
# 100 labelled trajectories of 40 steps per cluster, shuffled and re-indexed
df_traj = generate_tarjs_per_cluster(list_trans_mtrx,num_trajs=num_trajs,traj_len=40)
df_traj = df_traj.iloc[np.random.permutation(len(df_traj))]
df_traj = df_traj.reset_index(drop=True)
# hold out the first 20% of the shuffled rows
df_traj['is_train']=True
# .loc replaces the deprecated .ix indexer (boolean row mask + column label)
df_traj.loc[df_traj.index<int(len(df_traj)*0.2),'is_train']=False

In [37]:
num_clusters = sc.broadcast(3)
train_trajs = df_traj[df_traj.is_train==True].traj
# the held-out rows are the ones flagged is_train == False
# (the original selected is_train == True here as well, duplicating the training set)
test_trajs = df_traj[df_traj.is_train==False].traj

# train_walks = [traj_to_walk(traj) for traj in train_trajs]
# test_walks = [traj_to_walk(traj) for traj in test_trajs]

In [229]:
# NOTE(review): trajs_to_walks is not defined in any visible cell, so this cell fails
# with the NameError shown below -- restore the helper or delete the cell.
states = [0,1,2,3]
train_walks = trajs_to_walks(train_trajs,states)
test_walks = trajs_to_walks(test_trajs,states)

NameError: name 'trajs_to_walks' is not defined

In [38]:
# Ship the training trajectories to Spark in 3 partitions: convert every numpy
# trajectory to a plain list and key it by its zipWithIndex position -> (index, trajectory).
train_trajsRDD = sc.parallelize(train_trajs.tolist(),3) \
                        .map(lambda t: t.tolist()) \
                        .zipWithIndex() \
                        .map(lambda t: (t[1], t[0]))
                        
    
# train_trajsRDD.take(3)
# train_trajsRDD.collect()

In [43]:
# Train on the synthetic data: 10 EM iterations, 3 clusters, 7 partitions.
t = TimeHelper()
markovsRDD, assignmentsRDD, persistedRDDs = em_train2(train_trajsRDD,num_iterations=10,num_clusters=3,num_partitions=7)
# Pull the per-trajectory cluster assignments to the driver.
assignments = assignmentsRDD.collect()
# Regroup ((cluster, trans), p) as cluster -> iterable of (trans, p) and collect.
markovs_by_cls = markovsRDD.map(lambda t: (t[0][0],(t[0][1],t[1]))).groupByKey().collect()

# Release the RDDs em_train2 reported as still persisted.
for prdd in persistedRDDs:
    prdd.unpersist()

# Total wall-clock time of this cell (the per-iteration lines are printed by em_train2).
t.print_full_time_from_last()

240
time: 2015-07-29 11:31:44 0:00:01.759580
240
time: 2015-07-29 11:31:46 0:00:01.710593
240
time: 2015-07-29 11:31:48 0:00:01.703535
240
time: 2015-07-29 11:31:50 0:00:01.701546
240
time: 2015-07-29 11:31:51 0:00:01.717647
240
time: 2015-07-29 11:31:53 0:00:01.689329
240
time: 2015-07-29 11:31:55 0:00:01.713638
240
time: 2015-07-29 11:31:56 0:00:01.786585
240
time: 2015-07-29 11:31:58 0:00:01.818904
240
time: 2015-07-29 11:32:00 0:00:01.762461
time: 2015-07-29 11:32:01 0:00:18.321497


In [37]:
# Re-shape the collected (cluster, [(transition, prob), ...]) pairs into
# {cluster: {transition: prob}} for lookup and display.
markovs_dict = {
    cls_entry[0]: dict((tr_p[0], tr_p[1]) for tr_p in cls_entry[1])
    for cls_entry in markovs_by_cls
}
markovs_dict

{0: {(1, 2): 0.00033967391304347831, (0, 1): 0.10090928464871778, (3, 2): 0.00053078556263269649, (1, 3): 0.071671195652173919, (3, 3): 0.069532908704883239, (3, 0): 0.88163481953290879, (3, 1): 0.048301486199575375, (0, 3): 0.053313916317397167, (0, 2): 0.051893159053775659, (2, 0): 0.91422594142259428, (0, 0): 0.79388363998010936, (2, 3): 0.042538354253835432, (2, 1): 0.042538354253835432, (2, 2): 0.00069735006973500717, (1, 0): 0.92764945652173925, (1, 1): 0.00033967391304347831}, 1: {(0, 1): 0.84889434889434878, (1, 2): 0.23746689368142268, (3, 2): 0.25978473581213307, (0, 0): 0.074938574938574934, (3, 3): 0.049412915851272013, (3, 0): 0.00048923679060665359, (3, 1): 0.69031311154598818, (2, 1): 0.74509477884935149, (1, 1): 0.71467082860385933, (2, 0): 0.00016627868307283005, (1, 3): 0.047814982973893305, (2, 3): 0.15314266711007646, (2, 2): 0.10159627535749916, (1, 0): 4.729474082482029e-05, (0, 3): 0.050368550368550362, (0, 2): 0.025798525798525797}, 2: {(0, 1): 0.158683666437110

In [38]:
# Order the (index, cluster) pairs by trajectory index (in place), then keep only
# the cluster ids so they line up with the rows of the training frame.
assignments.sort(key=operator.itemgetter(0))
list_assignment = [pair[1] for pair in assignments]


In [39]:
# Training rows only (is_train is True).
df_train = df_traj[df_traj.is_train]

In [40]:
# Count the training trajectories whose learned cluster id equals the true label,
# assuming the identity mapping true label -> cluster id.
real_assign = range(3)
true_labels = np.array(df_traj[df_traj.is_train==True].label)
df = DataFrame({'real': true_labels, 'assign': list_assignment})
df['compare'] = df.apply(lambda row: real_assign[row['real']]==row['assign'], axis=1)
len(df[df.compare])

240

In [41]:
# all_combs = itertools.chain(*(itertools.combinations(filter_vars, i) for i in range(1,3)))
import itertools
import operator

# Cluster ids are arbitrary, so score every mapping true label -> cluster id
# and report the mapping with the most matching trajectories.
assigns = {}
for real_assign in itertools.permutations(range(3)):
    matches = df.apply(lambda row: real_assign[row['real']]==row['assign'], axis=1)
    df['compare'] = matches
    assigns[real_assign] = len(df[df.compare])

print(max(assigns.iteritems(), key=operator.itemgetter(1)))

((0, 1, 2), 240)


#Experiments

In [26]:
# Experiment: cost of six chained map -> partitionBy -> join rounds, with a count()
# (an action) in every round so each round is actually executed and timed.
# NOTE(review): `datetime.datetime.now()` needs `datetime` to be the module, while the
# import cell binds the class (`from datetime import datetime`); presumably %pylab
# rebinds the name -- confirm.
init = sc.parallelize(xrange(100000), 3)
init.cache()
gstart = datetime.datetime.now()

for i in range(6):
    print i
    start = datetime.datetime.now()
    
    # two keyed views of the same data, both hash-partitioned into 3 partitions
    init2 = init.map(lambda n: (n, n*3)).partitionBy(3)
#     init2.cache()
    
    init3 = init.map(lambda n: (n, n*2)).partitionBy(3)
#     init3.cache()
        
    init4 = init2.join(init3, 3)
#     init4.count()
#     init4.cache()
    
    # keep only the keys; the next round builds on this RDD
    init = init4.map(lambda n: n[0])
#     init.cache()
    
    print init.count()    
    print str(datetime.datetime.now() - start)
    
init.cache()
init.count()
# total elapsed time, including the final count
print str(datetime.datetime.now() - gstart)
    


0
100000
0:00:00.980719
1
100000
0:00:01.326909
2
100000
0:00:01.207190
3
100000
0:00:01.225242
4
100000
0:00:01.192086
5
100000
0:00:01.164828
0:00:07.475116


In [214]:
# Experiment: the loop only chains transformations and calls no action, so the timings
# printed per round cover just the construction of the lazy lineage; the single
# count() after the loop triggers the actual computation.
init = sc.parallelize(xrange(10000000), 3)
init.cache()

for i in range(6):
    print i
    start = datetime.datetime.now()
    
    init2 = init.map(lambda n: (n, n*3))        
    init = init2.map(lambda n: n[0])
    
#     init.cache()
    
#     print init.count()    
    print str(datetime.datetime.now() - start)
    
init.count()
    


0
0:00:00.000042
1
0:00:00.000019
2
0:00:00.000017
3
0:00:00.000016
4
0:00:00.000017
5
0:00:00.000016


10000000

In [60]:
# Experiment: same map chain as above, but every round caches its result and runs
# count(), so each round is materialized and timed individually.
init = sc.parallelize(xrange(10000000), 3)
init.cache()

for i in range(6):
    print i
    start = datetime.datetime.now()
    
    init2 = init.map(lambda n: (n, n*3))        
    init = init2.map(lambda n: n[0])    
    init.cache()
    
    print init.count()    
    print str(datetime.datetime.now() - start)
    

0
10000000
0:00:02.626615
1
10000000
0:00:02.361970
2
10000000
0:00:02.462773
3
10000000
0:00:03.307909
4
10000000
0:00:02.428035
5
10000000
0:00:02.227196


In [167]:
# arr = range(100)

#     c = sc.broadcast(1)
# Experiment: six rounds of map -> flatMap -> reduceByKey, each persisted and counted,
# timed with TimeHelper.
init = sc.parallelize(xrange(10000), 3)
t= TimeHelper()

for i in range(6):
    print i

    init2 = init.map(lambda n: n+1)
    init3 = init2.flatMap(lambda n: [(n,1)])
    
    init4 = init3.reduceByKey(lambda x,y: x+y)
    
    
#     print init5.take(3)
    # keep only the keys; the next round builds on this RDD
    init = init4.map(lambda n: n[0])        
#     print init4.take(3)
    # NOTE(review): persist(MEMORY_AND_DISK) followed by cache() requests two storage
    # levels for the same RDD -- presumably only one of the calls is wanted, confirm.
    init.persist(StorageLevel.MEMORY_AND_DISK)
    init.cache()
    
    
    print init.count()
    t.print_time_from_last()
    
    
init.count()
# t.print_time_from_last()
# init.unpersist()

0
10000
time: 0:00:00.299933
1
10000
time: 0:00:00.315004
2
10000
time: 0:00:00.248155
3
10000
time: 0:00:00.244719
4
10000
time: 0:00:00.252504
5
10000
time: 0:00:00.243843


10000