# DATASCI W261: Machine Learning at Scale

* **Sayantan Satpati**
* **sayantan.satpati@ischool.berkeley.edu**
* **W261**
* **Week-9**
* **Assignment-9**
* **Date of Submission: 27-OCT-2015**

#  === Week 9: Page Rank ===

### First BASE Version [Before HW9 was published]

In [21]:
%%writefile mrjob_hw90.py
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import RawProtocol
from mrjob.compat import get_jobconf_value
import sys
import ast

'''
Sample Input Data (node_id <TAB> score|adjacency_list):
1	0.2|[2,4]
2	0.2|[3,5]
3	0.2|[4]
4	0.2|[5]
5	0.2|[1,2,3]
'''

class PageRank(MRJob):
    """One iteration of a basic PageRank mass redistribution (no damping).

    Each input record has the form ``node_id <TAB> score|adjacency_list``,
    e.g. ``1<TAB>0.2|[2,4]``.  The job emits records in the same
    ``score|adjacency_list`` layout, so the output of one run can be fed
    back in as the input of the next.

    NOTE: this base version applies neither a damping factor nor any
    redistribution of mass held by dangling nodes (nodes with an empty
    adjacency list); that mass is simply dropped.
    """

    # Hand the mapper raw string key/value pairs instead of decoded JSON.
    INPUT_PROTOCOL = RawProtocol

    def steps(self):
        """Return the single map/reduce step that makes up this job."""
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]

    def mapper(self, key, value):
        """Emit the node's structure and spread its score over its neighbors.

        Args:
            key: node id as a string.
            value: ``score|adjacency_list``; may be wrapped in double quotes
                when it comes from a previous iteration's output.

        Yields:
            ``(node_id, ('NODE', value))`` once, so the reducer can recover
            the adjacency list, and ``(neighbor, ('SCORE', share))`` once per
            outgoing link, where ``share`` is ``score / out_degree``.
        """
        # Drop the quoting that a previous iteration's output may carry.
        value = value.strip().replace("\"", "")
        fields = value.split("|")

        score = fields[0]
        neighbors = ast.literal_eval(fields[1])

        # Emit the graph structure.
        yield int(key), ('NODE', value)

        # Emit the mass.  The guard keeps the division away from nodes with
        # no outgoing links, which contribute nothing.
        if neighbors:
            share = float(score) / len(neighbors)
            for neighbor in neighbors:
                yield neighbor, ('SCORE', share)

    def combiner(self, key, values):
        """Placeholder; not registered in ``steps()`` and therefore unused."""
        pass

    def reducer(self, key, values):
        """Sum the incoming mass for a node and re-attach its adjacency list.

        Args:
            key: node id.
            values: iterable of ``('NODE', 'score|adjacency_list')`` and
                ``('SCORE', mass)`` pairs emitted by the mapper.

        Yields:
            ``(node_id, 'new_score|adjacency_list')``.
        """
        # A node that only ever appears as somebody's neighbor has no NODE
        # record.  Default to an empty list so the emitted record can still
        # be parsed by the mapper on the next iteration (the text "None"
        # would evaluate to None there and fail when iterated).
        adj_list = '[]'
        total_score = 0

        for value_type, value in values:
            if value_type == 'NODE':
                adj_list = value.strip().split("|")[1]
            else:
                assert value_type == 'SCORE'
                total_score += value

        # NOTE: damping (1 - d + d * total_score) is intentionally not
        # applied in this base version.
        yield key, '{0}|{1}'.format(total_score, adj_list)

   
# Launch the job when this file is executed as a script
# (e.g. `python mrjob_hw90.py pr.txt`); importing the module does not run it.
if __name__ == '__main__':
    PageRank.run()

Overwriting mrjob_hw90.py


In [22]:
!chmod a+x mrjob_hw90.py

In [23]:
# Run Once
!python mrjob_hw90.py pr.txt

using configs in /Users/ssatpati/.mrjob.conf
creating tmp directory /var/folders/h5/1q71m1c54cn07f16c232pqgm38ynd8/T/mrjob_hw90.ssatpati.20151029.205324.659854

PLEASE NOTE: Starting in mrjob v0.5.0, protocols will be strict by default. It's recommended you run your job with --strict-protocols or set up mrjob.conf as described at https://pythonhosted.org/mrjob/whats-new.html#ready-for-strict-protocols

writing to /var/folders/h5/1q71m1c54cn07f16c232pqgm38ynd8/T/mrjob_hw90.ssatpati.20151029.205324.659854/step-0-mapper_part-00000
Counters from step 1:
  (no counters found)
writing to /var/folders/h5/1q71m1c54cn07f16c232pqgm38ynd8/T/mrjob_hw90.ssatpati.20151029.205324.659854/step-0-mapper-sorted
> sort /var/folders/h5/1q71m1c54cn07f16c232pqgm38ynd8/T/mrjob_hw90.ssatpati.20151029.205324.659854/step-0-mapper_part-00000
writing to /var/folders/h5/1q71m1c54cn07f16c232pqgm38ynd8/T/mrjob_hw90.ssatpati.20151029.205324.659854/step-0-reducer_part-00000
Counters from step 1:
  (no count

In [27]:
%reload_ext autoreload
%autoreload 2
from mrjob_hw90 import PageRank
import pprint

# Driver: run the PageRank job repeatedly, feeding each iteration's output
# back in as the next iteration's input.
input_file = 'pr.txt'
input_file_iter = input_file + '1'

# Fixed iteration budget; a convergence criterion is still TBD.
num_iterations = 5

cnt = 0

while cnt < num_iterations:
    print("\n\nIteration: " + str(cnt + 1) + ":")

    # The first pass reads the original graph; every later pass reads the
    # file written by the previous pass.
    current_input = input_file if cnt == 0 else input_file_iter

    # Use the option's full documented name rather than relying on the
    # option parser accepting an abbreviated prefix.
    mr_job = PageRank(args=[current_input,
                            '--no-strict-protocols'])

    with mr_job.make_runner() as runner:
        runner.run()

        # runner.run() has already returned, so the intermediate file is
        # only reopened for writing after the job was run against it.
        with open(input_file_iter, 'w') as f:
            for line in runner.stream_output():
                parsed_line = mr_job.parse_output_line(line)
                print(parsed_line)
                f.write(line)

    cnt += 1

    #print "# MR Counters:"
    #pprint.pprint(runner.counters()[0])



Iteration: 1:
(1, '0.0666666666667|[2,4]')
(2, '0.166666666667|[3,5]')
(3, '0.166666666667|[4]')
(4, '0.3|[5]')
(5, '0.3|[1,2,3]')


Iteration: 2:
(1, '0.1|[2,4]')
(2, '0.133333333333|[3,5]')
(3, '0.183333333333|[4]')
(4, '0.2|[5]')
(5, '0.383333333334|[1,2,3]')


Iteration: 3:
(1, '0.127777777778|[2,4]')
(2, '0.177777777778|[3,5]')
(3, '0.194444444444|[4]')
(4, '0.233333333333|[5]')
(5, '0.266666666667|[1,2,3]')


Iteration: 4:
(1, '0.088888888889|[2,4]')
(2, '0.152777777778|[3,5]')
(3, '0.177777777778|[4]')
(4, '0.258333333333|[5]')
(5, '0.322222222222|[1,2,3]')


Iteration: 5:
(1, '0.107407407407|[2,4]')
(2, '0.151851851852|[3,5]')
(3, '0.183796296296|[4]')
(4, '0.222222222222|[5]')
(5, '0.334722222222|[1,2,3]')
