# DATASCI W261: Machine Learning at Scale

## Problem: Find the pages that have more than 400 visits

Complete the code in the MRJob class file.

In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
%%writefile top_pages.py
"""Find Vroots with more than 400 visits.

This program will take a CSV data file and output tab-separated lines of

    Vroot -> number of visits

To run:

    python top_pages.py anonymous-msweb.data

To store output:

    python top_pages.py anonymous-msweb.data > top_pages.out
"""
from mrjob.job import MRJob
import csv

def csv_readline(line):
    """Parse one CSV-formatted string and return its fields as a list of strings."""
    parsed_rows = csv.reader([line])
    # Exactly one line is fed to the reader, so its first row is the result.
    # The None default mirrors falling off the end when no row is produced.
    return next(parsed_rows, None)

class TopPages(MRJob):
    """MapReduce job that reports every Vroot visited more than MIN_VISITS times."""

    # A Vroot is emitted only when its total visit count is strictly greater
    # than this value. Subclasses may override it to change the cut-off.
    MIN_VISITS = 400

    def mapper(self, line_no, line):
        """Emit ``(vroot, 1)`` for every visit record.

        A row whose first field is ``'V'`` is treated as a visit and its
        second field is used as the Vroot id. ``line_no`` is unused.
        """
        cell = csv_readline(line)
        # A blank line parses to an empty row and a truncated record has no
        # Vroot field; skip both instead of failing the job with IndexError.
        if not cell or len(cell) < 2:
            return
        if cell[0] == 'V':
            yield cell[1], 1

    def combiner(self, vroot, visit_counts):
        """Pre-aggregate the counts for one Vroot before the shuffle.

        Summation is associative, so the reducer's totals are the same no
        matter how many times (including zero) the combiner is applied.
        """
        yield vroot, sum(visit_counts)

    def reducer(self, vroot, visit_counts):
        """Summarize the visit counts by adding them together.

        Yields ``(vroot, total)`` only when the total is strictly greater
        than ``MIN_VISITS``.
        """
        total = sum(visit_counts)
        if total > self.MIN_VISITS:
            yield vroot, total
        
# Launch the job only when this file is executed as a script; importing the
# module (as the notebook driver does) must not start a run.
if __name__ == '__main__':
    TopPages.run()

Overwriting top_pages.py


Driver code is ready

In [28]:
from top_pages import TopPages
import csv

# Run the TopPages job from the notebook over the data file and echo every
# (vroot, total) record it produces, followed by the record count.
# NOTE(review): stream_output() / parse_output_line() appear to be deprecated
# in newer mrjob releases in favour of cat_output() / parse_output() --
# confirm against the installed mrjob version before upgrading.
mr_job = TopPages(args=['anonymous-msweb.data'])
with mr_job.make_runner() as runner:
    runner.run()
    n = 0  # number of output records printed so far
    for line in runner.stream_output():
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(mr_job.parse_output_line(line))
        n += 1

    print("")
    # %-formatting keeps the text identical to the old `print "total lines:", n`.
    print("total lines: %d" % n)



('1000', 912)
('1001', 4451)
('1002', 749)
('1003', 2968)
('1004', 8463)
('1007', 865)
('1008', 10836)
('1009', 4628)
('1010', 698)
('1014', 728)
('1017', 5108)
('1018', 5330)
('1020', 1087)
('1024', 521)
('1025', 2123)
('1026', 3220)
('1027', 507)
('1030', 1115)
('1031', 574)
('1032', 1446)
('1034', 9383)
('1035', 1791)
('1036', 759)
('1037', 1160)
('1038', 1110)
('1040', 1506)
('1041', 1500)
('1045', 474)
('1046', 636)
('1052', 842)
('1053', 670)
('1058', 672)
('1067', 548)
('1070', 602)
('1074', 584)
('1076', 444)
('1078', 462)
('1295', 716)

total lines: 38
