In [None]:
#==========================================================================
# Hello. In this notebook, I will be providing Map Reduce 
# examples in Python, which will work as Hadoop Streaming jobs.
# @author: Souradeep Sinha
#==========================================================================

In [None]:
#==========================================================================
# This example is from Tanmay Deshpande's list of sample Map 
# Reduce problems, which were originally written in Java
# at http://hadooptutorials.co.in/tutorials/mapreduce/advanced-map-reduce-examples-1.html#
# 
# Problem Statement:
# XYZ.com is an online music website where users listen to 
# various tracks; the data is collected in the format shown below. 
# Write a MapReduce program to get the following stats:
# 
# Number of unique listeners
# Number of times the track was shared with others
# Number of times the track was listened to on the radio
# Number of times the track was listened to in total
# Number of times the track was skipped on the radio
# The data arrives in log files and looks as shown below.
# 
# UserId|TrackId|Shared|Radio|Skip
# 111115|222|0|1|0
# 111113|225|1|0|0
# 111117|223|0|1|1
# 111115|225|1|0|0
#==========================================================================

In [None]:
# Problem 1, Mapper Function. File saved as mapper.py

from sys import stdin

def mapper(stream=None):
    """Emit one ``TrackId<TAB>UserId`` pair for every well-formed log line.

    Each input line is split on ``|``; only lines that yield exactly five
    fields (``UserId|TrackId|Shared|Radio|Skip``) are used, and all other
    lines are skipped silently. The pair is written to standard output with
    the track id first so that it becomes the shuffle/sort key.

    Args:
        stream: Iterable of text lines to read. When omitted (or ``None``)
            the function reads from ``sys.stdin``, which is how a Hadoop
            Streaming job feeds its mapper.
    """
    if stream is None:
        stream = stdin
    for line in stream:
        fields = line.split("|")
        # NOTE(review): a five-column header row would pass this check and be
        # emitted as if it were data -- confirm the logs carry no header.
        if len(fields) != 5:
            continue
        uid, tid = fields[:2]
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print("{0}\t{1}".format(tid, uid))

In [None]:
# Problem 1, Reduce Function. File saved as reducer.py

from sys import stdin

def reducer(stream=None):
    """Print the number of distinct values seen for each key.

    Input lines must be ``key<TAB>value`` and must arrive grouped by key
    (Hadoop's shuffle/sort, or a local ``sort``, provides this). For every
    run of identical keys one line ``key<TAB>distinct_count`` is written to
    standard output. Lines that do not split into exactly two tab-separated
    fields are ignored. Nothing is printed when no valid line is read.

    Args:
        stream: Iterable of text lines to read. When omitted (or ``None``)
            the function reads from ``sys.stdin``.
    """
    if stream is None:
        stream = stdin
    current_key = None
    listeners = set()
    for line in stream:
        # Drop the line terminator so that a final line without a trailing
        # newline does not yield a value that differs from earlier ones.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) != 2:
            continue
        key, value = fields
        # Compare against None rather than truthiness so an empty-string key
        # still forms its own group.
        if current_key is not None and key != current_key:
            print("{0}\t{1}".format(current_key, len(listeners)))
            listeners = set()
        current_key = key
        listeners.add(value)
    # Flush the last group; skipped entirely when the input had no valid rows.
    if current_key is not None:
        print("{0}\t{1}".format(current_key, len(listeners)))

In [1]:
# The mapper and the reducer can be passed to the Hadoop 
# framework along with the data file, output folder (in  
# HDFS) and the mapper-reducer file combinations. Please
# ensure that the output folder does not already exist.

# For a sanity check, the mapper and reducer can be checked
# on the local machine terminal, using the first 1000 lines
# of the log file (log.dat) and piping the results into 
# mapper and reducer functions as follows:

In [None]:
# $> head -n 1000 log.dat | python mapper.py \
#    | sort | python reducer.py