### Count ratings

In [2]:
%%file ratings.py
from mrjob.job import MRJob

class MRRatingCounter(MRJob):
    """Count how many times each rating value occurs in the input."""

    def mapper(self, key, line):
        """Emit ``(rating, 1)`` for one tab-separated input line.

        The line must split into exactly four fields (user id, movie id,
        rating, timestamp); only the rating is used. The incoming ``key``
        is ignored.
        """
        _user, _movie, score, _stamp = line.split('\t')
        yield score, 1

    def reducer(self, rating, occurences):
        """Emit ``(rating, total)`` where total is the sum of the emitted ones."""
        tally = sum(occurences)
        yield rating, tally
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    MRRatingCounter.run()

Overwriting ratings.py


### Average number of friends by age

In [11]:
%%file friends_by_age.py

from mrjob.job import MRJob

class MRAverageFriend(MRJob):
    """Compute the average friend count for every age found in the input."""

    def mapper(self, key, line):
        """Emit ``(age, friend_count)`` for one comma-separated input line.

        The line must split into exactly four fields (id, name, age, friend
        count). The age is stripped of surrounding whitespace and stays a
        string; the friend count is converted to ``int``.
        """
        _uid, _name, age, friends = line.split(',')
        yield age.strip(), int(friends)

    def reducer(self, key, count_lst):
        """Emit ``(age, mean friend count)`` for all counts seen for that age."""
        how_many = 0
        running_sum = 0
        # Stream through the values once, tracking both the sum and the
        # number of values consumed so far.
        for how_many, friends in enumerate(count_lst, 1):
            running_sum += friends
        yield key, running_sum / how_many
    
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MRAverageFriend.run()

Overwriting friends_by_age.py


In [None]:
# %load avg_friend_by_age.txt
"18"	343.375
"19"	213.27272727272728
"20"	165.0
"21"	350.875
"22"	206.42857142857142
"23"	246.3
"24"	233.8
"25"	197.45454545454547
"26"	242.05882352941177
"27"	228.125
"28"	209.1
"29"	215.91666666666666
"30"	235.8181818181818
"31"	267.25
"32"	207.9090909090909
"33"	325.3333333333333
"34"	245.5
"35"	211.625
"36"	246.6
"37"	249.33333333333334
"38"	193.53333333333333
"39"	169.28571428571428
"40"	250.8235294117647
"41"	268.55555555555554
"42"	303.5
"43"	230.57142857142858
"44"	282.1666666666667
"45"	309.53846153846155
"46"	223.69230769230768
"47"	233.22222222222223
"48"	281.4
"49"	184.66666666666666
"50"	254.6
"51"	302.14285714285717
"52"	340.6363636363636
"53"	222.85714285714286
"54"	278.0769230769231
"55"	295.53846153846155
"56"	306.6666666666667
"57"	258.8333333333333
"58"	116.54545454545455
"59"	220.0
"60"	202.71428571428572
"61"	256.22222222222223
"62"	220.76923076923077
"63"	384.0
"64"	281.3333333333333
"65"	298.2
"66"	276.44444444444446
"67"	214.625
"68"	269.6
"69"	235.2


### Min temperature

In [19]:
%%file min_temp.py

from mrjob.job import MRJob

class MRMinTemp(MRJob):
    """Find the lowest ``TMIN`` reading reported by each station."""

    def mapper(self, key, line):
        """Emit ``(station, reading)`` for lines whose type field is ``TMIN``.

        The line must split on commas into exactly eight fields; the first
        four are station, date, record type and value, the rest are unused.
        """
        station, _date, kind, reading, _, _, _, _ = line.split(',')
        if kind != 'TMIN':
            # Any other record type contributes nothing.
            return
        yield station, float(reading)

    def reducer(self, key, temp_lst):
        """Emit ``(station, smallest reading)``."""
        lowest = min(temp_lst)
        yield key, lowest
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MRMinTemp.run()

Overwriting min_temp.py


In [None]:
# %load min_temp.txt
"EZE00100082"	-135.0
"ITE00100554"	-148.0


### Word count

In [29]:
%%file word_count.py

from mrjob.job import MRJob
import re

class MRWordCount(MRJob):
    """Count occurrences of each word, ignoring case and non-letters."""

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every run of ASCII letters in the line."""
        # Everything that is not an ASCII letter becomes a separator.
        letters_only = re.sub('[^A-Za-z]', ' ', line)
        for token in letters_only.lower().split():
            yield token, 1

    def reducer(self, word, count):
        """Emit ``(word, total occurrences)``."""
        total = sum(count)
        yield word, total
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MRWordCount.run()

Overwriting word_count.py


### Chained MR

In [32]:
%%file word_count_chained.py

from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class MRWordCount(MRJob):
    """Two-step job: count words, then re-key the results by their count."""

    def steps(self):
        """Return the two steps: word counting, then flipping to count keys."""
        counting = MRStep(mapper=self.mapper_get_words, reducer=self.reducer_count_words)
        by_count = MRStep(mapper=self.mapper_flip_key_val, reducer=self.reducer_final)
        return [counting, by_count]

    def mapper_get_words(self, _, line):
        """Emit ``(word, 1)`` for every run of ASCII letters in the line."""
        # Everything that is not an ASCII letter becomes a separator.
        letters_only = re.sub('[^A-Za-z]', ' ', line)
        for token in letters_only.lower().split():
            yield token, 1

    def reducer_count_words(self, word, count):
        """Emit ``(word, total occurrences)``."""
        total = sum(count)
        yield word, total

    def mapper_flip_key_val(self, word, count):
        """Emit ``(zero-padded count, word)`` so the count becomes the key.

        The count is rendered as a string at least four digits wide,
        presumably so that string keys order like numbers (up to 9999).
        """
        padded = '%04d' % int(count)
        yield padded, word

    def reducer_final(self, count, word):
        """Emit one ``(count, word)`` pair per word that shares this count."""
        for single_word in word:
            yield count, single_word
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MRWordCount.run()

Overwriting word_count_chained.py


### Total money spent by customer

In [36]:
%%file total_spend.py

from mrjob.job import MRJob
from mrjob.step import MRStep


class MRTotalSpent(MRJob):
    """Two-step job: total the amount spent per customer, then key by total."""

    def steps(self):
        """Return the two steps: per-customer totals, then flip to total keys."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(mapper=self.mapper_2, reducer=self.reducer_2),
        ]

    def mapper(self, _, line):
        """Emit ``(customer, amount)`` for one comma-separated input line.

        The line must split into exactly three fields (customer, item,
        amount); the amount is converted to ``float``.
        """
        customer, _item, amount = line.split(',')
        yield customer, float(amount)

    def reducer(self, customer, amount):
        """Emit ``(customer, total spent)``."""
        yield customer, sum(amount)

    def mapper_2(self, customer, total):
        """Emit ``(formatted total, customer)`` so the total becomes the key.

        The total is rendered with two decimals and zero-padded to a fixed
        width of 10 characters. The key is a string, so without real padding
        totals of different magnitude compare lexicographically
        ('1000.00' < '999.50'). A width of 4 pads nothing, because the width
        also counts the decimal point and both decimals. Width 10 keeps
        string order equal to numeric order for non-negative totals below
        10,000,000.
        """
        yield '%010.2f' % total, customer

    def reducer_2(self, total, customer):
        """Emit one ``(total, customer)`` pair per customer with this total."""
        for name in customer:
            yield total, name
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MRTotalSpent.run()

Overwriting total_spend.py


### Combiner
* Saves network overhead.
* The combiner performs a partial reduction on each mapper's output before it is sent over the network to the reducer.


In [38]:
%%file word_count_with_combiner.py

from mrjob.job import MRJob
import re

class MRWordCountCombiner(MRJob):
    """Word count with a combiner that pre-sums counts before the reducer."""

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every run of ASCII letters in the line."""
        # Everything that is not an ASCII letter becomes a separator.
        letters_only = re.sub('[^A-Za-z]', ' ', line)
        for token in letters_only.lower().split():
            yield token, 1

    def combiner(self, word, count):
        """Emit ``(word, partial sum)``; same summation as the reducer."""
        partial = sum(count)
        yield word, partial

    def reducer(self, word, count):
        """Emit ``(word, total occurrences)``."""
        total = sum(count)
        yield word, total
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MRWordCountCombiner.run()

Overwriting word_count_with_combiner.py


### Most Rated movie

In [51]:
%%file most_rated_movie.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class MostRatedMovie(MRJob):
    """Two-step job that finds the movie id with the most ratings."""

    def steps(self):
        """Return the two steps: count ratings per movie, then pick the max."""
        counting = MRStep(mapper=self.mapper_get_ratings, reducer=self.reducer_movie_review_count)
        pick_max = MRStep(reducer=self.reducer_find_max)
        return [counting, pick_max]

    def mapper_get_ratings(self, _, line):
        """Emit ``(movie id, 1)`` for one tab-separated input line.

        The line must split into exactly four fields (user id, movie id,
        rating, timestamp); only the movie id is used.
        """
        _uid, movie_id, _rating, _time = line.split('\t')
        yield movie_id, 1

    def reducer_movie_review_count(self, movie, count):
        """Emit ``(None, (rating count, movie))``.

        Every pair shares the key ``None`` so that a single reducer call in
        the next step sees all of them.
        """
        n_ratings = sum(count)
        yield None, (n_ratings, movie)

    def reducer_find_max(self, _, values):
        """Emit the largest ``(rating count, movie)`` pair.

        The pair itself is yielded, so the count ends up as the output key
        and the movie as the output value.
        """
        winner = max(values)
        yield winner
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MostRatedMovie.run()

Overwriting most_rated_movie.py


In [None]:
# %load most_rated_movie.txt
583	"50"

### Additional options

In [55]:
%%file most_rated_movie_name.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class MostRatedMovieName(MRJob):
    """Two-step job that finds the most-rated movie and reports its title."""

    def configure_options(self):
        """Register ``--items``, the path to the movie lookup file (u.item)."""
        super(MostRatedMovieName, self).configure_options()
        # A file option provides an extra file which will go to every node.
        self.add_file_option('--items', help='path to u.item')

    def steps(self):
        """Return the two steps: count ratings per movie, then pick the max."""
        return [
            MRStep(mapper=self.mapper_get_ratings,
                   reducer_init=self.reducer_init,
                   reducer=self.reducer_movie_review_count),
            MRStep(reducer=self.reducer_find_max),
        ]

    # reducer_init runs before the reducer.
    def reducer_init(self):
        """Load the movie id -> title table from the lookup file.

        Each line is split on ``|``; the first field is the movie id (kept
        as a string) and the second is the title.
        """
        # Honour the file passed with --items. Fall back to the previously
        # hard-coded name so runs that never passed the option still behave
        # as before.
        items_path = self.options.items or 'u.Item'
        self.movieName = {}
        with open(items_path) as f:
            for line in f:
                fields = line.split('|')
                self.movieName[fields[0]] = fields[1]

    def mapper_get_ratings(self, _, line):
        """Emit ``(movie id, 1)`` for one tab-separated input line.

        The line must split into exactly four fields (user id, movie id,
        rating, timestamp); only the movie id is used.
        """
        uid, mid, rating, time = line.split('\t')
        yield mid, 1

    def reducer_movie_review_count(self, movie, count):
        """Emit ``(None, (rating count, movie title))``.

        Every pair shares the key ``None`` so that a single reducer call in
        the next step sees all of them. Raises ``KeyError`` if the movie id
        is missing from the lookup table.
        """
        yield None, (sum(count), self.movieName[movie])

    def reducer_find_max(self, _, values):
        """Emit the largest ``(rating count, movie title)`` pair.

        The pair itself is yielded, so the count ends up as the output key
        and the title as the output value.
        """
        yield max(values)
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    MostRatedMovieName.run()

Overwriting most_rated_movie_name.py


In [None]:
# %load most_rated_movie_name.txt
583	"Star Wars (1977)"


### SuperHero Social Network

In [14]:
%%file popular_superhero.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class PopularSuperHero(MRJob):
    """Two-step job that finds the hero with the largest total connection count."""

    def configure_options(self):
        """Register ``--name``, the path to the hero-name lookup file."""
        super(PopularSuperHero, self).configure_options()
        self.add_file_option('--name', help='path to Marvel-names.txt')

    def steps(self):
        """Return the two steps: total per hero, then name lookup and max."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(mapper_init=self.mapper_lookup,
                   mapper=self.mapper_sort,
                   reducer=self.reducer_max),
        ]

    def mapper(self, _, line):
        """Emit ``(hero id, number of further tokens on the line)``.

        The line is split on whitespace; the first token is the hero id
        (converted to ``int``) and the remaining tokens are only counted.
        """
        data = line.split()
        yield int(data[0]), len(data) - 1

    def reducer(self, hero_id, total):
        """Emit ``(hero id, sum of the per-line counts)``.

        Summing is needed because the same hero id can lead several lines.
        """
        yield hero_id, sum(total)

    def mapper_lookup(self):
        """Load the hero id -> name table from the lookup file.

        Each line is split on double quotes; the text before the first quote
        is the integer id and the text inside the quotes is the name.
        """
        # Honour the file passed with --name. Fall back to the previously
        # hard-coded name so runs that never passed the option still behave
        # as before.
        names_path = self.options.name or 'Marvel-names.txt'
        self.lookup = {}
        with open(names_path) as f:
            for data in f:
                fields = data.split('"')
                heroId = int(fields[0])
                self.lookup[heroId] = fields[1]

    def mapper_sort(self, hero_id, total):
        """Emit ``(None, (total, hero name))``.

        Every pair shares the key ``None`` so that a single reducer call
        sees all of them. Raises ``KeyError`` if the hero id is missing from
        the lookup table.
        """
        hero_name = self.lookup[hero_id]
        yield None, (total, hero_name)

    def reducer_max(self, _, hero_lst):
        """Emit the largest ``(total, hero name)`` pair.

        The pair itself is yielded, so the total ends up as the output key
        and the name as the output value.
        """
        yield max(hero_lst)
        
# Entry point: start the job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    PopularSuperHero.run()

Overwriting popular_superhero.py


### Degrees of separation