<a href="https://colab.research.google.com/github/smduarte/spbd-2324/blob/main/lab3/SPBD_Labs_mapreduce2_exercise_solution2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MrJob MapReduce Python Example

Word count implemented in pure Python, using the library MrJob.

[MrJob](https://mrjob.readthedocs.io/en/latest/) can be used to write MapReduce jobs and run them on several platforms.

Some key advantages:
+ Write **multi-step** MapReduce jobs in pure Python;
+ Test on your **local machine**;
+ Deploy jobs on cloud platforms from several vendors.

In [None]:
#@title Download the dataset and install MrJob
# Download the input text for the word-frequency exercise as os_maias.txt.
!wget -q -O os_maias.txt https://www.dropbox.com/s/n24v0z7y79np319/os_maias.txt?dl=0
# Install the mrjob package that the job scripts in this notebook import.
!pip install mrjob --quiet
# Download the course's mrjob.conf and store it at /etc/mrjob.conf.
!wget -q -O /etc/mrjob.conf https://raw.githubusercontent.com/smduarte/spbd-2324/main/lab2/mrjob.conf

## 1. MrJob MapReduce Word Frequency

In [None]:
%%file desc_word_freq.py

import string
from mrjob.job import MRJob, MRStep

# Upper bound used to turn a frequency into its complement (MAX_FREQ - freq).
MAX_FREQ = 100000


class MRWordCountFrequency(MRJob):
    """Three-step job: count words, then list them by descending frequency.

    Step 1 counts each word, step 2 re-keys the counts by a zero-padded
    frequency complement, and step 3 gathers every (word, count) pair under
    a single key and sorts them by count, highest first.
    """

    def steps(self):
        """Return the three MRStep stages in execution order."""
        count_step = MRStep(mapper=self.mapper_words,
                            reducer=self.reducer_words)
        partition_step = MRStep(mapper=self.mapper_partition_sort,
                                reducer=self.reducer_partition_sort)
        total_step = MRStep(mapper=self.mapper_total_sort,
                            reducer=self.reducer_total_sort)
        return [count_step, partition_step, total_step]

    def mapper_words(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated token in *line*.

        ASCII punctuation and the guillemets « » are deleted beforehand.
        """
        unwanted = str.maketrans('', '', string.punctuation+'«»')
        cleaned = line.strip().translate(unwanted)
        for token in cleaned.split():
            yield token, 1

    def reducer_words(self, key, values):
        """Emit the word together with the sum of its partial counts."""
        total = sum(values)
        yield key, total

    def mapper_partition_sort(self, word, freq):
        """Re-key a word by the zero-padded complement of its frequency.

        Padding to five digits makes an ascending string ordering of the
        keys correspond to a descending ordering of the frequencies.
        """
        complement = MAX_FREQ - freq
        yield '%05d' % complement, word

    def reducer_partition_sort(self, freq, words):
        """Undo the complement and emit ``(word, frequency)`` again."""
        for entry in words:
            yield entry, MAX_FREQ - int(freq)

    def mapper_total_sort(self, word, freq):
        """Send every pair to the same key (None) so one reducer sees all."""
        pair = (word, freq)
        yield None, pair

    def reducer_total_sort(self, _, values):
        """Emit all pairs ordered by frequency, highest first."""
        ranked = list(values)
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        for word, freq in ranked:
            yield word, freq


if __name__ == '__main__':
    MRWordCountFrequency.run()

In [None]:
# Remove the output of any previous run.
!rm -rf results
# Run the word-frequency job on os_maias.txt, writing its output under results/.
!python -m desc_word_freq  --output-dir results --cleanup NONE os_maias.txt
# Show the first lines of each output file.
!head results/*

## 2. Weblog DDOS Attack Analysis

In [None]:
# Download the log file analysed in this section and save it as web.log.
!wget -q -O web.log https://www.dropbox.com/s/0r8902uj9yum7dg/web.log?dl=0
# Print the line, word, and byte counts of the downloaded log.
!wc web.log

### 1. Count the number of unique IP addresses involved in the DDOS attack.



In [None]:
%%file unique_ips.py

from mrjob.job import MRJob, MRStep

class MRUniqueIPs(MRJob):
    """Count the distinct source IPs that appear in the log.

    Step 1 groups log lines by source IP and emits a single ``1`` per
    distinct IP; step 2 adds those ones up under the label "UNIQUE IPs".
    """

    def steps(self):
        """Return the two MRStep stages in execution order."""
        return [
            MRStep(mapper=self.mapper_ip,
                   combiner=self.combiner_ip,
                   reducer=self.reducer_ip),
            MRStep(reducer=self.reducer_filter),
        ]

    def mapper_ip(self, _, line):
        """Emit ``(ip, None)``; the IP is the second space-separated field.

        Lines that do not split into the expected three parts (blank or
        truncated lines) are skipped instead of aborting the job, matching
        how MRInvertedIndex treats malformed lines.
        """
        parts = line.strip().split(' ', 2)
        if len(parts) < 3:
            return
        yield parts[1], None

    def combiner_ip(self, ip_source, _):
        """Collapse repeated occurrences of an IP before the shuffle."""
        yield ip_source, None

    def reducer_ip(self, ip_source, _):
        """Emit one ``1`` per distinct IP, all under the same key."""
        yield None, 1

    def reducer_filter(self, _, values):
        """Sum the per-IP ones into the final distinct-IP count."""
        yield "UNIQUE IPs", sum(values)


if __name__ == '__main__':
    MRUniqueIPs.run()

In [None]:
# Remove the output of any previous run.
!rm -rf results
# Run the unique-IP job on web.log, writing its output under results/.
!python -m unique_ips  --output-dir results --cleanup NONE web.log
# Show the first lines of each output file.
!head results/*

### 2. For each interval of 10 seconds, provide the following information: [number of requests, average execution time, maximum time, minimum time]





In [None]:
%%file interval_stats.py

from statistics import *
from mrjob.job import MRJob, MRStep

class MRIntervalStats(MRJob):
    """Per-interval request statistics computed entirely in the reducer.

    For every 10-second interval the job emits
    ``(request count, min time, mean time, max time)``.
    """

    def mapper(self, _, line):
        """Emit ``(interval, execution_time)`` for one log line.

        The timestamp is field 0 and the execution time is field 5 of the
        space-separated line. Lines with fewer than six fields (e.g. blank
        lines) are skipped, consistent with MRInvertedIndex.
        """
        fields = line.strip().split(' ')
        if len(fields) < 6:
            return
        timestamp = fields[0]
        execution_time = float(fields[5])
        # Keep the first 18 characters: YYYY-MM-DDTHH:MM:S -> 10s intervals
        interval = timestamp[0:18]
        yield interval, execution_time

    def reducer(self, interval, values):
        """Emit count, minimum, mean and maximum of the interval's times."""
        # Materialised because the values are traversed several times.
        times = list(values)
        yield interval, (len(times), min(times), mean(times), max(times))


if __name__ == '__main__':
    MRIntervalStats.run()

In [None]:
%%shell
# Clear previous output, run the interval-statistics job on web.log, and
# preview the output files only if the job succeeds.
rm -rf results
python -m interval_stats --output-dir results --cleanup NONE web.log && head results/*

In [None]:
%%file interval_stats2.py

from statistics import *
from mrjob.job import MRJob, MRStep

class MRIntervalStats2(MRJob):
    """Per-interval request statistics using a combiner.

    Every record travels as a partial aggregate
    ``(sum of times, min time, max time, request count)`` so the combiner
    and the reducer can merge partials with the same logic. The final
    output per interval is ``(count, min, mean, max)``.
    """

    def mapper(self, _, line):
        """Emit ``(interval, partial)`` for one log line.

        The timestamp is field 0 and the execution time is field 5 of the
        space-separated line. Lines with fewer than six fields (e.g. blank
        lines) are skipped, consistent with MRInvertedIndex.
        """
        fields = line.strip().split(' ')
        if len(fields) < 6:
            return
        timestamp = fields[0]
        execution_time = float(fields[5])
        # Keep the first 18 characters: YYYY-MM-DDTHH:MM:S -> 10s intervals
        interval = timestamp[0:18]
        yield interval, (execution_time, execution_time, execution_time, 1)

    @staticmethod
    def _merge_partials(values):
        """Merge partial aggregates into one ``(sum, min, max, count)``."""
        # Materialised because each component is reduced in its own pass.
        partials = list(values)
        return (
            sum(p[0] for p in partials),
            min(p[1] for p in partials),
            max(p[2] for p in partials),
            sum(p[3] for p in partials),
        )

    def combiner(self, interval, values):
        """Pre-aggregate the partials for an interval."""
        yield interval, self._merge_partials(values)

    def reducer(self, interval, values):
        """Merge all partials and emit ``(count, min, mean, max)``."""
        total, lowest, highest, count = self._merge_partials(values)
        yield interval, (count, lowest, total / count, highest)


if __name__ == '__main__':
    MRIntervalStats2.run()

In [None]:
%%shell
# Clear previous output, run the combiner-based interval-statistics job on
# web.log, and preview the output files only if the job succeeds.
rm -rf results
python -m interval_stats2 --output-dir results --cleanup NONE web.log && head results/*

In [None]:
%%file interval_stats2.py

from statistics import *
from mrjob.job import MRJob, MRStep

class MRIntervalStats2(MRJob):
    """Per-interval request statistics using a combiner, in streaming form.

    Every record travels as a partial aggregate
    ``(sum of times, min time, max time, request count)``. Partials are
    merged in a single pass over the incoming values, without building a
    list. The final output per interval is ``(count, min, mean, max)``.
    """

    def mapper(self, _, line):
        """Emit ``(interval, partial)`` for one log line.

        The timestamp is field 0 and the execution time is field 5 of the
        space-separated line. Lines with fewer than six fields (e.g. blank
        lines) are skipped, consistent with MRInvertedIndex.
        """
        fields = line.strip().split(' ')
        if len(fields) < 6:
            return
        timestamp = fields[0]
        execution_time = float(fields[5])
        # Keep the first 18 characters: YYYY-MM-DDTHH:MM:S -> 10s intervals
        interval = timestamp[0:18]
        yield interval, (execution_time, execution_time, execution_time, 1)

    @staticmethod
    def _fold_partials(values):
        """Merge partial aggregates in one pass.

        ``values`` is a one-shot iterator: traversing it once per statistic
        (as four separate comprehensions would) leaves every traversal after
        the first empty, so all four components are updated together here.

        Returns ``(sum, min, max, count)``, or ``None`` if *values* is empty.
        """
        total = 0
        count = 0
        lowest = highest = None
        for part_sum, part_min, part_max, part_count in values:
            total += part_sum
            count += part_count
            lowest = part_min if lowest is None else min(lowest, part_min)
            highest = part_max if highest is None else max(highest, part_max)
        if lowest is None:
            return None
        return total, lowest, highest, count

    def combiner(self, interval, values):
        """Pre-aggregate the partials for an interval."""
        merged = self._fold_partials(values)
        if merged is not None:
            yield interval, merged

    def reducer(self, interval, values):
        """Merge all partials and emit ``(count, min, mean, max)``."""
        merged = self._fold_partials(values)
        if merged is None:
            return
        total, lowest, highest, count = merged
        yield interval, (count, lowest, total / count, highest)


if __name__ == '__main__':
    MRIntervalStats2.run()

### 3. Create an inverted index that, for each 10-second interval, lists the (unique) IPs that accessed each URL.

In [None]:
%%file inverted_index.py

from mrjob.job import MRJob, MRStep

class MRInvertedIndex(MRJob):
    """Inverted index from (10-second interval, URL) to the accessing IPs.

    Each output key has the form ``"<interval>-<url>"`` and its value is the
    sorted list of distinct source IPs seen for that key.
    """

    def mapper(self, _, line):
        """Emit ``("<interval>-<url>", source_ip)`` for one log line.

        The timestamp, source IP and target URL are fields 0, 1 and 4 of the
        space-separated line. Lines with fewer than six fields are skipped.
        """
        vals = line.strip().split(' ')
        if len(vals) < 6:
            return
        timestamp = vals[0]
        # Keep the first 18 characters: YYYY-MM-DDTHH:MM:S -> 10s intervals
        interval = timestamp[0:18]

        source_ip = vals[1]
        target_url = vals[4]
        yield "{}-{}".format(interval, target_url), source_ip

    def reducer(self, key, values):
        """Emit the key with its distinct IPs.

        The set removes repeated IPs (the index must list each IP once);
        sorting makes the output order deterministic.
        """
        yield key, sorted(set(values))


if __name__ == '__main__':
    MRInvertedIndex.run()

In [None]:
%%shell
# Clear previous output, run the inverted-index job on web.log, and preview
# the output files only if the job succeeds.
rm -rf results
python -m inverted_index --output-dir results web.log && head results/*