### Question 1 - Daily and monthly statistics

  For each sensor, compute the minimum, average and maximum values of the two sensor metrics. Produce results for each day.

 **Requirement**: Solve this question using MapReduce (MrJob) and Spark Core.

In [14]:
#@title Mount Google Drive
from google.colab import drive
drive.mount('/content/drive') # Mount Google Drive at /content/drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
#@title Install MRJob and load dataset
!pip install mrjob --quiet # Install MRJob
!wget -q -O /etc/mrjob.conf https://raw.githubusercontent.com/smduarte/spbd-2223/main/lab2/mrjob.conf # Download the MRJob configuration file to /etc/mrjob.conf
# Build the working dataset: the first 10000 lines of each of the three daily
# CSV files, concatenated into a single file named `files`.
# NOTE(review): `files` is a relative path, while the later cells read
# '/content/files' - this relies on the working directory being /content.
!head -10000 /content/drive/MyDrive/projeto_spbd/sds011-2020-01-01.csv > files
!head -10000 /content/drive/MyDrive/projeto_spbd/sds011-2020-01-02.csv >> files
!head -10000 /content/drive/MyDrive/projeto_spbd/sds011-2020-01-03.csv >> files

In [16]:
#@title Resolution using MRJob
%%file min_max_mean.py

from statistics import *
from mrjob.job import MRJob, MRStep

class MRMinMaxMean(MRJob):
  """MRJob job: per-sensor, per-day maximum, minimum and mean of P1 and P2.

  Each input line is expected to hold seven ';'-separated fields, unpacked
  as (sensor, sistema, latit, longit, period, p1, p2). The first 10
  characters of ``period`` are used as the day.

  Output records map ``(sensor, day)`` to
  ``(max P1, min P1, mean P1, max P2, min P2, mean P2)``.
  """

  def mapper(self, _, line):
    """Emit ``((sensor, day), (p1, p2))`` for every well-formed input line.

    Lines that do not have exactly seven fields (blank lines, truncated
    rows) or whose two metrics are not numeric (e.g. a header row) are
    skipped and counted instead of aborting the whole job.
    """
    fields = line.split(';')  # Split the CSV line into its 7 fields
    if len(fields) != 7:
      self.increment_counter('MRMinMaxMean', 'malformed lines', 1)
      return
    sensor, _sistema, _latit, _longit, period, p1, p2 = fields
    try:
      valores = (float(p1), float(p2))  # Both metrics as floats, used as the value
    except ValueError:
      self.increment_counter('MRMinMaxMean', 'malformed lines', 1)
      return
    dia = period[0:10]  # Day = first 10 characters of the period field
    yield (sensor, dia), valores

  def reducer(self, k, valores):
    """Emit the statistics of all (p1, p2) pairs sharing the key ``k``.

    Yields ``k, (max P1, min P1, mean P1, max P2, min P2, mean P2)``.
    """
    # Separate the stream of (p1, p2) pairs into one sequence per metric;
    # the values are materialised because statistics.mean needs them all.
    vp1, vp2 = zip(*valores)
    yield k, (max(vp1), min(vp1), mean(vp1), max(vp2), min(vp2), mean(vp2))

# Start the job only when this file is executed as a script/module,
# not when it is imported.
if __name__ == "__main__":
  MRMinMaxMean.run()

Overwriting min_max_mean.py


In [17]:
# Remove any output directory left over from a previous run
!rm -rf results
# Run the MRJob job on the sampled dataset, writing its output to `results`
!python -m min_max_mean  --output-dir results --cleanup NONE '/content/files'
# Show the first lines of every output part file
!head results/*
# Print the results

Using configs in /etc/mrjob.conf
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/min_max_mean.root.20221221.171621.595709
job output is in results
==> results/part-00000 <==
["1000", "2020-01-01"]	[341.53, 297.97, 319.75, 275.53, 239.2, 257.365]
["1000", "2020-01-02"]	[12.13, 10.53, 11.186666666666667, 9.0, 7.37, 8.223333333333333]
["1000", "2020-01-03"]	[18.0, 15.0, 16.5, 13.73, 12.4, 13.065000000000001]
["10009", "2020-01-01"]	[75.3, 69.03, 72.16499999999999, 44.77, 44.53, 44.650000000000006]
["10009", "2020-01-02"]	[21.53, 21.0, 21.265, 12.33, 12.3, 12.315000000000001]
["10009", "2020-01-03"]	[14.17, 12.23, 13.2, 8.4, 7.4, 7.9]
["10011", "2020-01-01"]	[112.47, 105.67, 109.07, 56.9, 52.9, 54.9]
["10011", "2020-01-02"]	[35.6, 35.37, 35.485, 16.77, 16.67, 16.72]
["10011", "2020-01-03"]	[24.37, 23.23, 23.8, 12.1, 11.33, 11.715]
["10029", "2020-01-01"]	[5.57, 5.57, 5.57, 5.1, 5.1, 5.1]

==> results/part-00001 <==
["18455", "2020-01-02"]	[154.18,

In [18]:
#@title Install Pyspark
!pip install --quiet pyspark # Install PySpark

In [21]:
#@title Resolution using Spark Core
import pyspark
from operator import *


def _parse_reading(line):
  """Turn one CSV line into a list with zero or one keyed accumulator.

  A well-formed line has seven ';'-separated fields; field 0 is the sensor,
  the first 10 characters of field 4 are the day, and fields 5 and 6 are the
  metrics P1 and P2. The result is
  ``[((sensor, day), (1, p1, p1, p1, p2, p2, p2))]``, i.e. the accumulator
  (count, sum P1, max P1, min P1, sum P2, max P2, min P2) of one reading.
  Blank or malformed lines produce ``[]`` so that flatMap drops them.
  """
  fields = line.strip().split(';')
  if len(fields) != 7:
    return []
  try:
    p1, p2 = float(fields[5]), float(fields[6])
  except ValueError:
    return []
  return [((fields[0], fields[4][0:10]), (1, p1, p1, p1, p2, p2, p2))]


def _merge_stats(a, b):
  """Combine two (count, sum P1, max P1, min P1, sum P2, max P2, min P2) accumulators."""
  return (a[0] + b[0],
          a[1] + b[1], max(a[2], b[2]), min(a[3], b[3]),
          a[4] + b[4], max(a[5], b[5]), min(a[6], b[6]))


def _finalise_stats(acc):
  """Convert an accumulator into (max P1, min P1, mean P1, max P2, min P2, mean P2)."""
  count, sum_p1, max_p1, min_p1, sum_p2, max_p2, min_p2 = acc
  return (max_p1, min_p1, sum_p1 / count, max_p2, min_p2, sum_p2 / count)


sc = pyspark.SparkContext('local[*]') # Create the SparkContext in local mode

try:
  daily_stats = (
      sc.textFile('/content/files')   # Load the dataset, one record per line
      .flatMap(_parse_reading)        # Keep only well-formed readings, keyed by (sensor, day)
      .reduceByKey(_merge_stats)      # Aggregate count, sums, max and min per key
      .mapValues(_finalise_stats)     # Turn the sums into means
      .sortByKey()                    # Ascending (sensor, day); sensors compare as strings
  )

  # Print the first 20 results
  for record in daily_stats.take(20):
    print(record)
except Exception as e:
  print(e)
finally:
  # Always release the SparkContext, whether or not the job succeeded
  sc.stop()

(('1000', '2020-01-01'), (341.53, 297.97, 319.75, 275.53, 239.2, 257.365))
(('1000', '2020-01-02'), (12.13, 10.53, 11.186666666666667, 9.0, 7.37, 8.223333333333334))
(('1000', '2020-01-03'), (18.0, 15.0, 16.5, 13.73, 12.4, 13.065000000000001))
(('10009', '2020-01-01'), (75.3, 69.03, 72.16499999999999, 44.77, 44.53, 44.650000000000006))
(('10009', '2020-01-02'), (21.53, 21.0, 21.265, 12.33, 12.3, 12.315000000000001))
(('10009', '2020-01-03'), (14.17, 12.23, 13.2, 8.4, 7.4, 7.9))
(('10011', '2020-01-01'), (112.47, 105.67, 109.07, 56.9, 52.9, 54.9))
(('10011', '2020-01-02'), (35.6, 35.37, 35.485, 16.77, 16.67, 16.72))
(('10011', '2020-01-03'), (24.37, 23.23, 23.8, 12.1, 11.33, 11.715))
(('10029', '2020-01-01'), (5.57, 5.57, 5.57, 5.1, 5.1, 5.1))
(('10029', '2020-01-02'), (0.98, 0.6, 0.79, 0.98, 0.6, 0.79))
(('10029', '2020-01-03'), (0.7, 0.43, 0.565, 0.7, 0.43, 0.565))
(('1004', '2020-01-01'), (637.47, 478.33, 557.9, 455.2, 331.97, 393.58500000000004))
(('1004', '2020-01-02'), (57.1, 56.4