<a href="https://colab.research.google.com/github/reuven-itzhakov/Cloud-Computing/blob/main/HW3_MapReduce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install firebase

In [None]:
from firebase import firebase

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from collections import Counter

import time
import random
from datetime import datetime

import json

In [None]:
# Firebase Realtime Database client used by map_sensor to fetch readings.
# The second argument (None) is presumably the authentication object, i.e.
# requests are sent unauthenticated — TODO confirm against the database rules.
FBconn = firebase.FirebaseApplication('https://cloud-comp-sloth-default-rtdb.europe-west1.firebasedatabase.app/', None)

In [None]:
# Names of the sensors to analyse. Each name is used as the last path segment
# of /sensors/<name> when readings are fetched, and the summary cell walks this
# list two entries at a time, so the order decides which sensors share a table.
sensors = [
    "Humidity(Outdoor)",
    "Humidity(Indoor)",
    "Pressure(Outdoor)",
    "Pressure(Indoor)",
    "Temperature(Outdoor)",
    "Temperature(Indoor)",
    "Dlight(Outdoor)",
    "Distance(Indoor)"
]

In [None]:
# Retrieve sensor data from Firebase and format it
def map_sensor(sensor_name):
    """Fetch one sensor's readings and return them as (value, timestamp) pairs.

    Args:
        sensor_name: Name of the sensor; used as the last path segment of
            ``/sensors/<sensor_name>`` in the Firebase database.

    Returns:
        A list of ``(rounded_value, timestamp)`` tuples. Each value is rounded
        to 2 decimals and converted to a built-in float; the timestamp is the
        key the reading is stored under. An empty list is returned when the
        database holds nothing at that path.
    """
    data = FBconn.get(f'/sensors/{sensor_name}', None)

    # A path with no data is expected to come back as None (JSON null); without
    # this guard the .items() call below would raise AttributeError.
    if not data:
        return []

    return [
        (float(np.round(value, 2)), timestamp)
        for timestamp, value in data.items()
    ]

In [None]:
def reduce_sensor(grouped):
    """
    Collect the timestamps at which each distinct sensor value was recorded.
    Args:
        grouped: An iterable of (value, timestamp) pairs.
    Returns:
        A dictionary mapping every distinct value to the list of its
        timestamps, in the order the pairs were supplied.
    """
    timestamps_by_value = {}
    for reading, stamp in grouped:
        # setdefault creates the bucket the first time a value is seen.
        timestamps_by_value.setdefault(reading, []).append(stamp)
    return timestamps_by_value


In [None]:
# Initialize an empty dictionary to store the map-reduced sensor data.
# Keys are sensor names; each value is the {reading: [timestamps]} dictionary
# returned by reduce_sensor for that sensor.
mapreduced_sensors = {}

# Apply map and reduce functions to each sensor and store the results.
# map_sensor issues one Firebase request per sensor, so this loop performs
# len(sensors) fetches.
for sensor in sensors:
    mapped_sensor = map_sensor(sensor)
    reduced_sensor = reduce_sensor(mapped_sensor)
    mapreduced_sensors[sensor] = reduced_sensor

In [None]:
# Alias (not a copy) of the map-reduced results: both names refer to the same
# dictionary, so mutating one is visible through the other.
sensor_data = mapreduced_sensors # Preprocessed sensor data

# Function to calculate sensor summary
def calculate_sensor_summary(sensor_data, sensors):
    """Build a table of basic statistics for the requested sensors.

    Args:
        sensor_data: Mapping of sensor name to a ``{value: [timestamps]}``
            dictionary; each value counts once per timestamp in its list.
        sensors: Iterable of sensor names to summarise, in output row order.

    Returns:
        A pandas DataFrame with one row per sensor and the columns
        ``sensor``, ``min_value``, ``max_value``, ``avg_value`` (rounded to
        2 decimals) and ``1_most_common_value``. A sensor without any
        readings gets None in every statistic column instead of raising.

    Raises:
        KeyError: If a name in ``sensors`` is missing from ``sensor_data``.
    """
    summary_rows = []
    for sensor in sensors:
        # Number of readings per distinct value. Values whose timestamp list is
        # empty contribute no readings, so they are left out of the statistics.
        counts = {
            val: len(timestamps)
            for val, timestamps in sensor_data[sensor].items()
            if len(timestamps) > 0
        }
        total = sum(counts.values())

        if total == 0:
            # No readings at all: report empty statistics rather than failing
            # on min()/max() of an empty sequence or dividing by zero.
            min_val = max_val = avg_val = most_common_val = None
        else:
            min_val = min(counts)
            max_val = max(counts)
            # Weighted mean over distinct values; avoids materialising one list
            # entry per individual reading.
            weighted_sum = sum(val * n for val, n in counts.items())
            avg_val = round(weighted_sum / total, 2)
            # On ties max() keeps the first value in insertion order.
            most_common_val = max(counts, key=counts.get)

        summary_rows.append({
            "sensor": sensor,
            "min_value": min_val,
            "max_value": max_val,
            "avg_value": avg_val,
            "1_most_common_value": most_common_val
        })

    return pd.DataFrame(summary_rows)

# Generate one summary table per consecutive pair of sensors (all sensors, not
# only humidity). Slicing instead of indexing i + 1 means a trailing unpaired
# sensor gets its own single-row table rather than raising IndexError.
# NOTE: the variable keeps its original name so later cells that read it still
# work, even though it holds each pair's table in turn.
for i in range(0, len(sensors), 2):
    humidity_summary_df = calculate_sensor_summary(sensor_data, sensors[i:i + 2])
    display(humidity_summary_df)

In [None]:
# Expand the {sensor: {value: [timestamps]}} structure into one flat row per
# individual reading, which is the shape pandas and seaborn work with.
sensor_data = mapreduced_sensors
records = []
for sensor, values in sensor_data.items():
    for value, timestamps in values.items():
        for ts in timestamps:
            # Timestamps are converted to int for the DataFrame column.
            records.append(dict(Sensor=sensor, Value=value, Timestamp=int(ts)))

# One row per reading: columns Sensor, Value, Timestamp.
df = pd.DataFrame(records)

# Histogram - stacked distribution of values, coloured by sensor
plt.figure(figsize=(10, 6))
sns.histplot(
    data=df,
    x="Value",
    hue="Sensor",
    bins=10,
    multiple="stack",
    kde=False,
)
plt.title("Histogram: Distribution of Sensor Values")
plt.tight_layout()
plt.show()