<br>
<br>
 <center> <font size = "5"> Plegma Dataset </font></center>
 <br>
 <center> <font size = "4"> Dataset stats </font></center>
 <br>
 <center> <font size = "3"> </font></center>
<br>
<br>

---

#### Functions

In [None]:
def calculate_cells(df):
    """
    Count the cells and the missing values of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame for which to count cells and missing values.

    Returns:
    total_cells (int): Total number of cells (rows * columns) in the DataFrame.
    total_missing (int): Total number of missing values (NaN) in the DataFrame.
    """
    # `np.product` was deprecated in NumPy 1.25 and removed in NumPy 2.0;
    # `DataFrame.size` gives the same rows * columns count without NumPy.
    total_cells = int(df.size)
    # Per-column NaN counts collapsed into one scalar, returned as a plain int.
    total_missing = int(df.isna().sum().sum())
    return total_cells, total_missing


def calculate_uptime(total_cells, total_missing):
    """
    Compute the 'uptime' of a DataFrame: the share of its cells that hold a value.

    Parameters:
    total_cells (int): The total number of cells in the DataFrame.
    total_missing (int): The total number of missing values (NaN) in the DataFrame.

    Returns:
    float: Non-missing cells divided by total cells. The value is a fraction
        (1.0 means nothing is missing); it is not scaled by 100.
    """
    present_cells = total_cells - total_missing
    return present_cells / total_cells



def process_file(electric_data_filepath, env_data_filepath):
    """
    Load one pair of electric / environmental CSV files and summarise them.

    Both files are read with their first row as the header. The last column of
    the electric file is treated as the issue column: its length gives the number
    of timestamps and its sum gives the issue count. Cell and NaN counts are taken
    over the electric data without its first and last columns, plus the
    environmental data without its first column.

    Parameters:
    electric_data_filepath (pathlib.Path or str): Path to the electric data file.
    env_data_filepath (pathlib.Path or str): Path to the environmental data file.

    Returns:
    tuple: A tuple containing the following elements:
        - total (int): Number of counted cells across both files.
        - nan (int): Number of NaN (missing) values among those cells.
        - timestamps (int): Number of rows in the electric data.
        - issues (int): Sum of the last column of the electric data.

    Note:
    - The last column of the electric data is assumed to contain issue counts.
    """
    electric_df = pd.read_csv(electric_data_filepath, header=0)
    env_df = pd.read_csv(env_data_filepath, header=0)

    # The trailing electric column supplies both the row count and the issue sum.
    issue_column = electric_df.iloc[:, -1]
    timestamps = len(issue_column)
    issues = np.sum(issue_column.values)

    # Drop the first column of each file (and the issue column) before counting.
    electric_cells, electric_missing = calculate_cells(electric_df.iloc[:, 1:-1])
    env_cells, env_missing = calculate_cells(env_df.iloc[:, 1:])

    return (
        electric_cells + env_cells,
        electric_missing + env_missing,
        timestamps,
        issues,
    )


def calculate_overall_percentages(total_nan, total_values, total_issues, total_timestamps):
    """
    Turn the aggregated counts into overall NaN and issue percentages.

    Parameters:
    total_nan (int): The aggregated total number of NaN values across all houses.
    total_values (int): The aggregated total number of values across all houses.
    total_issues (int): The aggregated total number of issues across all houses.
    total_timestamps (int): The aggregated total number of timestamps across all houses.

    Returns:
    tuple: A tuple containing:
        - overall_nan_percentage (float): NaN values as a percentage of all values,
          or 0 when `total_values` is not positive.
        - overall_issues_percentage (float): Issues as a percentage of all timestamps,
          or 0 when `total_timestamps` is not positive.
    """
    # Start from the fallback and only divide when the denominator is positive.
    overall_nan_percentage = 0
    if total_values > 0:
        overall_nan_percentage = (total_nan / total_values) * 100

    overall_issues_percentage = 0
    if total_timestamps > 0:
        overall_issues_percentage = (total_issues / total_timestamps) * 100

    return overall_nan_percentage, overall_issues_percentage

#### Statistics calculation

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np


# Running totals aggregated over every house and meter file
total_nan_values = 0
total_values = 0
total_issues = 0
total_timestamps = 0

# Iterate over each house folder
dataset_path = Path('clean_dataset')
# `iterdir()` yields entries in arbitrary order; sort so the log is reproducible.
for house_folder in sorted(dataset_path.iterdir()):
    electric_dir = house_folder / 'Electric_data'
    # Skip stray files and folders that are not houses (e.g. `.ipynb_checkpoints`):
    # iterating a missing `Electric_data` folder would raise FileNotFoundError.
    if not electric_dir.is_dir():
        continue

    print(f'Processing {house_folder.name}')

    # Process each meter file
    for meter_file in sorted(electric_dir.iterdir()):
        if meter_file.name == 'appliances_metadata.csv':
            continue

        # The environmental file is expected under the same name as the meter file.
        electric_data_filepath = meter_file
        env_data_filepath = house_folder / 'Environmental_data' / meter_file.name

        # Only process pairs where both sides are regular files.
        if not (electric_data_filepath.is_file() and env_data_filepath.is_file()):
            continue

        total, nan, timestamps, issues = process_file(electric_data_filepath, env_data_filepath)

        # Aggregate the results
        total_values += total
        total_nan_values += nan
        total_timestamps += timestamps
        total_issues += issues

# Calculate overall percentages
overall_nan_percentage, overall_issues_percentage = calculate_overall_percentages(
    total_nan_values, total_values, total_issues, total_timestamps
)

print("Total number of recorded values:", total_values)
print("Overall NaN Percentage (%):", overall_nan_percentage)
print("Overall Issues Percentage(%):", overall_issues_percentage)