<br>
<br>
 <center> <font size = "5"> Plegma Dataset </font></center>
 <br>
 <center> <font size = "4"> Dataset stats </font></center>
 <br>
 <center> <font size = "3"> </font></center>
<br>
<br>

---

#### Functions

In [1]:
def calculate_cells(df):
    """
    Count the cells and the missing values of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame for which to count cells and missing values.

    Returns:
    total_cells (int): Total number of cells (rows x columns) in the DataFrame.
    total_missing (int): Total number of missing values (NaN) in the DataFrame.
    """
    # DataFrame.size is rows * columns. It replaces np.product, which was
    # deprecated in NumPy 1.25 and removed in NumPy 2.0.
    total_cells = int(df.size)
    # The first sum() counts NaN per column, the second adds the columns up.
    # int() also normalises the float 0.0 produced for a column-less frame.
    total_missing = int(df.isna().sum().sum())
    return total_cells, total_missing


def calculate_uptime(total_cells, total_missing):
    """
    Calculate the 'uptime' of a DataFrame: the share of non-missing values among all values.

    Parameters:
    total_cells (int): The total number of cells in the DataFrame.
    total_missing (int): The total number of missing values (NaN) in the DataFrame.

    Returns:
    float: The uptime as a fraction between 0 and 1 (not multiplied by 100).
           Returns 0.0 when total_cells is 0, since no value was recorded.
    """
    # Guard against empty input instead of dividing by zero.
    if total_cells == 0:
        return 0.0
    return (total_cells - total_missing) / total_cells



def process_file(electric_data_filepath, env_data_filepath):
    """
    Load one electric CSV and one environmental CSV and summarise them.

    Both files are read with their first row as header. Cell and NaN counts
    are obtained from calculate_cells and added together for the two files.

    Parameters:
    electric_data_filepath (pathlib.Path or str): Path to the electric data file.
    env_data_filepath (pathlib.Path or str): Path to the environmental data file.

    Returns:
    tuple: (total, nan, timestamps, issues) where
        - total: number of cells counted in both files.
        - nan: number of NaN cells counted in both files.
        - timestamps: number of rows in the electric data.
        - issues: sum of the values in the last column of the electric data.

    Note:
    - The last column of the electric data is assumed to hold issue counts.
    - The first and last electric columns and the first environmental column
      are left out of the cell / NaN counts.
    """
    electric_frame = pd.read_csv(electric_data_filepath, header=0)
    env_frame = pd.read_csv(env_data_filepath, header=0)

    # Last electric column: one entry per row, summed to get the issue count.
    issue_column = electric_frame.iloc[:, -1]
    timestamps = len(issue_column)
    issues = np.sum(issue_column.values)

    # Only the measurement columns take part in the cell statistics.
    electric_measurements = electric_frame.iloc[:, 1:-1]
    env_measurements = env_frame.iloc[:, 1:]
    electric_cells, electric_missing = calculate_cells(electric_measurements)
    env_cells, env_missing = calculate_cells(env_measurements)

    return (
        electric_cells + env_cells,
        electric_missing + env_missing,
        timestamps,
        issues,
    )


def calculate_overall_percentages(total_nan, total_values, total_issues, total_timestamps):
    """
    Turn the aggregated counts into overall NaN and issue percentages.

    Parameters:
    total_nan (int): Aggregated number of NaN values across all houses.
    total_values (int): Aggregated number of values across all houses.
    total_issues (int): Aggregated number of issues across all houses.
    total_timestamps (int): Aggregated number of timestamps across all houses.

    Returns:
    tuple: (overall_nan_percentage, overall_issues_percentage). Each entry is
           0 when its denominator is not greater than zero.
    """
    def _percentage(part, whole):
        # A non-positive denominator yields 0 instead of a division error.
        if whole > 0:
            return (part / whole) * 100
        return 0

    nan_share = _percentage(total_nan, total_values)
    issue_share = _percentage(total_issues, total_timestamps)
    return nan_share, issue_share


def get_wattage(df, appliance_name):
    """
    Look up the wattage used as the upper limit for an appliance.

    The aggregate channel 'P_agg' always maps to a fixed value of 15000.
    Any other name is looked up through df.get, falling back to infinity
    when the name is not present.

    Parameters:
    df (mapping): Object exposing .get(key, default) that maps appliance
                  names to wattages. The caller in this notebook passes a
                  plain dict built from the appliances metadata.
    appliance_name (str): Name of the appliance to look up.

    Returns:
    float: 15000 for 'P_agg', the stored wattage for a known appliance,
           or infinity for an unknown one.
    """
    if appliance_name != 'P_agg':
        unknown_limit = float('inf')
        return df.get(appliance_name, unknown_limit)
    return 15000


def count_abnormal_values(data, threshold):
    """
    Count the entries of a Series that are negative or exceed a threshold.

    Parameters:
    data (pandas.Series): The data series to be evaluated.
    threshold (float): The upper limit; values strictly above it are counted.

    Returns:
    int: Number of entries that are below zero or above the threshold.
    """
    is_negative = data < 0
    is_over_limit = data > threshold
    out_of_bounds = is_negative | is_over_limit
    return out_of_bounds.sum()


def count_abnormal_env(dataframe, temp_range=(-10, 50), humidity_range=(0, 100)):
    """
    Count the temperature and humidity readings that fall outside their allowed ranges.

    A reading is abnormal when it is strictly below the lower bound or strictly
    above the upper bound of its range. Missing readings (NaN) are never counted.

    Parameters:
    dataframe (pandas.DataFrame): Data containing the columns 'temperature (C)'
                                  and 'humidity (%)'. Both are assumed to be
                                  numeric, as produced by pandas.read_csv.
    temp_range (tuple): (low, high) bounds for temperature. Defaults to (-10, 50).
    humidity_range (tuple): (low, high) bounds for humidity. Defaults to (0, 100).

    Returns:
    int: Number of temperature readings out of range plus the number of
         humidity readings out of range.
    """
    def _count_outside(column, bounds):
        # Vectorised check. Comparisons against NaN evaluate to False, so
        # missing readings drop out without an explicit null test.
        low, high = bounds
        return ((column < low) | (column > high)).sum()

    abnormal_temperature = _count_outside(dataframe['temperature (C)'], temp_range)
    abnormal_humidity = _count_outside(dataframe['humidity (%)'], humidity_range)
    return abnormal_temperature + abnormal_humidity


#### Statistics calculation for recorded values, NaN and Issues percentages

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import os


# Running totals accumulated over every house and every meter file
total_values = 0
total_nan_values = 0
total_timestamps = 0
total_issues = 0

# Walk the cleaned dataset: one sub-folder per house
dataset_path = Path('clean_dataset')
for house_folder in dataset_path.iterdir():
    # Anything that is not a folder is not a house
    if not house_folder.is_dir():
        continue

    electric_dir = house_folder / 'Electric_data'
    env_dir = house_folder / 'Environmental_data'

    for meter_file in electric_dir.iterdir():
        # The metadata file describes appliances; it holds no measurements
        if meter_file.name == 'appliances_metadata.csv':
            continue

        # The environmental file is matched to the electric one by file name
        electric_data_filepath = electric_dir / meter_file.name
        env_data_filepath = env_dir / meter_file.name
        if not (electric_data_filepath.exists() and env_data_filepath.exists()):
            continue

        total, nan, timestamps, issues = process_file(electric_data_filepath, env_data_filepath)

        # Fold this file's figures into the running totals
        total_values += total
        total_nan_values += nan
        total_timestamps += timestamps
        total_issues += issues

# Convert the totals into dataset-wide percentages
overall_nan_percentage, overall_issues_percentage = calculate_overall_percentages(
    total_nan_values,
    total_values,
    total_issues,
    total_timestamps,
)

print("Total number of recorded values:", total_values)
print("Overall NaN Percentage (%):", overall_nan_percentage)
print("Overall Issues Percentage(%):", overall_issues_percentage)

Total number of recorded values: 218410245
Overall NaN Percentage (%): 6.864927512901238
Overall Issues Percentage(%): 0.8286280263667919


#### Statistics calculation for abnormal values

In [3]:
# Electric measurements
# ---------------------

# Set your dataset paths
dataset_path = 'raw_dataset'
clean_dataset_path = 'clean_dataset'

# Number of electric readings that are negative or above the appliance's wattage
count = 0
for house in os.listdir(dataset_path):
    raw_data_path = os.path.join(dataset_path, house, 'Raw_collected_data')
    # Skip stray entries (e.g. hidden files) that are not house folders
    if not os.path.isdir(raw_data_path):
        continue

    metadata_path = os.path.join(clean_dataset_path, house, 'Electric_data', 'appliances_metadata.csv')
    appliances_metadata = pd.read_csv(metadata_path, header=0)

    # Rebuild the lookup for every house so that a wattage from a previously
    # processed house is never used as the threshold for this one.
    wattage_cache = dict(zip(appliances_metadata['appliance'], appliances_metadata['wattage [W]']))

    for month in os.listdir(raw_data_path):
        month_path = os.path.join(raw_data_path, month)
        if not os.path.isdir(month_path):
            continue

        for file_name in os.listdir(month_path):
            # Environmental readings are checked separately
            if file_name == 'environmental_sensor.csv':
                continue

            file_path = os.path.join(month_path, file_name)
            data = pd.read_csv(file_path, header=0, parse_dates=['timestamp'], index_col='timestamp')

            # Columns 'V' and 'A' are excluded from the wattage check
            for column in data.columns:
                if column in ('V', 'A'):
                    continue
                threshold = get_wattage(wattage_cache, column)
                count += count_abnormal_values(data[column], threshold)

print(f'Total count of abnormal values: {count}')




# Environmental measurements

count_env = 0

# Use the same folder name for listing and for reading. The original listed
# 'Raw_Dataset' but read from 'raw_dataset', which only works on
# case-insensitive filesystems.
dataset_path = 'raw_dataset'

for house in os.listdir(dataset_path):
    # Keep the per-house path in its own variable so that dataset_path,
    # which drives this loop, is not overwritten.
    house_raw_path = os.path.join(dataset_path, house, 'Raw_collected_data')
    if not os.path.isdir(house_raw_path):
        continue

    for month in os.listdir(house_raw_path):
        env_path = os.path.join(house_raw_path, month, 'environmental_sensor.csv')

        # Some months may have no environmental file; those are skipped
        if os.path.exists(env_path):
            data = pd.read_csv(env_path, header=0, parse_dates=['timestamp'], index_col='timestamp')
            count_env += count_abnormal_env(data)

print(f'Total count of abnormal environmental values: {count_env}')

# count and total_values come from the cells executed earlier in the notebook
print(f'Total percentage of abnormal values: {(count_env+count)/total_values * 100}')

Total count of abnormal values: 3621841
Total count of abnormal environmental values: 0
Total percentage of abnormal values: 1.6582743176722319


In [4]:
# Environmental measurements

count_env = 0

# Use one consistently-cased root folder. The original listed 'Raw_Dataset'
# but read from 'raw_dataset', which fails on case-sensitive filesystems.
dataset_path = 'raw_dataset'

for house in os.listdir(dataset_path):
    # A separate variable holds the per-house path; dataset_path stays the root
    house_raw_path = os.path.join(dataset_path, house, 'Raw_collected_data')
    if not os.path.isdir(house_raw_path):
        continue

    for month in os.listdir(house_raw_path):
        env_path = os.path.join(house_raw_path, month, 'environmental_sensor.csv')

        # Only read the file when it is present for this month
        if os.path.exists(env_path):
            data = pd.read_csv(env_path, header=0, parse_dates=['timestamp'], index_col='timestamp')
            count_env += count_abnormal_env(data)

print(f'Total count of abnormal environmental values: {count_env}')

# count and total_values come from the cells executed earlier in the notebook
print(f'Total percentage of abnormal values: {(count_env+count)/total_values * 100}')

Total count of abnormal environmental values: 0
Total percentage of abnormal values: 1.6582743176722319
