# Notebook cell prompt "In [10]:" (export residue), commented out so the file parses as Python.
"""
Data Processor Module

This module defines a DataProcessor class for loading, processing, and visualizing a CSV file containing the data.
Methods:
        load_data(report_title="Report of actions taken to prepare the dataset for analysis:"):
            Load data from the specified CSV file and perform initial processing steps.
            
        process():
            Perform data processing steps, such as calculating transients and normalized transients.
        
        show_features(features):
            Visualize specified features over time.

"""

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import logging

class DataProcessor:
    """Load a CSV file into a DataFrame, derive transient features, and plot them.

    The loaded frame is kept in ``self.df`` (``None`` until ``load_data``
    succeeds in reading the file). All three public methods report problems
    through ``logging.error`` instead of raising.
    """

    # Number of ``peak_<i>`` columns (i = 1..NUM_TRANSIENTS) that ``process``
    # turns into ``Transient_<i>`` / ``normalized_transient_<i>`` columns.
    NUM_TRANSIENTS = 10

    def __init__(self, csv_file_path):
        """Remember the CSV path; no file access happens here.

        Args:
            csv_file_path: Path (or anything ``pd.read_csv`` accepts) of the
                CSV file read later by ``load_data``.
        """
        self.df = None
        self.csv_file_path = csv_file_path

    def load_data(self, report_title="Report of actions taken to prepare the dataset for analysis:"):
        """Read the CSV file into ``self.df`` and log a short preparation report.

        Steps performed, each logged at INFO level:
          * parse ``timestamp_tz`` as datetime when the column exists
            (unparseable values become ``NaT``),
          * drop a fixed list of bookkeeping columns when present,
          * list every column that contains missing values, with counts and
            percentages.

        Args:
            report_title: First line written to the log before loading.

        Any exception is caught and logged; the method never raises.
        """
        logging.info(report_title)
        logging.info("Loading data...")

        try:
            self.df = pd.read_csv(self.csv_file_path)

            # Change the data type of 'timestamp_tz' to datetime
            if 'timestamp_tz' in self.df.columns:
                self.df['timestamp_tz'] = pd.to_datetime(self.df['timestamp_tz'], errors='coerce')

            # List of columns to drop
            columns_to_drop = ['timestamp', 'coil_reversed', 'device', 'channel', 'hz', 'firmware', 'hour', 'Unnamed: 0.1', 'Unnamed: 0', 'event_id']

            # Only drop the columns that actually exist in this file
            dropped_columns = [col for col in columns_to_drop if col in self.df.columns]
            self.df.drop(columns=dropped_columns, inplace=True)

            # Report which columns were dropped
            if dropped_columns:
                logging.info(f'Dropped columns: {", ".join(dropped_columns)}')
                # The explanation only makes sense when 'timestamp' itself was removed.
                if 'timestamp' in dropped_columns:
                    logging.info('The timestamp column was dropped because there are two columns')

            # Check for missing values
            missing_values = self.df.isna().sum()
            if missing_values.any():
                data_shape = self.df.shape
                logging.info(f'File shape: {data_shape[0]} rows and {data_shape[1]} columns')
                logging.info('Missing values report:')
                percentage_missing = (missing_values / len(self.df)) * 100
                for column, total_missing, percentage in zip(missing_values.index, missing_values, percentage_missing):
                    if total_missing > 0:
                        logging.info(f'{column} - {total_missing} - {percentage:.2f}%')

        except Exception as e:
            logging.error(f"Error loading data: {str(e)}")

    def process(self):
        """Add transient-derived feature columns to ``self.df`` in place.

        For i in 1..NUM_TRANSIENTS the following columns are created:
          * ``Transient_<i>`` = ``peak_<i>`` - ``current``
          * ``normalized_transient_<i>`` = ``Transient_<i>`` divided by the
            row-wise Euclidean norm of all ``Transient_*`` columns
        plus the row-wise summaries ``mean_transient``, ``std_transient`` and
        ``mean_top7_transient`` (mean of the 7 largest transients in the row).

        Rows whose transients are all zero have a zero norm, so their
        normalized values are not finite.

        Returns ``None``. Logs an error and returns early when no data is
        loaded; any exception during processing is caught and logged.
        """
        if self.df is None:
            logging.error("Data has not been loaded. Cannot process.")
            return

        try:
            indices = range(1, self.NUM_TRANSIENTS + 1)
            transient_cols = [f'Transient_{i}' for i in indices]

            # Calculate Transients
            for i, transient_name in zip(indices, transient_cols):
                self.df[transient_name] = self.df[f'peak_{i}'] - self.df['current']

            transients = self.df[transient_cols]

            # Normalize Transients: the norm covers ALL transients (the previous
            # code skipped the last one) and is computed once, not per column.
            norm = np.sqrt((transients ** 2).sum(axis=1))
            for i, transient_name in zip(indices, transient_cols):
                self.df[f'normalized_transient_{i}'] = self.df[transient_name] / norm

            # Calculate Mean of Transients
            self.df['mean_transient'] = transients.mean(axis=1)

            # Calculate Standard Deviation of Transients
            self.df['std_transient'] = transients.std(axis=1)

            # Calculate the mean of the highest 7 transients
            self.df['mean_top7_transient'] = transients.apply(lambda row: row.nlargest(7).mean(), axis=1)

        except Exception as e:
            logging.error(f"Error processing data: {str(e)}")

    def show_features(self, features):
        """Plot each requested column against ``timestamp_tz``, one figure per feature.

        Works on a copy of ``self.df``: rows whose ``timestamp_tz`` cannot be
        parsed are dropped and the rest are sorted by time. X ticks are placed
        at 01:00, 07:00, 13:00 and 19:00 of every day covered by the data, and
        a green dashed vertical line marks 01:00 of each day.

        Args:
            features: Iterable of column names in ``self.df`` to plot.

        Logs an error and returns early when no data is loaded; any exception
        while plotting is caught and logged.
        """
        if self.df is None:
            logging.error("Data has not been loaded. Cannot show features.")
            return

        try:
            sensor_df = self.df.copy()

            # Convert the 'timestamp_tz' column to datetime format
            sensor_df['timestamp_tz'] = pd.to_datetime(sensor_df['timestamp_tz'], errors='coerce')

            # Drop rows with missing 'timestamp_tz' values
            sensor_df = sensor_df.dropna(subset=['timestamp_tz'])

            sensor_df = sensor_df.sort_values(by='timestamp_tz')

            # One entry per calendar day spanned by the data (midnight of each day)
            day_starts = sensor_df['timestamp_tz'].dt.floor('D')
            date_ranges = pd.date_range(day_starts.min(), day_starts.max(), freq='D')

            # Create an array of dates at 1:00 AM, 7:00 AM, 1:00 PM, and 7:00 PM each day
            x_ticks = np.concatenate([
                date_ranges + pd.DateOffset(hours=1),
                date_ranges + pd.DateOffset(hours=7),
                date_ranges + pd.DateOffset(hours=13),
                date_ranges + pd.DateOffset(hours=19),
            ])

            # Plot the selected features against time
            for feature in features:
                fig, ax = plt.subplots(figsize=(12, 6))
                ax.set_xticks(x_ticks)
                ax.xaxis.set_major_formatter(mdates.DateFormatter("%I %p"))
                plt.plot(sensor_df['timestamp_tz'], sensor_df[feature])
                plt.title(f'{feature} vs. Time')
                plt.xlabel('Time')
                plt.ylabel(feature)
                plt.xticks(rotation=45)

                # Add vertical lines at 1 AM each day (colored green)
                for date in date_ranges:
                    plt.axvline(date + pd.DateOffset(hours=1), color='green', linestyle='--', linewidth=1)

                plt.grid(True)
                plt.show()

        except Exception as e:
            logging.error(f"Error showing features: {str(e)}")

# Configure logging: set the root logger to INFO so the progress reports that
# DataProcessor writes via logging.info are emitted.
logging.basicConfig(level=logging.INFO)