In [3]:
import clickhouse_connect
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os
import plotly.graph_objects as go

In [4]:
# Pull environment variables in from the .env file that sits one level above the working directory
env_path = Path('..', '.env')
load_dotenv(env_path)

True

## Query data from ClickHouse

In [5]:
# Build the ClickHouse client from connection settings read out of the environment
HOST_CH, USERNAME_CH, PASSWORD_CH = (
    os.getenv(key) for key in ("HOST_CH", "USERNAME_CH", "PASSWORD_CH")
)
client = clickhouse_connect.get_client(
    host=HOST_CH,
    port=8443,
    username=USERNAME_CH,
    password=PASSWORD_CH,
)

# The raw settings are not used again, so remove them from the kernel namespace
del HOST_CH, USERNAME_CH, PASSWORD_CH

In [28]:
# Cache switches: read the saved CSV instead of querying / write the CSV after a query
USE_LOCAL_DATA = True
SAVE_LOCAL_DATA = True

# File info
file_dir = 'data'
file_name = 'heat_pump_errors.csv'
file_path = os.path.join(file_dir, file_name)

# Error types
# Every flag appears once with the `hp1_` prefix and once with the `hp2_` prefix,
# so the full column list is generated from the two suffix lists below.
_protection_suffixes = [
    'mainLineCurrentProtection',
    'compressorPhaseCurrentProtection',
    'ipmModuleProtection',
    'compressorOilReturnProtection',
    'highPressureSwitchProtection',
    'firstStartPreHeatProtection',
    'gasDischargeTempSensorProtection',
    'evaporatorCoilTempSensorProtection',
    'acVoltageProtection',
    'lowPressureSwitchProtection',
    'highPressureSwitchLockProtection',
    'lowPressureSwitchLockProtection',
    'evaporatingPressureLockProtection',
    'condenserPressureLockProtection',
]
_failure_suffixes = [
    'ambientTempSensorFailure',
    'evaporatorCoilTempSensorFailure',
    'gasDischargeTempSensorFailure',
    'gasReturnTempSensorFailure',
    'evaporatorPressureSensorFailure',
    'condenserPressureSensorFailure',
    'fanFailure',
    'eviPressureSensorFailure',
    'eviInletTempSensorFailure',
    'eviOutletTempSensorFailure',
    'masterSlaveCommunicationFailure',
    'controlPcbModuleCommunicationFailure',
    'compressorPhaseCurrentFailure',
    'compressorDriverFailure',
    'moduleVDCVoltageFailure',
    'acCurrentFailure',
    'eepromFailure',
    'fanDrivePcbFailure',
    'inletWaterTempSensorFailure',
    'outletWaterTempSensorFailure',
    'innerCoilTempSensorFailure',
    'dcWaterPumpFailure',
]

# Resulting order: hp1 protections, hp2 protections, hp1 failures, hp2 failures
error_types = [
    f'{prefix}_{suffix}'
    for suffixes in (_protection_suffixes, _failure_suffixes)
    for prefix in ('hp1', 'hp2')
    for suffix in suffixes
]

# Load one row per (day, client) with a column per error flag: from the CSV cache when
# USE_LOCAL_DATA is set, otherwise from ClickHouse (optionally refreshing the cache).
if USE_LOCAL_DATA:
    # Parse time_day as datetimes; a CSV round-trip otherwise leaves the column as plain strings
    df_errors = pd.read_csv(file_path, parse_dates=['time_day'])
else:
    # Create query
    start_date = '2023-10-01'
    end_date = '2024-02-06'

    # max() yields 1 when the flag was raised in at least one row of the (day, client) group.
    # ClickHouse's any() returns the value of an arbitrary row instead, so days with an
    # error could be dropped by the HAVING filter depending on which row was picked.
    # Assumes the flag columns are numeric 0/1, as the `== 1` comparison below implies.
    # NOTE: a CSV cached from the earlier any()-based query should be regenerated.
    select_columns = ', '.join(f"max({error_type}) AS {error_type}" for error_type in error_types)

    # Keep only client-days on which at least one error flag was set
    where_clause = "\nOR ".join(f"{error_type} == 1" for error_type in error_types)

    query = f"""
    SELECT
        toStartOfDay(time_ts) AS time_day,
        clientid,
        {select_columns}
    FROM
        cic_stats
    WHERE
        time_day >= '{start_date}' AND time_day <= '{end_date}'
    GROUP BY
        time_day,
        clientid
    HAVING
        ({where_clause})
    """

    # Execute query
    df_errors = client.query_df(query)

    if SAVE_LOCAL_DATA:
        # Create the subdirectory if it doesn't exist (no error if it already does)
        os.makedirs(file_dir, exist_ok=True)

        # Save the DataFrame to a CSV file in the "data" subdirectory
        df_errors.to_csv(file_path, index=False)


    

In [35]:
# Errors seen fewer than this many times are left out of the chart
min_count = 1

# Sum every error column over all rows, drop the rare ones, and order smallest-first
# so the most frequent error ends up at the top of the horizontal bar chart
error_counts = (
    df_errors[error_types]
    .sum(axis=0)
    .loc[lambda totals: totals >= min_count]
    .sort_values(ascending=True)
)

# Horizontal bars, one per error type, labelled with their count
count_bars = go.Bar(
    x=error_counts.values,
    y=error_counts.index,
    orientation='h',
    text=error_counts.values,
    textposition='auto',
)
fig = go.Figure(data=[count_bars])

# Titles and size, then a log-scaled x-axis
fig.update_layout(
    title='Error Occurrence Count',
    xaxis_title='CiC-days',
    yaxis_title='Error Type',
    height=600,
).update_xaxes(type='log')

# Render the figure
fig.show()


In [36]:
# Reshape to long form (one row per day, client and error type), then keep only the
# rows where the error flag is present and positive
df_error_types = (
    df_errors
    .melt(
        id_vars=['time_day', 'clientid'],
        value_vars=error_types,
        var_name='error_type',
        value_name='error_occurred',
    )
    .dropna(subset=['error_occurred'])
    .loc[lambda long_df: long_df['error_occurred'] > 0]
)

# For each error type, count how many distinct clients (CiCs) ever showed it
df_error_counts = (
    df_error_types
    .groupby('error_type')['clientid']
    .nunique()
    .sort_values(ascending=True)
    .reset_index(name='occurences')
)

# Horizontal bar chart of the per-CiC counts on a log-scaled x-axis
cic_bars = go.Bar(
    x=df_error_counts['occurences'],
    y=df_error_counts['error_type'],
    orientation='h',
    text=df_error_counts['occurences'],
    textposition='auto',
)
fig = go.Figure(data=[cic_bars])
fig.update_layout(
    title='Error Occurrence Count (per CiC)',
    xaxis_title='# of unique CiCs',
    yaxis_title='Error Type',
    height=600,
).update_xaxes(type='log')
fig.show()
