In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
from pandas.tseries.offsets import MonthEnd
from datetime import datetime
import pytz


In [3]:
# Load the exported consumption readings from CSV into a DataFrame.
consumption_data = pd.read_csv(filepath_or_buffer="customers-data/export.csv")

In [4]:
# Display the raw frame as loaded; the recorded output is a truncated preview.
consumption_data

Unnamed: 0,metering_point_id,value,start_time_local,end_time_local,constant
0,707057500010945590,19.770,2022-01-01T00:00:00.000Z,2022-01-01T01:00:00.000Z,1
1,707057500010945590,21.792,2022-01-01T01:00:00.000Z,2022-01-01T02:00:00.000Z,1
2,707057500010945590,19.620,2022-01-01T02:00:00.000Z,2022-01-01T03:00:00.000Z,1
3,707057500010945590,20.622,2022-01-01T03:00:00.000Z,2022-01-01T04:00:00.000Z,1
4,707057500010945590,20.820,2022-01-01T04:00:00.000Z,2022-01-01T05:00:00.000Z,1
...,...,...,...,...,...
2531635,707057500094415316,0.000,2022-12-31T19:00:00.000Z,2022-12-31T20:00:00.000Z,1
2531636,707057500094415316,0.000,2022-12-31T20:00:00.000Z,2022-12-31T21:00:00.000Z,1
2531637,707057500094415316,0.000,2022-12-31T21:00:00.000Z,2022-12-31T22:00:00.000Z,1
2531638,707057500094415316,0.000,2022-12-31T22:00:00.000Z,2022-12-31T23:00:00.000Z,1


In [5]:
def preprocess_df(df):
    """
    Prepare the raw consumption frame for analysis.

    The "start_time_local" column is parsed to datetimes, converted to the
    Europe/Oslo time zone and renamed to "Time(Local)". The "end_time_local"
    and "constant" columns are removed.

    Note that the frame is modified in place: the object passed in and the
    object returned are the same DataFrame.

    Args:
        df: DataFrame containing at least the columns "start_time_local",
            "end_time_local" and "constant".

    Returns:
        The same DataFrame object, after the changes described above.
    """
    parsed_times = pd.to_datetime(df["start_time_local"])
    df["start_time_local"] = parsed_times.dt.tz_convert('Europe/Oslo')

    df.rename({'start_time_local': 'Time(Local)'}, axis="columns", inplace=True)
    df.drop(["end_time_local", "constant"], axis="columns", inplace=True)
    return df
    
    

In [6]:
# preprocess_df modifies its argument in place and returns it, so updated_df
# and consumption_data refer to the same DataFrame after this call.
updated_df = preprocess_df(df=consumption_data)

In [8]:
# Select each meter's rows in turn. Nothing is done with the selection inside
# the loop, so once it finishes `meter_values` holds the rows of the last
# meter id visited.
for meter_id in updated_df["metering_point_id"].unique():
    meter_values = updated_df[updated_df["metering_point_id"].eq(meter_id)]

Unnamed: 0,metering_point_id,value,Time(Local)
0,707057500010945590,19.770,2022-01-01 01:00:00+01:00
1,707057500010945590,21.792,2022-01-01 02:00:00+01:00
2,707057500010945590,19.620,2022-01-01 03:00:00+01:00
3,707057500010945590,20.622,2022-01-01 04:00:00+01:00
4,707057500010945590,20.820,2022-01-01 05:00:00+01:00
...,...,...,...
8755,707057500010945590,18.414,2022-12-31 20:00:00+01:00
8756,707057500010945590,20.484,2022-12-31 21:00:00+01:00
8757,707057500010945590,18.966,2022-12-31 22:00:00+01:00
8758,707057500010945590,18.984,2022-12-31 23:00:00+01:00


In [21]:
def get_variance(consumption_df):
    """
    Compute the variance of the consumption values for each meter.

    The frame is split once with ``groupby`` instead of being filtered with a
    boolean mask per meter id, so the rows are scanned a single time rather
    than once for every meter.

    Args:
        consumption_df: DataFrame with a "metering_point_id" column and a
            numeric "value" column.

    Returns:
        A tuple ``(meter_value_dict, var_met_dict)``.
        meter_value_dict maps each meter id to the rows of consumption_df
        belonging to that meter (original index and columns are kept).
        var_met_dict maps each meter id to the sample variance (pandas
        default, ddof=1) of that meter's "value" column; a meter with fewer
        than two non-missing values gets NaN.
        Both dictionaries list the meters in order of first appearance in
        consumption_df. Rows whose meter id is missing are not assigned to
        any meter.
    """
    # sort=False keeps the groups in order of first appearance instead of
    # sorting them by meter id.
    per_meter = consumption_df.groupby("metering_point_id", sort=False)

    meter_value_dict = {meter_id: meter_rows for meter_id, meter_rows in per_meter}

    var_met_dict = {
        meter_id: meter_rows["value"].var()
        for meter_id, meter_rows in meter_value_dict.items()
    }

    return meter_value_dict, var_met_dict

In [22]:
# Split the preprocessed frame per meter and compute each meter's variance.
meter_val_dict, meter_variance_dict = get_variance(consumption_df=updated_df)
print(len(meter_variance_dict))
# Despite its name, sorted_var_dict is a list of (meter_id, variance) tuples
# ordered by ascending variance.
sorted_var_dict = sorted(meter_variance_dict.items(), key=lambda item: item[1])

289


In [23]:
import plotly.express as px

def plot_timeframes(val_dict, var_dict, n_plots, time_col="Time(Local)"):
    """
    Plot consumption over time for the first n_plots meters in var_dict.

    Args:
        val_dict: mapping of meter id -> DataFrame holding that meter's rows;
            each frame needs a "value" column and the time_col column.
        var_dict: sequence of (meter_id, variance) entries, already ordered
            the way the meters should be plotted (for example ascending
            variance, so the lowest-variance meters come first).
        n_plots: number of meters to plot, taken from the front of var_dict.
        time_col: name of the timestamp column used for the x axis. Defaults
            to "Time(Local)", the name preprocess_df gives the start-time
            column.
    """
    for entry in list(var_dict)[:n_plots]:
        meter_id, variance = entry[0], entry[1]
        fig = px.line(
            val_dict[meter_id],
            x=time_col,
            y="value",
            title=f"Meter {meter_id} (variance = {variance:.3f})",
        )
        fig.show()

ModuleNotFoundError: No module named 'plotly'

In [24]:
# Plot the first five meters of the variance ranking.
plot_timeframes(val_dict=meter_val_dict, var_dict=sorted_var_dict, n_plots=5)

NameError: name 'plot_timeframes' is not defined

In [25]:
def get_wanted_amount_of_meters(val_dict, var_dict, n_meters):
    """Return the data of the first n_meters meters in a ranking.

    Args:
        val_dict: mapping of meter id -> that meter's data.
        var_dict: ordered sequence of entries whose first element is a meter
            id (for example (meter_id, variance) tuples sorted by variance).
        n_meters: how many meters to take from the front of var_dict.

    Returns:
        list: the val_dict entries for the first n_meters ids of var_dict,
        in the same order as var_dict.
    """
    ranked_ids = [entry[0] for entry in var_dict]

    selected = []
    for meter_id in ranked_ids[:n_meters]:
        selected.append(val_dict[meter_id])
    return selected

In [35]:
# Check which Python type a single meter id has.
type(consumption_data["metering_point_id"].iat[0])

numpy.int64

In [26]:
# Keep the frames of the first 200 meters in the variance ranking.
df_list = get_wanted_amount_of_meters(
    val_dict=meter_val_dict,
    var_dict=sorted_var_dict,
    n_meters=200,
)

In [28]:
# Sum the consumption of all meters for each local hour. The grouping uses
# updated_df, the frame returned by preprocess_df, because that is where the
# "Time(Local)" column is created; reading it from consumption_data only
# works as long as preprocess_df keeps renaming the column in place.
total_value_per_hour = updated_df.groupby(["Time(Local)"]).agg({"value": "sum"})

In [29]:
# Show the summed consumption per hour as a Series.
total_value_per_hour.loc[:, "value"]

Time(Local)
2022-01-01 01:00:00+01:00    14202.043
2022-01-01 02:00:00+01:00    13954.733
2022-01-01 03:00:00+01:00    14314.283
2022-01-01 04:00:00+01:00    14006.993
2022-01-01 05:00:00+01:00    13951.225
                               ...    
2022-12-31 20:00:00+01:00    12836.692
2022-12-31 21:00:00+01:00    12553.593
2022-12-31 22:00:00+01:00    12427.895
2022-12-31 23:00:00+01:00    12585.111
2023-01-01 00:00:00+01:00    12095.609
Name: value, Length: 8759, dtype: float64

In [33]:
# Group the readings by meter id.
# NOTE(review): the TypeError recorded for this cell is not explained by the
# code visible here - presumably leftover kernel state; confirm on a fresh run.
consumption_data.groupby(by="metering_point_id")

TypeError: 'bool' object is not callable

In [30]:
# Print the position and value of every hourly total that is below 1000.
for index, val in enumerate(total_value_per_hour["value"]):
    if not val < 1000:
        continue
    print(index, val)

In [31]:
def get_hour_val_for_meter(df : pd.DataFrame, date : datetime, meter_id : np.int64) -> pd.DataFrame:
    """
    Return the rows of df recorded for one meter at one timestamp.

    Both conditions are combined into a single row mask. Passing the second
    condition as the column indexer of ``.loc`` would make pandas try to
    select columns with a row-length boolean mask, which fails.

    Args:
        df: DataFrame with "Time(Local)" and "metering_point_id" columns.
        date: timestamp to match against the "Time(Local)" column.
        meter_id: meter id to match against the "metering_point_id" column.

    Returns:
        A DataFrame with every column of df and only the rows where both the
        timestamp and the meter id match; empty if there is no such row.
    """
    at_time = df["Time(Local)"] == date
    for_meter = df["metering_point_id"] == meter_id
    return df.loc[at_time & for_meter]
            
        