# Clean Intervals

Based off Oskar's script called 'clean_intervals' for the commercial data.

In [2]:
import pandas as pd
import s3fs
import boto3
from io import StringIO
from datetime import datetime, date
import numpy as np


### Clean

* 0 - Remove all Intervals with Zero or Negative Energy (End of a session is captured in session data)
* 1 - Remove intervals of duration less than 1 second
* 2 - Replace negative power values (Peak Power, Average Power) with zero; the all-in-one function also applies this to Interval Energy
* 3a - Cap interval “Power” values that are implausibly high (limit to station max: 50kW) -> Assuming power data is in kW. 
* 3b - Also limit maximal energy, to 50kW times Interval-duration

* optional: Sort by session ID and interval ID
* optional: round to 4 decimals

Note: only done for non-fleet.

Note:
Some of the intervals in a session might be missing in a few cases because the data was not stored/captured
Ideally, “Session Time” and the sum of “Interval Duration” should match; if they do not, please use “Session Time”.

In [3]:
# Read the raw interval CSV for the given date stamp straight from S3.
date_united = '26072019'
raw_csv_path = 's3://script.chargepoint.data/Residential_Data/clean/Charging_Intervals_United' + date_united + '.csv'
df_raw = pd.read_csv(raw_csv_path)

# Quick sanity check of what was loaded: row count and column names.
print("Len of CSV: ", len(df_raw))
print("Columns: ", df_raw.columns.tolist())

Len of CSV:  40285809
Columns:  ['Interval ID', 'Session ID', 'Interval Start Time (Local)', 'Interval Duration (Secs)', 'Peak Power', 'Average Power', 'Interval Energy']


# Helper Functions

In [4]:
def apply_inplace(df, field, fun):
    """Return a new frame in which ``fun`` was applied to each value of ``field``.

    Despite the name, ``df`` itself is not modified. The transformed column is
    re-attached as the *last* column, so the column order of the result differs
    from the input unless ``field`` was already last.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    field : str
        Name of the column to transform.
    fun : callable
        Called once per element of ``df[field]``.

    Returns
    -------
    pandas.DataFrame
        All other columns of ``df`` followed by the transformed ``field``.
    """
    transformed = df[field].apply(fun)
    untouched = df.drop(columns=field)
    return pd.concat([untouched, transformed], axis=1)

In [5]:
def remove_less_or_equal(data, col, threshold):
    """Drop every row whose ``col`` value is less than or equal to ``threshold``.

    Prints how many rows satisfy ``data[col] <= threshold`` before filtering.
    Rows whose value is NaN are dropped as well (``NaN > threshold`` is False)
    but are not included in the printed count.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Column to compare against ``threshold``.
    threshold : number
        Rows are kept only when ``data[col] > threshold``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` with ``data[col] > threshold``.
    """
    values = data[col]
    # Vectorised count; the builtin sum() would iterate the Series in Python.
    n_removed = int((values <= threshold).sum())
    print("removing {} occurences of {} of less than or equal to {}".format(n_removed, col, threshold))
    return data[values > threshold]

In [6]:
def remove_less_than(data, col, threshold):
    """Drop every row whose ``col`` value is strictly less than ``threshold``.

    Prints how many rows satisfy ``data[col] < threshold`` before filtering.
    Rows whose value is NaN are dropped as well (``NaN >= threshold`` is False)
    but are not included in the printed count.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Column to compare against ``threshold``.
    threshold : number
        Rows are kept only when ``data[col] >= threshold``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` with ``data[col] >= threshold``.
    """
    values = data[col]
    # Vectorised count; the builtin sum() would iterate the Series in Python.
    n_removed = int((values < threshold).sum())
    print("removing {} occurences of {} of less than {}".format(n_removed, col, threshold))
    return data[values >= threshold]

In [9]:
def replace_neg_with_zero(data, col):
    """Return a new frame in which non-positive values of ``col`` are set to 0.0.

    Prints the number of strictly negative entries in ``col``. Every value for
    which ``value > 0`` is False is replaced, so NaN entries also become 0.0.
    The cleaned column is re-attached as the last column of the result (the
    same layout ``apply_inplace`` produces).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the numeric column to clean.

    Returns
    -------
    pandas.DataFrame
        All other columns of ``data`` followed by the cleaned ``col``.
    """
    values = data[col]
    print("In {}, {} negative entries were replaced with 0".format(col, int((values < 0).sum())))
    # Vectorised form of `x if x > 0 else 0.0`: keep where the condition holds,
    # otherwise substitute 0.0 (this includes NaN, as the comparison is False).
    cleaned = values.where(values > 0, 0.0)
    return pd.concat([data.drop(col, axis=1), cleaned], axis=1)

# Clean 0 - Remove Intervals with Zero or negative Energy

In [7]:
# Keep only the intervals that actually delivered energy (Interval Energy > 0).
df = remove_less_or_equal(df_raw, "Interval Energy", 0.0)

removing 29369560 occurences of Interval Energy of less than 0.0


# Clean 1 - Remove intervals shorter than 1 second

In [8]:
# Keep only the intervals that lasted at least one second.
df = remove_less_than(df, 'Interval Duration (Secs)', 1)

removing 93427 occurences of Interval Duration (Secs) of less than 1


# Clean 2 - Replace negative Power with 0

In [10]:
# Same clean-up for both power columns, in the original order.
for power_col in ('Peak Power', 'Average Power'):
    df = replace_neg_with_zero(df, col=power_col)

In Peak Power, 0 negative entries were replaced with 0
In Average Power, 0 negative entries were replaced with 0


# Clean 3 - Limit Peak / Avg Power and Interval Energy

In [13]:
def limit_power(data, col, threshold):
    """Return a new frame in which values of ``col`` above ``threshold`` are capped.

    Prints the number of entries strictly greater than ``threshold``. NaN
    entries are left as NaN. The capped column is re-attached as the last
    column of the result (the same layout ``apply_inplace`` produces).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the numeric column to cap.
    threshold : number
        Upper bound applied to ``data[col]``.

    Returns
    -------
    pandas.DataFrame
        All other columns of ``data`` followed by the capped ``col``.
    """
    values = data[col]
    print("In {}, {} extreme entries were replaced with {}".format(col, int((values > threshold).sum()), threshold))
    # Vectorised equivalent of min(x, threshold) applied per element.
    capped = values.clip(upper=threshold)
    return pd.concat([data.drop(col, axis=1), capped], axis=1)

In [14]:
def limit_energy_sophisticated(data, threshold):
    """Cap 'Interval Energy' at ``threshold`` times the interval duration in hours.

    For each row the maximum allowed energy is
    ``(threshold / 3600) * data['Interval Duration (Secs)']``; larger energy
    values are replaced by that per-row maximum. The number of affected rows
    is printed. Note that the printed message reports ``threshold`` itself,
    not the per-row cap that is actually written.

    Unlike the original implementation, the input frame is left untouched and
    a modified copy is returned, matching the other cleaning helpers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Interval Energy' and 'Interval Duration (Secs)'.
        Presumably energy is in kWh when ``threshold`` is in kW -- TODO confirm.
    threshold : number
        Power limit; divided by 3600 to turn seconds into hours.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with 'Interval Energy' capped; column order unchanged.

    Raises
    ------
    AssertionError
        If any interval has a duration of exactly 0 seconds.
    """
    col = 'Interval Energy'
    col_s = 'Interval Duration (Secs)'
    assert not (data[col_s] == 0).any(), "zero-duration intervals must be removed first"

    # Compute the per-row cap once and reuse it for both the count and the fix.
    max_energy = (threshold / 3600) * data[col_s]
    to_replace = data[col] > max_energy
    print("In {}, {} extreme entries were replaced with {}".format(col, int(to_replace.sum()), threshold))

    # Work on a copy so the caller's frame is never mutated.
    result = data.copy()
    result[col] = np.minimum(data[col], max_energy)
    return result

In [15]:
# Cap both power columns at 50, then cap the energy at 50 x interval duration.
for power_col in ('Peak Power', 'Average Power'):
    df = limit_power(df, col=power_col, threshold=50)
df = limit_energy_sophisticated(df, threshold=50)

In Peak Power, 10 extreme entries were replaced with 50
In Average Power, 634 extreme entries were replaced with 50
In Interval Energy, 1982 extreme entries were replaced with 50


# Sort values  by session ID, interval ID

In [16]:
# Ascending sort on rows is the default, so only the sort keys are spelled out.
df = df.sort_values(["Session ID", "Interval ID"])

~~Simplify dataset for easier joining~~

In [23]:
# if test:
#     data = df.copy(deep=True)
#     print("fraction of incomplete intervals:", sum(data["Interval Duration (Secs)"] != 900)/len(data))
# # Note: too many. skipped.

## Round to 4 decimals

In [17]:
# Every measured column is rounded to the same precision.
decimals = dict.fromkeys(['Interval Energy', 'Peak Power', 'Average Power'], 4)
df = df.round(decimals)

# All in one function

In [18]:
def clean_intervals(df, max_power_kw=50):
    """Run the full interval-cleaning pipeline and return the cleaned frame.

    Steps, in order:

    * 0 - Remove all intervals with zero or negative 'Interval Energy'
      (end of a session is captured in session data)
    * 1 - Remove intervals of duration less than 1 second
    * 2 - Replace non-positive 'Peak Power', 'Average Power' and
      'Interval Energy' values with zero
    * 3a - Cap 'Peak Power' and 'Average Power' at ``max_power_kw``
      (station max; assumes power data is in kW)
    * 3b - Cap 'Interval Energy' at ``max_power_kw`` times the interval duration
    * Sort by 'Session ID' and 'Interval ID'
    * Round energy and power columns to 4 decimals

    Each helper prints a one-line summary of what it changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw interval data containing the columns named above.
    max_power_kw : number, optional
        Power limit used for steps 3a and 3b. Defaults to 50, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The cleaned, sorted and rounded intervals.
    """
    df = remove_less_or_equal(df, col="Interval Energy", threshold=0.0)
    df = remove_less_than(df, col='Interval Duration (Secs)', threshold=1)

    # Order matters for the resulting column layout, so keep the original sequence.
    for col in ('Peak Power', 'Average Power', 'Interval Energy'):
        df = replace_neg_with_zero(df, col=col)
    for col in ('Peak Power', 'Average Power'):
        df = limit_power(df, col=col, threshold=max_power_kw)
    df = limit_energy_sophisticated(df, threshold=max_power_kw)

    df = df.sort_values(by=["Session ID", "Interval ID"], axis=0, ascending=True)
    df = df.round(decimals={'Interval Energy': 4, 'Peak Power': 4, 'Average Power': 4})
    return df

In [19]:
# Can do it all at once: 
df = clean_intervals(df_raw)
print("change in size:",  len(df)/len(df_raw))

removing 29369560 occurences of Interval Energy of less than 0.0
removing 93427 occurences of Interval Duration (Secs) of less than 1
In Peak Power, 0 negative entries were replaced with 0
In Average Power, 0 negative entries were replaced with 0
In Interval Energy, 0 negative entries were replaced with 0
In Peak Power, 10 extreme entries were replaced with 50
In Average Power, 634 extreme entries were replaced with 50
In Interval Energy, 1982 extreme entries were replaced with 50
change in size: 0.26865097831348006


# Save

In [20]:
# Date stamp appended to the output file name (looks like DDMMYYYY -- TODO confirm).
save_date = '26072019'

In [21]:
# Serialise the cleaned frame to CSV in memory (without the index column).
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

# Upload the CSV text as a single S3 object; the put() response is the cell output.
output_key = 'Residential_Data/clean/Charging_Intervals_Cleaned' + str(save_date) + '.csv'
s3_resource = boto3.resource('s3')
s3_resource.Object('script.chargepoint.data', output_key).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '8C27C7E97057E56A',
  'HostId': '1maNr4TugUCoWUIfH/YS1qAawmiqP/rxS24UR2ojKzh2rE6zDBSf9BSVn/zLQMUJNzjLqyiV9B4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '1maNr4TugUCoWUIfH/YS1qAawmiqP/rxS24UR2ojKzh2rE6zDBSf9BSVn/zLQMUJNzjLqyiV9B4=',
   'x-amz-request-id': '8C27C7E97057E56A',
   'date': 'Sat, 27 Jul 2019 00:47:40 GMT',
   'etag': '"6111ea41c46dccf91b4d3950c6e239b8"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"6111ea41c46dccf91b4d3950c6e239b8"'}