In [1]:
import pandas as pd
%matplotlib inline
import mpld3
mpld3.enable_notebook()
import numpy as np 
import matplotlib.pyplot as plt

# Clean, Transform, Save INTERVAL Data

## Clean

* 0 - Remove all Intervals with Zero or Negative Energy (End of a session is captured in session data)
* 1 - Remove intervals of duration less than 1 second
* 2 - Replace negative power and energy values with zero (Peak Power, Average Power, Interval Energy)
* 3a - Ignore interval data with super high “Power” (limit to station max: 50kW) -> Assuming power data is in kW. 
* 3b - Also limit maximal energy, to 50kW times Interval-duration

* optional: Sort by session ID and interval ID
* optional: round to 4 decimals

Note: only done for non-fleet.

Note:
In a few cases some of the intervals in a session are missing because the data was not stored/captured.
Ideally, “Session Time” and the sum of the “Interval Duration” values should match; if they do not, please use “Session Time”.

In [2]:
# Test-mode switch.
# When True: only the first `nrows` rows of each CSV are read, the step-by-step
# sanity-check cells (guarded by `if test:`) run on a copy of the raw data, and
# the final cleaning/saving cells (guarded by `if not test:`) are skipped.
test = False

In [3]:
# Location of the raw interval CSV files and the list of files to load
path = "../../data/"
file_names = [
    'Charging_Session_Interval_SLAC_ALAMEDA.csv', 
    'Charging_Session_Interval_SLAC_CONTRACOSTA.csv', 
    'Charging_Session_Interval_SLAC_MARIN.csv', 
    'Charging_Session_Interval_SLAC_NAPA.csv', 
    'Charging_Session_Interval_SLAC_SANFRANCISCO.csv', 
    'Charging_Session_Interval_SLAC_SANMATEO.csv', 
    'Charging_Session_Interval_SLAC_SANTACLARA.csv',
    'Charging_Session_Interval_SLAC_SOLANO.csv', 
    'Charging_Session_Interval_SLAC_SONOMA.csv'
]

# Row limit per file: one million rows in test mode, everything otherwise.
# Kept as an int (not the float 1e6) because it is a row count that is both
# handed to pd.read_csv(nrows=...) and printed in the progress message.
nrows = 1000000 if test else None

# Explicit dtypes for every column, passed to pd.read_csv
col_types = {
    'Interval ID': int, 
    'Session ID': int, 
    'Interval Start Time (Local)': str, 
    'Interval Duration (Secs)': int, 
    'Peak Power': float, 
    'Average Power': float, 
    'Interval Energy':float
}
    
def get_csv(file, dtype=None):
    """Read one CSV file into a DataFrame.

    The number of rows read is taken from the module-level ``nrows``
    (``None`` means the whole file). A progress line is printed first.

    Parameters
    ----------
    file : path of the CSV file to read
    dtype : optional column -> dtype mapping forwarded to ``pd.read_csv``

    Returns
    -------
    pandas.DataFrame
    """
    rows_label = "ALL" if nrows is None else nrows
    print("Reading {} rows of dataframe {}".format(rows_label, file))
    frame = pd.read_csv(file, index_col=False, nrows=nrows, dtype=dtype)
    return frame

In [4]:
# Read every file listed in `file_names` with the declared column dtypes ...
df_raw_list = [get_csv(path + name, dtype=col_types) for name in file_names]
print("concatenate dataframes")
# ... and stack them into a single frame with a fresh 0..n-1 index
df_raw = pd.concat(df_raw_list, ignore_index=True)

if test:
    # Work on an independent (deep) copy so that df_raw stays untouched
    df = df_raw.copy()

Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_ALAMEDA.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_CONTRACOSTA.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_MARIN.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_NAPA.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_SANFRANCISCO.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_SANMATEO.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_SANTACLARA.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_SOLANO.csv
Reading ALL rows of dataframe ../../data/Charging_Session_Interval_SLAC_SONOMA.csv
concatenate dataframes


In [5]:
# Report the row count and the column headers of the combined raw frame

print("Len of CSV: ", len(df_raw))
print("Columns: ", df_raw.columns.tolist())

Len of CSV:  227656337
Columns:  ['Interval ID', 'Session ID', 'Interval Start Time (Local)', 'Interval Duration (Secs)', 'Peak Power', 'Average Power', 'Interval Energy']


In [6]:
# Preview the first 10 rows of the combined raw data
df_raw.head(10)

Unnamed: 0,Interval ID,Session ID,Interval Start Time (Local),Interval Duration (Secs),Peak Power,Average Power,Interval Energy
0,999998925,42092073,2016-04-07 10:30:00.0,852,0.0,0.0,0.0
1,999998923,42092073,2016-04-07 10:15:00.0,900,0.0,0.0,0.0
2,999998921,42092073,2016-04-07 10:00:00.0,900,1.3197,0.1566,0.039161
3,999998919,42092073,2016-04-07 09:45:00.0,900,1.468,0.3375,0.084379
4,999998917,42092073,2016-04-07 09:30:00.0,900,1.6033,0.5034,0.125838
5,999998915,42092073,2016-04-07 09:15:00.0,900,2.2783,1.2577,0.314425
6,999998913,42092073,2016-04-07 09:00:00.0,900,5.8166,3.7041,0.926022
7,999998911,42092073,2016-04-07 08:45:00.0,900,5.8681,5.8265,1.456619
8,999998909,42092073,2016-04-07 08:30:00.0,900,5.8846,5.8664,1.466605
9,999998907,42092073,2016-04-07 08:15:00.0,900,5.8888,5.8598,1.464958


# Helper Functions

In [7]:
def apply_inplace(df, field, fun):
    """Return a new frame in which column `field` has been transformed by `fun`.

    `fun` is applied element-wise to ``df[field]``. The input frame is not
    modified. Note that the transformed column is appended as the LAST column
    of the result, so the column order changes.

    Parameters
    ----------
    df : pandas.DataFrame
    field : label of the column to transform
    fun : callable applied to every value of that column

    Returns
    -------
    pandas.DataFrame
    """
    transformed = df[field].apply(fun)
    remaining = df.drop(columns=field)
    return pd.concat([remaining, transformed], axis=1)

In [8]:
def remove_less_or_equal(data, col, threshold):
    """Drop all rows whose value in `col` is less than or equal to `threshold`.

    Prints how many rows have a value <= threshold, then returns only the rows
    with a value strictly greater than `threshold`. The input is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
    col : label of the column to filter on
    threshold : rows with ``data[col] <= threshold`` are removed

    Returns
    -------
    pandas.DataFrame containing the rows with ``data[col] > threshold``
    """
    # Vectorized count; the builtin sum() would iterate the Series element by element
    n_removed = (data[col] <= threshold).sum()
    # The message states "less than or equal to" to match the actual filter
    print("removing {} occurences of {} of less than or equal to {}".format(n_removed, col, threshold))
    return data[data[col] > threshold]

In [9]:
def remove_less_than(data, col, threshold):
    """Drop all rows whose value in `col` is strictly below `threshold`.

    Prints the number of rows below the threshold and returns the rows with
    ``data[col] >= threshold``. The input frame is not modified.

    NOTE: a function with the same name is defined again further down in this
    notebook; when the cells are run top to bottom the later definition is the
    one in effect.
    """
    below = data[col] < threshold
    print("removing {} occurences of {} of less than {}".format(below.sum(), col, threshold))
    keep = data[col] >= threshold
    return data[keep]

# Clean 0 - Remove Intervals with Zero or negative Energy

In [10]:
if test:
    # Sanity check: share of intervals with exactly zero energy, and share with
    # positive energy below 0.01. Both values are printed as fractions of
    # len(df) (0..1), not multiplied by 100.
    data = df.copy()
    zeros = (data["Interval Energy"] == 0.0).sum()
    print("Percentage Zero-Energy:", zeros / len(df))
    data = data[data["Interval Energy"] > 0.0]
    near_zero = (data["Interval Energy"] < 0.01).sum()
    print("Percentage Near Zero-Energy:", near_zero / len(df))

In [11]:
# def remove_zero_entries(data, col):
#     zeros = sum(data[col]] == 0.0)
#     print("removing Zero-energy entries: in percent of all:", zeros/len(data))
#     return data[data[col] != 0.0]

In [12]:
if test:
    # Drop every interval with zero or negative energy in one step; this
    # replaces the commented-out remove_zero_entries helper.
    # Removing the zero-energy entries saves about half of the space.
    df = remove_less_or_equal(df, col="Interval Energy", threshold=0.0)

# Clean 1 - Remove intervals of length 0 (duration less than 1 second)

In [13]:
def remove_less_than(data, col, threshold):
    """Drop all rows whose value in `col` is strictly below `threshold`.

    Prints the number of rows below the threshold and returns the remaining
    rows. Rows whose value EQUALS the threshold are kept, so the printed count
    is exactly the number of rows removed. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
    col : label of the column to filter on
    threshold : rows with ``data[col] < threshold`` are removed

    Returns
    -------
    pandas.DataFrame containing the rows with ``data[col] >= threshold``
    """
    below = data[col] < threshold
    # Vectorized count; the builtin sum() would iterate the Series element by element
    print("removing {} occurences of {} of less than {}".format(below.sum(), col, threshold))
    # '>=' (not '>'): a value equal to the threshold is not "less than" it
    return data[data[col] >= threshold]

In [14]:
if test:
    # Filter on the interval duration with a threshold of 1 second
    df = remove_less_than(df, col='Interval Duration (Secs)', threshold=1)

# Clean 2 - Replace negative Power with 0

In [15]:
def replace_neg_with_zero(data, col):
    """Return a new frame in which non-positive values of `col` are set to 0.0.

    Prints the number of strictly negative entries in `col`. Values greater
    than zero are kept unchanged; everything else (negative values, zeros and
    NaN, for which ``x > 0`` is False) becomes 0.0. The input frame is not
    modified, and the processed column is appended as the LAST column of the
    result.

    The replacement is done with a vectorized ``Series.where`` instead of a
    per-element Python function, which matters for frames with many millions
    of rows.

    Parameters
    ----------
    data : pandas.DataFrame
    col : label of the (numeric) column to process

    Returns
    -------
    pandas.DataFrame
    """
    values = data[col]
    print("In {}, {} negative entries were replaced with 0".format(col, (values < 0).sum()))
    # keep x where x > 0, otherwise 0.0
    cleaned = values.where(values > 0, 0.0)
    return pd.concat([data.drop(columns=col), cleaned], axis=1)

In [16]:
if test:
    # Replace negative power readings with zero
    df = replace_neg_with_zero(df, col='Peak Power')
    df = replace_neg_with_zero(df, col='Average Power')
    # 'Interval Energy' is not processed here: rows with zero or negative
    # energy were already dropped in Clean 0.

# Clean 3 - Limit Peak / Avg Power and Interval Energy

In [17]:
if test:
    # Sanity check: count the intervals whose average power exceeds the peak
    # power by more than 0.1
    data = df.copy()
    print("found {} entries where data['Average Power'] > 0.1 + data['Peak Power']".format(
        (data['Average Power'] > 0.1 + data['Peak Power']).sum()))

In [18]:
def limit_power(data, col, threshold):
    """Return a new frame in which values of `col` are capped at `threshold`.

    Prints the number of entries above `threshold`. Values greater than the
    threshold are replaced by the threshold; all other values (including NaN)
    are left as they are. The input frame is not modified, and the processed
    column is appended as the LAST column of the result.

    The cap is applied with a vectorized ``Series.clip`` instead of a
    per-element Python function, which matters for frames with many millions
    of rows.

    Parameters
    ----------
    data : pandas.DataFrame
    col : label of the (numeric) column to cap
    threshold : upper limit for the values of `col`

    Returns
    -------
    pandas.DataFrame
    """
    values = data[col]
    print("In {}, {} extreme entries were replaced with {}".format(col, (values > threshold).sum(), threshold))
    capped = values.clip(upper=threshold)
    return pd.concat([data.drop(columns=col), capped], axis=1)

In [19]:
def limit_energy_sophisticated(data, threshold):
    """Cap 'Interval Energy' at ``threshold / 3600 * 'Interval Duration (Secs)'``.

    For every row the upper limit is the energy delivered at a constant power
    of `threshold` over the interval's duration (duration in seconds, hence
    the division by 3600). Prints the number of rows exceeding their limit.

    A NEW frame is returned; the frame passed in is left unchanged and the
    column order is preserved.

    Parameters
    ----------
    data : pandas.DataFrame with the columns 'Interval Energy' and
        'Interval Duration (Secs)'
    threshold : power limit used to derive the per-row energy limit

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    AssertionError
        If any row has a duration of exactly 0.
    """
    col = 'Interval Energy'
    col_s = 'Interval Duration (Secs)'
    # Presumably guards against zero-length intervals, whose energy limit would
    # be 0 and whose energy would therefore be wiped out - remove them first.
    assert not (data[col_s] == 0).any(), "found intervals with a duration of 0 seconds"

    # Per-row energy limit, computed once and reused for counting and capping
    max_energy = (threshold / 3600) * data[col_s]
    to_replace = data[col] > max_energy
    print("In {}, {} extreme entries were limited to {} x interval duration (in hours)".format(
        col, to_replace.sum(), threshold))
    # assign() builds a new frame instead of writing into the caller's object
    return data.assign(**{col: np.minimum(data[col], max_energy)})

In [20]:
if test:
    # Cap both power columns at 50 (station maximum; power assumed to be in kW,
    # see the notes at the top of the notebook)
    df = limit_power(df, col='Peak Power', threshold=50)
    df = limit_power(df, col='Average Power', threshold=50)
    # Energy is capped per row relative to the interval duration; a fixed cap of
    # 50/4.0 via limit_power was the earlier, now disabled, alternative.
    df = limit_energy_sophisticated(df, threshold=50)

# Sort values  by session ID, interval ID

In [21]:
if test:
    # Order rows by session first, then by interval within each session (ascending)
    df = df.sort_values(by=["Session ID", "Interval ID"], axis=0, ascending=True)

In [22]:
# df.head(10)

# Simplify dataset for easier joining

In [23]:
if test:
    # Share of intervals whose duration is not exactly 900 seconds
    data = df.copy(deep=True)
    print("fraction of incomplete intervals:", sum(data["Interval Duration (Secs)"] != 900)/len(data))
# Note: too many intervals differ from 900 s, so this simplification was skipped.

## Round to 4 decimals

In [None]:
if test:
    # Number of decimal places to keep, per column; columns not listed here
    # are passed through DataFrame.round unchanged.
    decimals = {
        'Interval Energy': 4, 
        'Peak Power': 4, 
        'Average Power': 4,
    }
    df = df.round(decimals=decimals)

# All in one function

In [24]:
def clean_intervals(df, max_power=50, decimals=4):
    """
    Run the complete interval-cleaning pipeline and return the cleaned frame.

    Steps, in this order:
    * 0 - Remove all intervals with zero or negative 'Interval Energy'
          (remove_less_or_equal, threshold 0.0)
    * 1 - Filter 'Interval Duration (Secs)' with a threshold of 1 second
          (remove_less_than)
    * 2 - Replace negative 'Peak Power', 'Average Power' and 'Interval Energy'
          values with zero (replace_neg_with_zero)
    * 3a - Cap 'Peak Power' and 'Average Power' at `max_power`
           (station maximum; power data is assumed to be in kW)
    * 3b - Cap 'Interval Energy' relative to `max_power` and the interval
           duration (limit_energy_sophisticated)
    * Sort by 'Session ID' and 'Interval ID', ascending
    * Round 'Interval Energy', 'Peak Power' and 'Average Power' to `decimals`

    Each helper prints a one-line summary of what it changed.

    Parameters
    ----------
    df : pandas.DataFrame with the raw interval columns
    max_power : upper limit used in steps 3a and 3b (default 50)
    decimals : number of decimal places kept for the three float columns
        (default 4)

    Returns
    -------
    pandas.DataFrame with the cleaned, sorted and rounded intervals
    """
    # 0
    cleaned = remove_less_or_equal(df, col="Interval Energy", threshold=0.0)

    # 1
    cleaned = remove_less_than(cleaned, col='Interval Duration (Secs)', threshold=1)

    # 2
    for column in ('Peak Power', 'Average Power', 'Interval Energy'):
        cleaned = replace_neg_with_zero(cleaned, col=column)

    # 3
    for column in ('Peak Power', 'Average Power'):
        cleaned = limit_power(cleaned, col=column, threshold=max_power)
    cleaned = limit_energy_sophisticated(cleaned, threshold=max_power)

    # sort
    cleaned = cleaned.sort_values(by=["Session ID", "Interval ID"], axis=0, ascending=True)

    # round the float columns to the requested number of decimals
    rounding = {column: decimals for column in ('Interval Energy', 'Peak Power', 'Average Power')}
    return cleaned.round(decimals=rounding)

In [25]:
if not test:
    # Full run: clean the complete raw data set with the all-in-one function
    df_clean = clean_intervals(df_raw)
    # Fraction of the raw rows that remain after cleaning
    print("change in size:",  len(df_clean)/len(df_raw))

removing 96780327 occurences of Interval Energy of less than 0.0
removing 550 occurences of Interval Duration (Secs) of less than 1
In Peak Power, 0 negative entries were replaced with 0
In Average Power, 0 negative entries were replaced with 0
In Interval Energy, 0 negative entries were replaced with 0
In Peak Power, 1272 extreme entries were replaced with 50
In Average Power, 6797 extreme entries were replaced with 50
In Interval Energy, 105518 extreme entries were replaced with 50
change in size: 0.5748010080650643


In [26]:
# Preview the last 10 rows of the cleaned data.
# `df_clean` only exists after a full run (test == False); in test mode the
# step-by-step cleaned frame is `df`, so show that one instead of raising a
# NameError when the notebook is run top to bottom.
(df if test else df_clean).tail(10)

Unnamed: 0,Interval ID,Session ID,Interval Start Time (Local),Interval Duration (Secs),Interval Energy,Peak Power,Average Power
99121841,5395236561,322361711,2018-11-01 14:00:00.0,900,0.912823,3.6639,3.6513
99121840,5395236571,322361711,2018-11-01 14:15:00.0,900,0.914189,3.671,3.6568
99121839,5395236581,322361711,2018-11-01 14:30:00.0,900,0.825795,3.6731,3.3032
99121838,5395236591,322361711,2018-11-01 14:45:00.0,900,0.239451,2.3307,0.9578
45538186,5395337001,322361931,2018-11-01 16:43:13.0,409,0.174384,7.2902,5.8671
45538185,5395337021,322361931,2018-11-01 16:45:00.0,302,0.614609,7.2657,7.3265
18939627,5395364001,322364701,2018-11-01 16:47:55.0,297,0.240006,3.235,2.9092
99111946,5396681311,322369151,2018-11-01 15:20:49.0,87,0.072493,6.3909,2.9997
18939546,5395431721,322369461,2018-11-01 16:55:28.0,50,0.005531,2.9236,0.3982
18939418,5395620651,322370041,2018-11-01 16:56:29.0,131,0.006174,3.056,0.1697


# save resulting data

In [27]:
if not test:
    import os  # only needed here, for creating the output directory

    # DataFrame.to_csv does not create missing directories; make sure the
    # target folder exists so the write cannot fail at the very end of the run.
    os.makedirs(path + 'clean/', exist_ok=True)
    print("saving")
    df_clean.to_csv(path + 'clean/' +'intervals_clean.csv',sep=',', index=False)