# Clean Sessions

Based on Oskar's `clean_sessions` script for the commercial data.

In [2]:
import pandas as pd
import s3fs
import boto3 
from io import StringIO
from datetime import datetime, date
import numpy as np

*Note: my changes are highlighted in italics.*

*All steps can also be run at once; see the section "Clean, Transform all at once" further below.*

### Clean

* 1) Eliminate sessions that have 0 kWh Energy (*specifically < 0.1kWh*)

* 2) Eliminate sessions where total session time is less than 120 seconds (probably driver never really charged his car)

* 3) ~~Ignore sessions with “Connector Type” -  Type 2 Cable & Type 2 Socket~~ (*in this set connector types are {'J1772', 'NEMA14', 'NEMA6'}*)

* 4) ~~“Session Type” - FLEET should ideally be analyzed/treated separately since they are not your regular/passenger EV drivers~~ (*The only category present in this set is 'Single family residential'*)

* 5) Ignore sessions with Energy(kWh) greater than 100 kWh ~~for “Session Type” OTHER~~ (*OTHER is the only Session Type*)

### Transform Datetimes

* transform start time string to datetime, create new column "start_datetime"
* create new columns: "start_" + ["seconds", "year", "month", "day", "weekday"]
* ~~ignore endtime, because it is given by start time + duration~~
* ~~drop original start and end columns~~


### ~~Optional~~ Clean:

* 6) Ignore sessions not in PDT/PST timezone

* 7) discard drivers from out of country

* 8) discard sessions with Fees not in USD




# Load

In [3]:
# Date stamp embedded in the file name of the united sessions CSV.
united_date = '26072019'

# Build the S3 location of the united CSV and read it into a DataFrame.
raw_csv_uri = 's3://script.chargepoint.data/Residential_Data/clean/Charging_Sessions_United'+united_date+'.csv'
df_raw = pd.read_csv(raw_csv_uri)

# Report the number of rows and the column names of the raw data.
n_raw_rows = len(df_raw)
raw_columns = list(df_raw)
print("Size of CSV: ", n_raw_rows)
print("Columns: ", raw_columns)

Size of CSV:  1074668
Columns:  ['Session ID', 'Station ID', 'Port ID', 'Connector Type', 'POI Category', 'POI Subcategory', 'Station Start Time (Local)', 'Station End Time (Local)', 'Session Time Zone', 'Session Time (secs)', 'Charging Time (secs)', 'Start SoC', 'End SoC', 'Full Charge', 'Energy (kWh)', 'Driver ID', 'Driver Zip', 'Vehicle Make', 'Vehicle Model', 'Battery Capacity', 'EV Type', '$ Fee', 'Fee Currency', 'Zip Code', 'City', 'County', 'State', 'Country', 'Driver City', 'Driver State', 'Driver Country', 'Max Power', 'Session Type', 'Vehicle Model Year']


### Helper Functions

In [16]:
def remove_less_than(data, col, threshold):
    """Keep only the rows whose value in ``col`` is at least ``threshold``.

    Prints how many values in ``col`` are strictly below ``threshold`` and
    returns the rows for which ``data[col] >= threshold`` holds. Rows where
    that comparison is False (this includes missing values) are not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column to compare.
    threshold : number
        Smallest value that is kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that satisfy ``data[col] >= threshold``.
    """
    column = data[col]
    n_below = (column < threshold).sum()
    print("removing {} occurences of {} of less than {}".format(n_below, col, threshold))
    at_or_above = column >= threshold
    return data[at_or_above]

def remove_greater_than(data, col, threshold):
    """Keep only the rows whose value in ``col`` is at most ``threshold``.

    Prints how many values in ``col`` are strictly above ``threshold`` and
    returns the rows for which ``data[col] <= threshold`` holds. Rows where
    that comparison is False (this includes missing values) are not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column to compare.
    threshold : number
        Largest value that is kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that satisfy ``data[col] <= threshold``.
    """
    column = data[col]
    n_above = (column > threshold).sum()
    print("removing {} occurences of {} of greater than {}".format(n_above, col, threshold))
    at_or_below = column <= threshold
    return data[at_or_below]


## Clean 1 - remove sessions with less than 0.1 kWh:

In [9]:
# Step 1: drop sessions that delivered less than 0.1 kWh.
energy_col = 'Energy (kWh)'
min_energy_kwh = 0.1

# Share of the raw sessions that fall below the minimum energy.
n_low_energy = sum(df_raw[energy_col] < min_energy_kwh)
print("Removing ", n_low_energy / len(df_raw), " fraction of the sessions.")

df = remove_less_than(df_raw, col=energy_col, threshold=min_energy_kwh)

Removing  0.040203113891918246  fraction of the sessions.
removing 43205 occurences of Energy (kWh) of less than 0.1


## Clean 2 - remove sessions with Session Time < 2min

In [10]:
# Step 2: drop sessions that lasted less than 120 seconds.
session_time_col = 'Session Time (secs)'
min_session_secs = 120

# Share of the remaining sessions that are shorter than the minimum duration.
n_too_short = sum(df[session_time_col] < min_session_secs)
print("Removing ", n_too_short / len(df), " fraction of the sessions.")

df = remove_less_than(df, col=session_time_col, threshold=min_session_secs)

Removing  0.0011401281480770517  fraction of the sessions.
removing 1176 occurences of Session Time (secs) of less than 120


## ~~Clean 3 - Remove sessions with “Connector Type” - Type 2 Cable & Type 2 Socket~~

## ~~Clean 4 - separating fleet from non fleet~~

## Clean 5 - ignore sessions with Energy(kWh) greater than 100 kWh ~~for “Session Type” OTHER~~


In [15]:
# Step 5: drop sessions that report more than 100 kWh.
energy_col = 'Energy (kWh)'
max_energy_kwh = 100

# Share of the remaining sessions that exceed the maximum energy.
n_high_energy = sum(df[energy_col] > max_energy_kwh)
print("Removing ", n_high_energy / len(df), " fraction of the sessions.")

df = remove_greater_than(df, col=energy_col, threshold=max_energy_kwh)

Removing  1.3588446714362115e-05  fraction of the sessions.
removing 14 occurences of Energy (kWh) of less than 100


# Transform Datetimes (make new variables)

In [23]:
def to_datetime(x):
    """Parse a 'YYYY-MM-DD HH:MM:SS.ffffff' timestamp string into a datetime."""
    timestamp_format = "%Y-%m-%d %H:%M:%S.%f"
    return datetime.strptime(x, timestamp_format)

def to_seconds(x):
    """Whole seconds elapsed since midnight of the same day (sub-second part ignored)."""
    minutes_since_midnight = x.hour * 60 + x.minute
    return minutes_since_midnight * 60 + x.second

def to_year(x):
    """Calendar year of a datetime-like value."""
    year = x.year
    return year

def to_month(x):
    """Calendar month of a datetime-like value, [1, 12]."""
    month = x.month
    return month

def to_day(x):
    """One-based day of the year, [1, 366]."""
    # Distance in days from 1 January of the same year, shifted to start at 1.
    first_of_year = x.replace(month=1, day=1)
    return x.toordinal() - first_of_year.toordinal() + 1

# My addition: zero-based variant of to_day (everything should start counting from 0)
def to_day0(x):
    """Zero-based day of the year, [0, 365]."""
    one_based_day = x.timetuple().tm_yday
    return one_based_day - 1

def to_weekday(x):
    """Day of the week with Monday = 0 and Sunday = 6."""
    # isoweekday() counts Monday as 1 ... Sunday as 7; shift to start at 0.
    return x.isoweekday() - 1

In [18]:
def apply_transforms(data, col, transforms, names, drop_col=False):
    """Derive new columns from ``col`` by applying several element-wise transforms.

    For each ``(name, transform)`` pair the transform is applied to every value
    of ``data[col]`` and the result is stored in a column called ``name``. A
    progress line is printed for every transform.

    The input frame is never modified: the new columns are added to a copy.
    This also avoids pandas' SettingWithCopyWarning when ``data`` is itself a
    filtered slice of another frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the source column.
    col : str
        Name of the column the transforms are applied to.
    transforms : sequence of callables
        Functions taking a single value of ``col`` and returning the derived value.
    names : sequence of str
        Names of the new columns, paired with ``transforms`` by position.
    drop_col : bool, default False
        If True, the source column ``col`` is removed from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns added (and ``col`` dropped if requested).
    """
    # Copy first so the caller's frame (possibly a slice) is left untouched.
    result = data.copy()
    for name, transform in zip(names, transforms):
        print("applying transform {} to {}".format(name, col))
        result[name] = result[col].apply(transform)
    if drop_col:
        print("dropping {}".format(col))
        result = result.drop(columns=[col])
    return result

### handle datetimes: START
Apply multiple transforms to the start time.

*The end times are kept; removing them is unnecessary.*

In [19]:
def transform_start_datetimes(data):
    """Add parsed and derived start-time columns to the session data.

    Rows whose "Station Start Time (Local)" value is not of type ``str`` are
    dropped (the number dropped is printed). The remaining timestamps are
    parsed into a ``start_datetime`` column, from which ``start_seconds``,
    ``start_year``, ``start_month``, ``start_day`` (zero-based, via
    ``to_day0``) and ``start_weekday`` are derived. The original start-time
    column is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Session data containing the "Station Start Time (Local)" column.

    Returns
    -------
    pandas.DataFrame
        The string-timestamp rows of ``data`` with the ``start_*`` columns added.
    """
    start_col = "Station Start Time (Local)"

    # keep only the sessions whose timestamp is a string
    value_types = data[start_col].apply(lambda value: type(value))
    string_rows = data[value_types == type("")]
    print("removed {} sessions where timestamp was not a string".format(len(data) - len(string_rows)))

    # column suffix paired with the transform that produces it;
    # note: "day" uses to_day0 (zero-based), not to_day
    datetime_name = "start_{}".format("datetime")
    derived = [
        ("seconds", to_seconds),
        ("year", to_year),
        ("month", to_month),
        ("day", to_day0),
        ("weekday", to_weekday),
    ]
    derived_names = ["start_{}".format(suffix) for suffix, _ in derived]
    derived_transforms = [transform for _, transform in derived]

    # first create the datetime object from the string ...
    parsed = apply_transforms(string_rows, start_col, [to_datetime], [datetime_name], drop_col=False)
    # ... then apply all other transforms to the datetime object
    return apply_transforms(parsed, datetime_name, derived_transforms, derived_names, drop_col=False)

In [20]:
# Sanity check: show the first raw start timestamp, its parsed datetime,
# and which Python types occur in the start-time column.
col = "Station Start Time (Local)"

# Positional access: row label 0 only exists if the first raw row survived
# the earlier filtering steps, so df[col][0] could raise a KeyError.
first_start = df[col].iloc[0]
print(first_start)
print(datetime.strptime(first_start, "%Y-%m-%d %H:%M:%S.%f"))
print(df[col].apply(lambda x: type(x)).value_counts())

2015-05-26 17:05:01.0
2015-05-26 17:05:01
2015-05-26 17:05:01
<class 'str'>    1030273
Name: Station Start Time (Local), dtype: int64


In [24]:
# Add the derived start_* columns. This rebinds `df`, so re-running the cell
# applies the transform again to the already transformed frame.
df = transform_start_datetimes(df)

removed 0 sessions where timestamp was not a string
applying transform start_datetime to Station Start Time (Local)
applying transform start_seconds to start_datetime
applying transform start_year to start_datetime
applying transform start_month to start_datetime
applying transform start_day to start_datetime
applying transform start_weekday to start_datetime


# Optional Cleaning

## Helper function: Remove all rows that are not legit values for a column

In [25]:
def remove_out_of(data, col, legit_values, keep_nan=True):
    """Remove all rows whose value in ``col`` is not one of ``legit_values``.

    Prints the number of missing entries in ``col``, then the number of rows
    removed together with the set of distinct non-missing values that were
    removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column to check.
    legit_values : iterable
        Accepted values; any iterable (list, tuple, set, ...) works.
    keep_nan : bool, default True
        If True, rows with a missing value in ``col`` are kept as well;
        if False they are removed and "dropped nans" is printed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that passed the filter.
    """
    values = data[col]
    is_missing = values.isnull()
    print("There are {} NAN entries in column {}".format(is_missing.sum(), col))

    # Materialise once so tuples, sets and generators are accepted too.
    allowed = list(legit_values)
    keep = values.isin(allowed)
    if keep_nan:
        keep = keep | is_missing
    data_clean = data[keep]

    # Report the values of the rows that were actually dropped; missing values
    # are excluded here because they are reported separately.
    removed_values = set(values[~keep].dropna().unique())
    print("Removed {} entries for variable {} of values {}".format(
        len(data)-len(data_clean), col, removed_values))
    if not keep_nan:
        print("dropped nans")
    return data_clean

## Optional Clean 6 - ignore sessions not in PDT/PST timezone

Note: this removes only a small fraction of the sessions.


In [29]:
# Keep only sessions recorded in PDT or PST; rows with a missing time zone are kept (keep_nan=True).
df = remove_out_of(df, col="Session Time Zone", legit_values=['PDT', 'PST'], keep_nan=True)


There are 0 NAN entries in column Session Time Zone
Removed 2282 entries for variable Session Time Zone of values {'UTC'}


## Optional Clean 7 - discard drivers from out of country
Drivers from out of the country are discarded because they are only here temporarily.

This affects only a small fraction of the sessions.

Drivers from other states are kept, because they might simply not have updated their info in the database.

In [30]:
# Keep only drivers registered in the United States; rows with a missing driver country are kept (keep_nan=True).
df = remove_out_of(df, col="Driver Country", legit_values=["United States"], keep_nan=True)

There are 536 NAN entries in column Driver Country
Removed 0 entries for variable Driver Country of values set()


## Optional Clean 8 -  discard sessions with Fees not in USD
This affects only a small fraction of the sessions.

In [31]:
# Keep only sessions whose fee currency is USD; rows with a missing currency are kept (keep_nan=True).
df = remove_out_of(df, col="Fee Currency", legit_values=["USD"], keep_nan=True)

There are 1 NAN entries in column Fee Currency
Removed 0 entries for variable Fee Currency of values set()


# Put all together in one function

In [32]:
def clean_session_data(df):
    """Run the complete cleaning and transformation pipeline on raw session data.

    Steps, in order:

    * 1) Eliminate sessions with less than 0.1 kWh Energy
    * 2) Eliminate sessions where total session time is less than 120 seconds
         (probably the driver never really charged the car)
    * 3) Ignore sessions with Energy (kWh) greater than 100 kWh
    * 4) Add the derived ``start_*`` datetime columns
    * 5) Ignore sessions whose time zone is not PDT/PST
    * 6) Discard drivers whose country is not "United States"
    * 7) Discard sessions with fees not in USD

    In steps 5-7 rows with a missing value in the checked column are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw charging-session data.

    Returns
    -------
    pandas.DataFrame
        The cleaned sessions with the additional ``start_*`` columns.
    """
    return (
        df
        .pipe(remove_less_than, col='Energy (kWh)', threshold=0.1)
        .pipe(remove_less_than, col='Session Time (secs)', threshold=120)
        .pipe(remove_greater_than, col='Energy (kWh)', threshold=100)
        .pipe(transform_start_datetimes)
        .pipe(remove_out_of, col="Session Time Zone", legit_values=['PDT', 'PST'], keep_nan=True)
        .pipe(remove_out_of, col="Driver Country", legit_values=["United States"], keep_nan=True)
        .pipe(remove_out_of, col="Fee Currency", legit_values=["USD"], keep_nan=True)
    )

# Clean, Transform all at once: 

In [33]:
# Run the whole pipeline on the raw data and report the share of rows that remain.
df = clean_session_data(df_raw)
remaining_fraction = len(df)/len(df_raw)
print("change in size:",  remaining_fraction)

removing 43205 occurences of Energy (kWh) of less than 0.1
removing 1176 occurences of Session Time (secs) of less than 120
removing 14 occurences of Energy (kWh) of greater than 100
removed 0 sessions where timestamp was not a string
applying transform start_datetime to Station Start Time (Local)
applying transform start_seconds to start_datetime
applying transform start_year to start_datetime
applying transform start_month to start_datetime
applying transform start_day to start_datetime
applying transform start_weekday to start_datetime
There are 0 NAN entries in column Session Time Zone
Removed 2282 entries for variable Session Time Zone of values {'UTC'}
There are 536 NAN entries in column Driver Country
Removed 0 entries for variable Driver Country of values set()
There are 1 NAN entries in column Fee Currency
Removed 0 entries for variable Fee Currency of values set()
change in size: 0.9565661208857061


# Save

In [34]:
# Date stamp appended to the object key of the cleaned CSV when it is saved.
save_date = '26072019'

In [35]:
# Serialise the cleaned frame to CSV in memory (without the index column).
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

# Upload the CSV text to S3; the put() response is the cell's displayed result.
cleaned_key = 'Residential_Data/clean/Charging_Sessions_Cleaned'+str(save_date)+'.csv'
s3_resource = boto3.resource('s3')
cleaned_object = s3_resource.Object('script.chargepoint.data', cleaned_key)
cleaned_object.put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '97E67B4A6B32D19D',
  'HostId': '1mPNYw0U4KQBNg7ogT6u5BC9sbqU8LUIu1IF3U79mtavVylJgbN7VeqJmKEmAKiCesI+uIFxxB4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '1mPNYw0U4KQBNg7ogT6u5BC9sbqU8LUIu1IF3U79mtavVylJgbN7VeqJmKEmAKiCesI+uIFxxB4=',
   'x-amz-request-id': '97E67B4A6B32D19D',
   'date': 'Sat, 27 Jul 2019 00:22:34 GMT',
   'etag': '"414f3a3afba48c1aa2cdd498c62e2d24"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"414f3a3afba48c1aa2cdd498c62e2d24"'}