In [1]:
from datetime import datetime


# Bounds of the period this notebook processes (first and last timestamp of interest).
start_date = datetime(year=2012, month=1, day=1)
end_date = datetime(year=2017, month=8, day=12, hour=23, minute=59)

(start_date, end_date)

(datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2017, 8, 12, 23, 59))

In [2]:
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import math


# Number of month-long windows, counted from start_date, needed to reach end_date.
# relativedelta gives the whole months elapsed; whatever remains after those
# (even less than a day, or nothing at all when end_date sits exactly on a window
# start) still lands in one further window, hence the +1.
# An end_date before start_date yields zero windows.
r = relativedelta(end_date, start_date)
months = (r.years * 12 + r.months + 1) if end_date >= start_date else 0
months

68

In [3]:
# Fixed coordinates; a later cell stamps them onto every row of the demand table.
lat = 40.701
long = -74.009
(lat, long)

(40.701, -74.009)

In [4]:
from pandas import read_csv


# Load the demand table, discard the weather columns that ship with the CSV
# (weather is joined in from NOAA further down), and attach the fixed coordinates.
df = (
    read_csv('data/nyc_energy.csv')
    .drop(columns=['precip', 'temp'])
    .assign(lat=lat, long=long)
)
df.head(5)

Unnamed: 0,timeStamp,demand,lat,long
0,2012-01-01 00:00:00,4937.5,40.701,-74.009
1,2012-01-01 01:00:00,4752.1,40.701,-74.009
2,2012-01-01 02:00:00,4542.6,40.701,-74.009
3,2012-01-01 03:00:00,4357.7,40.701,-74.009
4,2012-01-01 04:00:00,4275.5,40.701,-74.009


In [5]:
# Peek at the last rows of the table (the trailing rows shown have no demand value).
df.tail(5)

Unnamed: 0,timeStamp,demand,lat,long
49200,2017-08-12 02:00:00,,40.701,-74.009
49201,2017-08-12 03:00:00,,40.701,-74.009
49202,2017-08-12 04:00:00,,40.701,-74.009
49203,2017-08-12 05:00:00,,40.701,-74.009
49204,2017-08-12 06:00:00,,40.701,-74.009


In [6]:
import os.path
import pandas as pd
import numpy as np
from azureml.opendatasets.accessories.location_data import LatLongColumn
from azureml.opendatasets.accessories.location_time_customer_data \
    import LocationTimeCustomerData
from azureml.opendatasets import NoaaIsdWeather
from azureml.opendatasets.environ import PandasEnv

In [7]:
from dateutil import parser


# Parse the timestamp strings into datetimes with a single vectorized call
# rather than invoking a Python-level parser once per row.
df['new_datetime'] = pd.to_datetime(df['timeStamp'])
# Snapshot of the column names before enrichment adds more; used later to
# select the original columns back out of the joined result.
raw_columns = list(df.columns)
df.head(5)

Unnamed: 0,timeStamp,demand,lat,long,new_datetime
0,2012-01-01 00:00:00,4937.5,40.701,-74.009,2012-01-01 00:00:00
1,2012-01-01 01:00:00,4752.1,40.701,-74.009,2012-01-01 01:00:00
2,2012-01-01 02:00:00,4542.6,40.701,-74.009,2012-01-01 02:00:00
3,2012-01-01 03:00:00,4357.7,40.701,-74.009,2012-01-01 03:00:00
4,2012-01-01 04:00:00,4275.5,40.701,-74.009,2012-01-01 04:00:00


In [30]:
# Reset the state used by the month-by-month enrichment cells below.
i_date = start_date        # start of the month window currently being processed
report_joined = dict()     # per-month describe() summaries, keyed by window start
# Running concatenation of every enriched month.
# NOTE(review): this name shadows the builtin all(); kept because other cells refer to it.
all = pd.DataFrame([])

In [9]:
# Iterator over the month indices 0 .. months-1; it is stepped by hand with next().
mi = iter(range(months))

In [17]:
# Advance to the next month index manually and display it.
m = next(mi)
m

7

In [28]:
# One hand-run pass of what would be the body of `for m in range(months):`.
# The window ends 1 ms before the same moment in the following month, so
# consecutive windows do not overlap.
j_date = i_date + relativedelta(months=1) - timedelta(milliseconds=1)
j_date

datetime.datetime(2012, 8, 31, 23, 59, 59, 999000)

In [29]:
# Display the bounds of the month window currently being processed.
i_date, j_date

(datetime.datetime(2012, 8, 1, 0, 0),
 datetime.datetime(2012, 8, 31, 23, 59, 59, 999000))

In [31]:
# ---------------------------------------------------------------------------
# Enrich one month of demand data with NOAA ISD weather.
# Reads:  df, raw_columns, i_date, j_date
# Writes: final_df (original columns + aggregated weather columns)
# ---------------------------------------------------------------------------

# Rows whose timestamp falls inside the current window (both ends inclusive).
df1 = df[df['new_datetime'].between(i_date, j_date)].copy()
# Renumber the rows 0..n-1 under an index named 'idx'; a monotonically
# increasing index is needed for the enrichment to succeed.
df1 = df1.assign(idx=range(len(df1.index))).set_index('idx')

energy = LocationTimeCustomerData(
    df1,
    LatLongColumn('lat', 'long'),
    'new_datetime')

weather = NoaaIsdWeather(
    cols=["temperature", "precipTime", "precipDepth", "snowDepth"],
    start_date=i_date,
    end_date=j_date)

print("Running Weather Enricher")
weather_enricher = weather.get_enricher()
new_energy, processed_weather = weather_enricher.enrich_customer_data_no_agg(
    customer_data_object=energy,
    location_match_granularity=5,  # higher for high join success rate, lower for performance.
    time_round_granularity='day')

# --- Customized aggregation of the raw weather observations ---
weather_df = processed_weather.data

# Treat the sentinel readings as missing before aggregating.
# NOTE(review): 9999 / 99 look like the dataset's "missing" markers - confirm
# against the NOAA ISD documentation.
weather_df['precipDepth'] = weather_df['precipDepth'].mask(
    weather_df['precipDepth'] == 9999)
weather_df['precipTime'] = weather_df['precipTime'].mask(
    weather_df['precipTime'] == 99)

# Depth per unit of precipTime. Missing inputs propagate as NaN through the
# division; a zero precipTime is blanked out explicitly so it never yields inf.
weather_df['precipDepth/precipTime'] = (
    weather_df['precipDepth'] / weather_df['precipTime']
).where(weather_df['precipTime'] != 0)

aggregations = {
    "temperature": "mean",
    "snowDepth": "mean",
    "precipDepth/precipTime": "mean",
    "precipDepth": "max",
    "precipTime": "max"}

# Join keys: a rank-group id plus a rounded join-time column on each side.
# The join-time columns are looked up by prefix.
public_rankgroup = processed_weather.id
public_join_time = [
    s for s in list(weather_df.columns)
    if s.startswith('ds_join_time')][0]
customer_rankgroup = weather_enricher.location_selector.customer_rankgroup
customer_join_time = [
    s for s in list(new_energy.data.columns)
    if s.startswith('customer_join_time')][0]

# Collapse the weather observations to one row per (rank group, join time) ...
weather_df_grouped = weather_df.groupby(
    by=[public_rankgroup, public_join_time]).agg(aggregations)

# ... and left-join them onto the demand rows so every demand row is kept.
joined_dataset = new_energy.data.merge(
    weather_df_grouped,
    left_on=[customer_rankgroup, customer_join_time],
    right_on=[public_rankgroup, public_join_time],
    how='left')

# Keep only the original columns plus the weather features.
final_df = joined_dataset[raw_columns + [
    "temperature", "precipTime", "precipDepth", "snowDepth", "precipDepth/precipTime"]]


Running Weather Enricher
[Info] read from C:\Users\Satya\AppData\Local\Temp\tmpk0to43h8\https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2012/month=1/part-00000-tid-1176268934493077726-e9239a20-3ec9-41f2-904f-d82d127e4049-4900-1.c000.snappy.parquet
[Info] read from C:\Users\Satya\AppData\Local\Temp\tmpk0to43h8\https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2012/month=2/part-00000-tid-1176268934493077726-e9239a20-3ec9-41f2-904f-d82d127e4049-4900-2.c000.snappy.parquet
[Info] read from C:\Users\Satya\AppData\Local\Temp\tmpk0to43h8\https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2012/month=3/part-00000-tid-1176268934493077726-e9239a20-3ec9-41f2-904f-d82d127e4049-4900-3.c000.snappy.parquet
[Info] read from C:\Users\Satya\AppData\Local\Temp\tmpk0to43h8\https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2012/month=4/part-00000-tid-1176268934493077726

MemoryError: Unable to allocate 3.45 GiB for an array with shape (6, 77214578) and data type float64

In [71]:
# Record summary statistics of this month's enriched frame, keyed by the window start.
report_joined[i_date] = final_df.describe()

In [72]:
# Build the per-month output path from the window start.
# Colons in the timestamp are swapped for dashes (presumably to keep the file name valid on Windows).
fdate = str(i_date).replace(":", "-")
fn = f"temp/nyc_energy_enriched_{fdate}.csv"
fn

'temp/nyc_energy_enriched_2012-07-01 00-00-00.csv'

In [73]:
# Persist this month's enriched rows to the path held in `fn`.
final_df.to_csv(fn)

In [74]:
# Append this month's rows to the running result.
# NOTE(review): the name `all` shadows the builtin all().
all = pd.concat([all, final_df])

In [75]:
# Rewrite the combined CSV with everything accumulated so far.
all.to_csv('data/nyc_energy_enriched.csv')

  values = values.astype(str)


In [27]:
# Move the window start forward by one calendar month, ready for the next pass.
i_date += relativedelta(months=1)

In [None]:
# i_date += relativedelta(months=-1)

In [1]:
# if os.path.exists('data/nyc_energy_enriched.csv'):
#     raise RuntimeError('nyc_energy_enriched.csv exists already.')
# else:
#     print('[%s] Start enriching...' % datetime.now())
#     all = pd.DataFrame([])
#     report_joined = {}
#     i_date = start_date
#     for m in range(months):
#         j_date = i_date + relativedelta(months=1) - timedelta(milliseconds=1)

#         # It is important to set a monotonically increasing index for successful enrichment.
#         df1 = df[(df['new_datetime'] >= i_date) & (df['new_datetime'] <= j_date)].copy()
#         df1['idx'] = list(range(len(df1.index)))
#         df1 = df1.set_index('idx')

#         energy = LocationTimeCustomerData(
#             df1,
#             LatLongColumn('lat', 'long'),
#             'new_datetime')

#         weather = NoaaIsdWeather(
#             cols=["temperature", "precipTime", "precipDepth", "snowDepth"],
#             start_date=i_date,
#             end_date=j_date)

#         weather_enricher = weather.get_enricher()
#         new_energy, processed_weather = weather_enricher.enrich_customer_data_no_agg(
#             customer_data_object=energy,
#             location_match_granularity=5, # higher for high join success rate, lower for performance.
#             time_round_granularity='day')
        
#         # ---=== Begin of customized aggregation ===---
        
#         processed_weather.data['precipDepth'] = processed_weather.data['precipDepth'].apply(
#             lambda x: np.nan if x == 9999 else x)
#         processed_weather.data['precipTime'] = processed_weather.data['precipTime'].apply(
#             lambda x: np.nan if x == 99 else x)

#         processed_weather.data['precipDepth/precipTime'] = \
#         processed_weather.data[['precipDepth', 'precipTime']].apply(
#             lambda x: np.nan if (
#                 pd.isna(x[0]) or pd.isna(x[1]) or x[1] == 0.0) else (x[0] / x[1]), axis=1)
        
#         aggregations = {
#             "temperature": "mean",
#             "snowDepth": "mean",
#             "precipDepth/precipTime": "mean",
#             "precipDepth": "max",
#             "precipTime": "max"}
        
#         public_rankgroup = processed_weather.id

#         public_join_time = [
#             s for s in list(processed_weather.data.columns)
#             if s.startswith('ds_join_time')][0]

#         customer_rankgroup = weather_enricher.location_selector.customer_rankgroup

#         customer_join_time = [
#             s for s in list(new_energy.data.columns)
#             if s.startswith('customer_join_time')][0]

#         weather_df_grouped = processed_weather.data.groupby(
#             by=[public_rankgroup, public_join_time]).agg(aggregations)
        
#         joined_dataset = new_energy.data.merge(
#             weather_df_grouped,
#             left_on=[customer_rankgroup, customer_join_time],
#             right_on=[public_rankgroup, public_join_time],
#             how='left')

#         final_df = joined_dataset[raw_columns + [
#             "temperature", "precipTime", "precipDepth", "snowDepth", "precipDepth/precipTime"]]

#         report_joined[i_date] = final_df.describe()
        
#         # ---=== End of customized aggregation ===---
        
#         fdate = str(i_date).replace(":",'-')
#         fn = "temp/nyc_energy_enriched_"+ fdate +".csv"
#         final_df.to_csv(fn)

#         all = pd.concat([all, final_df])
#         all.to_csv('data/nyc_energy_enriched.csv')

#         i_date += relativedelta(months=1)

#     print('[%s] End enriching...' % datetime.now())

In [1]:
import pandas as pd

In [2]:
import glob

In [3]:
import os

In [4]:
# Collect the per-month CSVs from temp/. glob returns matches in arbitrary
# order, so sort them: the file names embed the window start timestamp, which
# makes lexicographic order chronological and the result reproducible.
temp_csv_files = sorted(glob.glob(os.path.join('temp', '*.csv')))

In [7]:
# Stitch the monthly files into a single frame. Every file is read with its own
# 0..n-1 row index, so renumber the rows on concatenation instead of keeping
# duplicate index labels.
df = pd.concat(
    [pd.read_csv(path) for path in temp_csv_files],
    ignore_index=True)

In [14]:
# Count rows whose timeStamp repeats an earlier row; 0 means each timestamp occurs once.
df['timeStamp'].duplicated().sum()

0

In [16]:
# Drop the 'Unnamed: 0' column (presumably the index column written by the
# per-month to_csv calls) and save the combined table.
df.drop(columns='Unnamed: 0').to_csv(
    os.path.join('data', 'nyc_energy_data.csv'))