# Step 2 - Weather Data Process - Get Weather Station Data for Mapped Weather Stations

<b>Summary:</b><br>
Uses Synoptic Weather Data API to grab weather station data for the mapped weather stations to cameras.<br>
Synoptic is a partner of SDG&E that helps store and serve their weather station data, which is accessible via API.

- Read in processed camera weather station mappings
- Get all station data for input timerange
- Save weather station data by network
- Convert wind speed and direction to uv components
- Save processed data with only desired columns

<b>Output:</b><br>
.<br>
├── data<br>
&emsp;&emsp;&emsp;├── processed<br>
&emsp;&emsp;&emsp;&nbsp;│&emsp;&emsp;&nbsp;├── weather_HPWREN.csv<br>
&emsp;&emsp;&emsp;&nbsp;│&emsp;&emsp;&nbsp;├── weather_SC-EDISON.csv<br>
&emsp;&emsp;&emsp;&nbsp;│&emsp;&emsp;&nbsp;├── weather_SDGE.csv<br>
&emsp;&emsp;&emsp;└── raw<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;├── weather_HPWREN.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;├── weather_SC-EDISON.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── weather_SDGE.csv

<b>Instructions:</b><br>
- Create a copy of `config.json.example` and remove the `.example`
- Update the `synoptic_api_token` value

In [166]:
import asyncio
import json
import os
import time
import urllib
from ast import literal_eval
from datetime import datetime, timedelta
from io import StringIO

import aiofiles
import aiohttp
import fastparquet
import geopandas
import matplotlib.pyplot as plt
import nest_asyncio
import numpy as np
import pandas as pd
import pytz
import requests
import rtree
from aiohttp import ClientSession
from geopandas import GeoDataFrame
from shapely.geometry import Point, Polygon
from tqdm.notebook import tqdm, trange

nest_asyncio.apply()

## Parameters

In [2]:
# Base url and API token
# The token is read from the local config.json (see Instructions above)
with open("config.json") as config_file:
    cfg = json.load(config_file)

API_BASE_URL = "https://api.synopticdata.com/v2/"
API_TOKEN = cfg["synoptic_api_token"]

# Values should be SHORTNAME values from mesonet - should align to notebook 1
# NOTE(review): only the SHORTNAME values are read in this notebook; the integer
# keys are presumably the mesonet network ids - confirm
WEATHER_NETWORKS = {139: "SDGE", 81: "HPWREN", 231: "SC-EDISON"}

# figlib start = 2016-06-04 (assume PST)
# figlib end = 2021-07-11 (assume PST)
# NOTE(review): the request helpers send these times to the API as UTC - confirm
# that this window still fully covers the figlib range once the PST offset is applied
# YYYYmmddHHMM (format needed for api)
START_DATETIME = "201606030000"
END_DATETIME = "202107120000"
# END_DATETIME = "202112311159"

## 1. Get data for matched weather stations

### Read weather station mappings

In [3]:
# Camera -> weather station mappings (read from the processed data folder)
station_mappings_path = "../../data/processed/camera_station_mappings.csv"
station_mappings_df = pd.read_csv(station_mappings_path)
# Quick sanity check on the (rows, columns) that were loaded
print(station_mappings_df.shape)

(309, 7)


In [4]:
station_mappings_df

Unnamed: 0,properties.description.id,stid,shortname,distance_m,distance_mi,is_in_direction,rn
0,hpwren0_unknown direction,BFDSD,SDGE,2362.963219,1.468277,,1
1,hpwren0_unknown direction,CVXSD,SDGE,8161.908209,5.071575,,2
2,hpwren0_unknown direction,DJZSD,SDGE,10216.081526,6.347979,,3
3,hpwren1_north,HP016,HPWREN,0.000000,0.000000,True,1
4,hpwren1_north,MGDSD,SDGE,4637.139293,2.881385,True,2
...,...,...,...,...,...,...,...
304,hpwren30_south,TLGSD,SDGE,4521.722898,2.809668,True,2
305,hpwren30_south,CRISD,SDGE,8167.055431,5.074773,True,3
306,hpwren30_west,HP024,HPWREN,0.000000,0.000000,True,1
307,hpwren30_west,TLGSD,SDGE,4521.722898,2.809668,True,2


### Helper function to get weather station data given stid and start/end time

In [5]:
def get_historical_station_readings(stid: str, start: str, end: str) -> str:
    """
    Return csv string of station readings given a station id and timeframe.
    Start and end format = YYYYmmddHHMM. Synchronous.

    Returns an empty string when the API answers with a non-2xx status, so the
    result is always a ``str`` (matching the asynchronous ``get_station_data``).

    Note:
    Requesting multiple stids at once would require removing the csv output
    option, in which case the response would be in json format.
    """
    # All times are requested in UTC, but may be returned in either UTC or Local time
    endpoint = f"{API_BASE_URL}stations/timeseries"
    params = {
        "token": API_TOKEN,
        "stid": stid,
        "start": start,
        "end": end,
        "obtimezone": "UTC",
        "output": "csv",
    }
    r = requests.get(endpoint, params=params)
    # range() excludes its upper bound, so 300 is needed to accept every 2xx status
    if r.status_code not in range(200, 300):
        # TODO: add error handling
        print(f"Error! - {stid}")
        # Empty string (not a dict) keeps the declared return type intact
        return ""
    return r.text

In [6]:
def get_network_station_data(df: pd.DataFrame, shortname: str = None) -> pd.DataFrame:
    """
    Return dataframe of station data given weather network shortname.
    If no shortname given, go through the full dataframe.
    Use if getting data in single file or in files by network.

    Readings are requested per unique, non-null "stid" in ``df`` for the
    START_DATETIME to END_DATETIME window and stacked into one dataframe.
    Stations whose download failed (empty response) are skipped.
    Returns an empty dataframe when no station produced any data.
    """
    if shortname is not None:
        df = df[df["shortname"] == shortname]
    stations = df["stid"].dropna().unique().tolist()

    # Collect the per-station frames and concatenate once at the end;
    # concatenating inside the loop re-copies all accumulated rows every iteration.
    station_frames = []

    for station in tqdm(stations):
        csv_text = get_historical_station_readings(
            stid=station, start=START_DATETIME, end=END_DATETIME
        )
        if not csv_text:
            # The request helper signals a failed download with an empty result;
            # there is nothing to parse for this station.
            continue
        # The first 6 lines of the csv response are skipped; the next two lines
        # form the two-level column header.
        station_frames.append(
            pd.read_csv(StringIO(csv_text), skiprows=6, header=[0, 1])
        )

    if not station_frames:
        # No matching stations (or every download failed)
        return pd.DataFrame()

    return pd.concat(station_frames)

In [7]:
async def get_station_data(
    stid: str, start: str, end: str, session: ClientSession
) -> str:
    """
    Return weather station csv string data given stid. Asynchronous.
    Start and end format = YYYYmmddHHMM.

    Returns an empty string when the API answers with a non-2xx status.
    """
    # Imported here because a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import urlencode

    # All times are requested in UTC, but may be returned in either UTC or Local time
    endpoint = f"{API_BASE_URL}stations/timeseries"
    params = {
        "token": API_TOKEN,
        "stid": stid,
        "start": start,
        "end": end,
        "obtimezone": "UTC",
        "output": "csv",
    }
    url = f"{endpoint}?{urlencode(params)}"
    # Log the request with the token masked so the credential does not end up
    # in saved notebook output.
    print(f"{endpoint}?{urlencode({**params, 'token': 'REDACTED'})}")
    # `async with` releases the response (and its connection) on every path,
    # including the early return for error statuses.
    async with session.request(method="GET", url=url) as r:
        if r.status not in range(200, 300):
            # TODO: add error handling
            print(f"Error! - {stid}")
            return ""
        return await r.text()


async def save_station_data(
    stid: str, start: str, end: str, session: ClientSession
) -> None:
    """
    Get and save weather station data to csv. Asynchronous.
    Start and end format = YYYYmmddHHMM.

    Writes ``<stid>.csv`` under ../../data/raw/stations/, creating the folder
    if needed. Nothing is written when the download returned no data.
    """
    base_output_path = "../../data/raw/stations/"
    r_text = await get_station_data(stid, start, end, session)
    if not r_text:
        # get_station_data returns "" for a failed request (and has already
        # printed the error); parsing an empty string would raise.
        return
    os.makedirs(base_output_path, exist_ok=True)
    # The first 6 lines of the csv response are skipped; the next two lines
    # form the two-level column header.
    r_df = pd.read_csv(StringIO(r_text), skiprows=6, header=[0, 1])
    r_df.to_csv(f"{base_output_path}{stid}.csv", index=False)

In [8]:
# %%time

# # Synchronous baseline test

# test_stations = ["BFDSD"]
# test_stations = ["BFDSD", "CVXSD"]
# test_stations = ["BFDSD", "CVXSD", "DJZSD", "HP016", "MGDSD"]

# for station in test_stations:
#     # get_historical_station_readings(
#     #     stid=station, start=START_DATETIME, end=END_DATETIME
#     # )
#     tmp = StringIO(
#         get_historical_station_readings(
#             stid=station, start=START_DATETIME, end=END_DATETIME
#         )
#     )
#     tmp_df = pd.read_csv(tmp, skiprows=6, header=[0, 1])
#     tmp_df.to_csv(f"../../data/raw/stations0/{station}.csv", index=False)

In [9]:
# Asynchronous

# stations = station_mappings_df["stid"].dropna().unique().tolist()

# start = time.time()

# my_conn = aiohttp.TCPConnector(limit=10)
# async with ClientSession(connector=my_conn) as session:
#     await asyncio.gather(
#         *[
#             save_station_data(station, START_DATETIME, END_DATETIME, session)
#             for station in stations
#         ]
#     )
# print(time.time() - start)

In [10]:
# Download every mapped station one network at a time and
# write a single raw csv per network

for network_shortname in WEATHER_NETWORKS.values():
    raw_network_path = f"../../data/raw/weather_{network_shortname}.csv"
    network_station_df = get_network_station_data(
        station_mappings_df, network_shortname
    )
    network_station_df.to_csv(raw_network_path, index=False)

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

In [11]:
# %%time

# Get for all stations as single file

# weather_df_created = 0

# for idx, station in enumerate(tqdm(stations)):
#     # print(station)
#     # get csv string
#     csv_string_io = StringIO(
#         get_historical_station_readings(
#             stid=station,
#             start=START_DATETIME,
#             end=END_DATETIME
#             # start="201606030000",
#             # end="201606030100",
#         )
#     )
#     # initial creation of df
#     if not weather_df_created:
#         weather_df = pd.read_csv(csv_string_io, skiprows=6, header=[0, 1])
#         weather_df_created = 1
#     # additional appends to df
#     else:
#         temp_df = pd.read_csv(csv_string_io, skiprows=6, header=[0, 1])
#         weather_df = pd.concat([weather_df, temp_df])

# # prev runtime: 1h 6min 56s

<hr>

## 2. Convert wind speed and direction to uv components

Reference: http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv

Directions are originally in "weather wind direction" and will be converted to "math wind direction" for the uv component calculation.<br>
A null wind speed or a null wind direction results in null u and v components (the null propagates through the calculation).

### Read raw weather data

In [20]:
# Raw per-network files; the first two rows form the two-level column header
raw_weather_paths = (
    "../../data/raw/weather_SDGE.csv",
    "../../data/raw/weather_HPWREN.csv",
    "../../data/raw/weather_SC-EDISON.csv",
)
weather_sdge_df, weather_hpwren_df, weather_sce_df = (
    pd.read_csv(raw_path, header=[0, 1]) for raw_path in raw_weather_paths
)

### Helper function to calculate uv components

In [21]:
def calc_uv_components(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the u/v wind vector component columns and return the same dataframe.

    Reads the ("wind_direction_set_1", "Degrees") and ("wind_speed_set_1", "m/s")
    columns and writes "wind_direction_math", "wind_direction_math_r", "u" and "v".
    The dataframe passed in is modified in place.
    """
    wind_speed = df["wind_speed_set_1"]["m/s"]

    # weather wind direction -> math wind direction
    math_degrees = 270 - df["wind_direction_set_1"]["Degrees"]
    # negative angles are shifted up by a full turn
    math_degrees = math_degrees.mask(math_degrees < 0, math_degrees + 360)
    df["wind_direction_math"] = math_degrees

    # same angle expressed in radians for the trig functions
    math_radians = np.radians(math_degrees)
    df["wind_direction_math_r"] = math_radians

    # project the wind speed onto the two axes
    df["u"] = wind_speed * np.cos(math_radians)
    df["v"] = wind_speed * np.sin(math_radians)
    return df

In [22]:
# Add the u/v wind component columns to each network's dataframe
weather_sdge_df, weather_hpwren_df, weather_sce_df = (
    calc_uv_components(network_df)
    for network_df in (weather_sdge_df, weather_hpwren_df, weather_sce_df)
)

In [23]:
weather_sdge_df.head()

Unnamed: 0_level_0,Station_ID,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,volt_set_1,wind_gust_set_1,wind_direction_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,...,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,fuel_temp_set_1,precip_accum_ten_minute_set_1,wind_direction_math,wind_direction_math_r,u,v
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,volts,m/s,Degrees,Celsius,Celsius,...,Pascals,Pascals.1,Celsius,Pascals,Celsius,Millimeters,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,BFDSD,2016-06-03T00:00:00Z,18.33,87.0,4.47,,6.26,280.0,16.11,,...,,,,,,,350.0,6.108652,4.402091,-0.776207
1,BFDSD,2016-06-03T00:10:00Z,18.33,87.0,4.02,,6.26,280.0,16.11,,...,,,,,,,350.0,6.108652,3.958927,-0.698066
2,BFDSD,2016-06-03T00:20:00Z,17.78,88.0,4.02,,5.37,275.0,15.75,,...,,,,,,,355.0,6.195919,4.004703,-0.350366
3,BFDSD,2016-06-03T00:30:00Z,17.78,88.0,4.02,,5.37,270.0,15.75,,...,,,,,,,0.0,0.0,4.02,0.0
4,BFDSD,2016-06-03T00:40:00Z,17.78,89.0,3.58,,5.37,271.0,15.93,,...,,,,,,,359.0,6.265732,3.579455,-0.06248


## 3. Clean weather datetimes / snap to 10 minute interval

### Set Date_Time column to datetime type

In [27]:
# Parse the timestamp strings of every network into real datetimes
for network_df in (weather_sdge_df, weather_hpwren_df, weather_sce_df):
    raw_timestamps = network_df["Date_Time"]["Unnamed: 1_level_1"]
    network_df["Date_Time"] = pd.to_datetime(raw_timestamps)

In [30]:
print(weather_sdge_df["Date_Time"].dtypes)
print(weather_hpwren_df["Date_Time"].dtypes)
print(weather_sce_df["Date_Time"].dtypes)

Unnamed: 1_level_1    datetime64[ns, UTC]
dtype: object
Unnamed: 1_level_1    datetime64[ns, UTC]
dtype: object
Unnamed: 1_level_1    datetime64[ns, UTC]
dtype: object


### Check interval counts not on 10 minute interval (ie 0...10...20...30...40...50)

In [44]:
def get_misaligned_datetime_count(df: pd.DataFrame) -> int:
    """
    Count the rows whose timestamp minute is not on a 10 minute mark
    (i.e. not one of 0, 10, 20, 30, 40, 50).
    The timestamps are read from the ["Date_Time"]["Unnamed: 1_level_1"] column.
    """
    minutes = df["Date_Time"]["Unnamed: 1_level_1"].dt.minute
    off_interval = (minutes % 10) != 0
    # every True in the mask is one misaligned row
    return int(off_interval.sum())

In [52]:
print(
    f"SDGE misaligned datetime count: {get_misaligned_datetime_count(weather_sdge_df)}"
)
print(
    f"SDGE misaligned datetime percentage: {get_misaligned_datetime_count(weather_sdge_df)/weather_sdge_df.shape[0]*100}"
)

SDGE misaligned datetime count: 9
SDGE misaligned datetime percentage: 4.59566473584068e-05


In [158]:
# Confirm if all seconds are 00
weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"].dt.second.value_counts()

0    19583665
Name: Unnamed: 1_level_1, dtype: int64

In [53]:
print(
    f"HPWREN misaligned datetime count: {get_misaligned_datetime_count(weather_hpwren_df)}"
)
print(
    f"HPWREN misaligned datetime percentage: {get_misaligned_datetime_count(weather_hpwren_df)/weather_hpwren_df.shape[0]*100}"
)

HPWREN misaligned datetime count: 2418717
HPWREN misaligned datetime percentage: 99.99772611887354


In [159]:
# Confirm if all seconds are 00
weather_hpwren_df["Date_Time"]["Unnamed: 1_level_1"].dt.second.value_counts()

0    2418772
Name: Unnamed: 1_level_1, dtype: int64

In [54]:
print(f"SCE misaligned datetime count: {get_misaligned_datetime_count(weather_sce_df)}")
print(
    f"SCE misaligned datetime percentage: {get_misaligned_datetime_count(weather_sce_df)/weather_sce_df.shape[0]*100}"
)

SCE misaligned datetime count: 0
SCE misaligned datetime percentage: 0.0


In [160]:
weather_sce_df["Date_Time"]["Unnamed: 1_level_1"].dt.second.value_counts()

0    3949232
Name: Unnamed: 1_level_1, dtype: int64

Weather networks to fix datetimes within:

- SDGE
- HPWREN

### Clean SDGE

In [55]:
weather_sdge_df[
    (weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute % 10) != 0
]

Unnamed: 0_level_0,Station_ID,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,volt_set_1,wind_gust_set_1,wind_direction_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,...,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,fuel_temp_set_1,precip_accum_ten_minute_set_1,wind_direction_math,wind_direction_math_r,u,v
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,volts,m/s,Degrees,Celsius,Celsius,...,Pascals,Pascals.1,Celsius,Pascals,Celsius,Millimeters,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53899,BFDSD,2017-07-10 14:45:00+00:00,,,,,,,,,...,,,,,,,,,,
53903,BFDSD,2017-07-10 15:15:00+00:00,,,,,,,,,...,,,,,,,,,,
53907,BFDSD,2017-07-10 15:45:00+00:00,,,,,,,,,...,,,,,,,,,,
10993594,MLGSD,2019-06-03 18:06:00+00:00,-17.78,,0.0,,0.0,,,,...,,,,,,,,,,
13001745,CHOSD,2017-07-10 14:45:00+00:00,,,,,,,,,...,,,,,,,,,,
13001749,CHOSD,2017-07-10 15:15:00+00:00,,,,,,,,,...,,,,,,,,,,
13001753,CHOSD,2017-07-10 15:45:00+00:00,,,,,,,,,...,,,,,,,,,,
15709417,AMOSD,2017-03-09 09:08:00+00:00,-17.78,,0.0,,0.0,,,,...,,,,,,,,,,
17450447,SHCSD,2020-04-23 18:48:00+00:00,22.19,28.72,2.4,13.51,4.41,222.0,3.13,,...,,,,,,,48.0,0.837758,1.605913,1.783548


#### Scenario: Additional timestamps added at random 5 min intervals

In this scenario the 10 minute intervals still exist, just some random mixed minute intervals seem to be sprinkled in. Drop these.

In [112]:
pd.set_option("display.max_rows", None)

weather_sdge_df[
    (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        >= datetime(2017, 7, 10, 14, 40, 0, 0, pytz.UTC)
    )
    & (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        < datetime(2017, 7, 10, 16, 0, 0, 0, pytz.UTC)
    )
    & (
        (weather_sdge_df["Station_ID"]["Unnamed: 0_level_1"] == "BFDSD")
        | (weather_sdge_df["Station_ID"]["Unnamed: 0_level_1"] == "CHOSD")
    )
]["Date_Time"]["Unnamed: 1_level_1"]

53898      2017-07-10 14:40:00+00:00
53899      2017-07-10 14:45:00+00:00
53900      2017-07-10 14:50:00+00:00
53901      2017-07-10 15:00:00+00:00
53902      2017-07-10 15:10:00+00:00
53903      2017-07-10 15:15:00+00:00
53904      2017-07-10 15:20:00+00:00
53905      2017-07-10 15:30:00+00:00
53906      2017-07-10 15:40:00+00:00
53907      2017-07-10 15:45:00+00:00
53908      2017-07-10 15:50:00+00:00
13001744   2017-07-10 14:40:00+00:00
13001745   2017-07-10 14:45:00+00:00
13001746   2017-07-10 14:50:00+00:00
13001747   2017-07-10 15:00:00+00:00
13001748   2017-07-10 15:10:00+00:00
13001749   2017-07-10 15:15:00+00:00
13001750   2017-07-10 15:20:00+00:00
13001751   2017-07-10 15:30:00+00:00
13001752   2017-07-10 15:40:00+00:00
13001753   2017-07-10 15:45:00+00:00
13001754   2017-07-10 15:50:00+00:00
Name: Unnamed: 1_level_1, dtype: datetime64[ns, UTC]

In [120]:
weather_sdge_df[
    (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        >= datetime(2019, 6, 3, 18, 0, 0, 0, pytz.UTC)
    )
    & (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        < datetime(2019, 6, 3, 18, 50, 0, 0, pytz.UTC)
    )
    & (weather_sdge_df["Station_ID"]["Unnamed: 0_level_1"] == "MLGSD")
]["Date_Time"]["Unnamed: 1_level_1"]

10993593   2019-06-03 18:00:00+00:00
10993594   2019-06-03 18:06:00+00:00
10993595   2019-06-03 18:10:00+00:00
10993596   2019-06-03 18:20:00+00:00
10993597   2019-06-03 18:30:00+00:00
10993598   2019-06-03 18:40:00+00:00
Name: Unnamed: 1_level_1, dtype: datetime64[ns, UTC]

In [121]:
weather_sdge_df[
    (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        >= datetime(2017, 3, 9, 9, 0, 0, 0, pytz.UTC)
    )
    & (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        < datetime(2017, 3, 9, 10, 0, 0, 0, pytz.UTC)
    )
    & (weather_sdge_df["Station_ID"]["Unnamed: 0_level_1"] == "AMOSD")
]["Date_Time"]["Unnamed: 1_level_1"]

15709416   2017-03-09 09:00:00+00:00
15709417   2017-03-09 09:08:00+00:00
15709418   2017-03-09 09:10:00+00:00
15709419   2017-03-09 09:20:00+00:00
15709420   2017-03-09 09:30:00+00:00
15709421   2017-03-09 09:40:00+00:00
15709422   2017-03-09 09:50:00+00:00
Name: Unnamed: 1_level_1, dtype: datetime64[ns, UTC]

In [128]:
weather_sdge_df[
    (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        >= datetime(2020, 4, 23, 15, 0, 0, 0, pytz.UTC)
    )
    & (
        weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
        < datetime(2020, 4, 24, 0, 0, 0, 0, pytz.UTC)
    )
    & (weather_sdge_df["Station_ID"]["Unnamed: 0_level_1"] == "SHCSD")
]["Date_Time"]["Unnamed: 1_level_1"]

17450444   2020-04-23 15:00:00+00:00
17450445   2020-04-23 15:10:00+00:00
17450446   2020-04-23 15:20:00+00:00
17450447   2020-04-23 18:48:00+00:00
17450448   2020-04-23 19:10:00+00:00
17450449   2020-04-23 19:20:00+00:00
17450450   2020-04-23 19:30:00+00:00
17450451   2020-04-23 19:40:00+00:00
17450452   2020-04-23 19:50:00+00:00
17450453   2020-04-23 20:00:00+00:00
17450454   2020-04-23 20:10:00+00:00
17450455   2020-04-23 20:20:00+00:00
17450456   2020-04-23 20:30:00+00:00
17450457   2020-04-23 20:40:00+00:00
17450458   2020-04-23 20:50:00+00:00
17450459   2020-04-23 21:00:00+00:00
17450460   2020-04-23 21:10:00+00:00
17450461   2020-04-23 21:20:00+00:00
17450462   2020-04-23 21:30:00+00:00
17450463   2020-04-23 21:40:00+00:00
17450464   2020-04-23 21:50:00+00:00
17450465   2020-04-23 22:00:00+00:00
17450466   2020-04-23 22:10:00+00:00
17450467   2020-04-23 22:20:00+00:00
17450468   2020-04-23 22:30:00+00:00
17450469   2020-04-23 22:40:00+00:00
17450470   2020-04-23 22:50:00+00:00
1

In this scenario, we actually see a temporary outage starting at 15:30 and fully resuming at 19:10.<br>
There is 1 mixed reading in between at 18:48. Drop this both for simplicity and because it may not be accurate during the outage.

In [129]:
pd.reset_option("display.max_rows")

As reviewed above, we are okay to drop all the mixed intervals for the sdge weather network readings.

In [131]:
weather_sdge_df.shape

(19583674, 26)

In [143]:
# Remove the 9 SDGE readings that do not sit on a 10 minute mark
off_interval_mask = (
    weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute % 10
) != 0
index_list_to_drop = weather_sdge_df.index[off_interval_mask].tolist()

weather_sdge_df.drop(index=index_list_to_drop, inplace=True)

# Re-run the same filter: no rows should remain
weather_sdge_df[
    (weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute % 10) != 0
]

Unnamed: 0_level_0,Station_ID,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,volt_set_1,wind_gust_set_1,wind_direction_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,...,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,fuel_temp_set_1,precip_accum_ten_minute_set_1,wind_direction_math,wind_direction_math_r,u,v
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,volts,m/s,Degrees,Celsius,Celsius,...,Pascals,Pascals.1,Celsius,Pascals,Celsius,Millimeters,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [144]:
weather_sdge_df.shape

(19583665, 26)

In [218]:
get_misaligned_datetime_count(weather_sdge_df)

0

### Clean HPWREN

In [151]:
weather_hpwren_df[
    (weather_hpwren_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute % 10) != 0
].head(10)

Unnamed: 0_level_0,Station_ID,Date_Time,pressure_set_1,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,precip_accum_one_hour_set_1,precip_accum_set_1,...,altimeter_set_1d,precip_accum_five_minute_set_1,solar_radiation_set_1,precip_accum_one_minute_set_1,fuel_temp_set_1,fuel_moisture_set_1,wind_direction_math,wind_direction_math_r,u,v
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Pascals,Celsius,%,m/s,Degrees,m/s,Millimeters,Millimeters,...,Pascals,Millimeters,W/m**2,Millimeters,Celsius,gm,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,HP016,2016-06-03 00:07:00+00:00,87940.0,26.9,31.3,4.0,309.0,5.9,,,...,101803.11,,,,,,321.0,5.602507,3.108584,-2.517282
1,HP016,2016-06-03 00:17:00+00:00,87930.0,27.4,29.8,6.2,279.0,7.5,,,...,101791.53,,,,,,351.0,6.126106,6.123668,-0.969894
2,HP016,2016-06-03 00:27:00+00:00,87920.0,26.8,33.5,6.0,289.0,6.8,,,...,101779.96,,,,,,341.0,5.951573,5.673111,-1.953409
3,HP016,2016-06-03 00:37:00+00:00,87910.0,26.4,36.6,4.5,262.0,5.7,,,...,101768.38,,,,,,8.0,0.139626,4.456206,0.626279
4,HP016,2016-06-03 00:47:00+00:00,87910.0,26.4,38.7,3.3,273.0,4.4,,,...,101768.38,,,,,,357.0,6.230825,3.295477,-0.172709
5,HP016,2016-06-03 00:57:00+00:00,87910.0,26.6,34.6,5.9,278.0,6.8,,,...,101768.38,,,,,,352.0,6.143559,5.842582,-0.821121
6,HP016,2016-06-03 01:07:00+00:00,87910.0,26.3,33.6,5.7,294.0,6.6,,,...,101768.38,,,,,,336.0,5.864306,5.207209,-2.318399
7,HP016,2016-06-03 01:17:00+00:00,87920.0,26.0,36.3,5.5,281.0,6.2,,,...,101779.96,,,,,,349.0,6.091199,5.39895,-1.049449
8,HP016,2016-06-03 01:27:00+00:00,87920.0,26.0,34.6,5.0,282.0,6.1,,,...,101779.96,,,,,,348.0,6.073746,4.890738,-1.039558
9,HP016,2016-06-03 01:37:00+00:00,87920.0,25.9,32.3,3.5,275.0,4.6,,,...,101779.96,,,,,,355.0,6.195919,3.486681,-0.305045


In [152]:
weather_hpwren_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute.value_counts()

55    129188
25    129158
45    129121
15    129113
5     129052
35    128991
33     96986
3      96968
43     96712
13     96701
53     96655
23     96641
56     87062
26     86995
46     86993
16     86987
6      86975
36     86965
14     60073
44     60061
54     60058
24     60039
34     59807
4      59803
27     30204
37     30202
47     30195
7      30187
17     30185
57     30183
38        80
18        65
28        59
48        56
8         49
58        29
10        14
11        13
39        13
59        12
19        11
29        11
9         10
40        10
0          9
30         8
50         8
49         7
42         7
2          6
12         6
20         6
52         4
51         4
1          4
41         4
21         3
32         2
31         1
22         1
Name: Unnamed: 1_level_1, dtype: int64

#### Scenario: Mixed timestamps

HPWREN stations are not synced and are reading at mixed intervals that could start at 5, 6, 3, 4, 7, etc. <br>
We will need to snap these readings to their nearest 10 minute interval to properly aggregate with the other stations.<br>

See below for rounding:

Anything above 0, should round up to the next interval. In production, if the latest reading we have is from 7:03, that data will not exist for the 7:00 interval as it shouldn't be able to see into the future. Therefore, this number will be rounded up to be grouped with the next interval.

To achieve this, each timestamp will be rounded up (ceiling) to the next 10 minute mark; timestamps already on a 10 minute mark are left unchanged.

In [206]:
# Round every HPWREN timestamp up to the next 10 minute mark (timestamps that
# are already on a mark stay as they are).
# "10min" is used because the equivalent "10T" alias is deprecated in recent
# pandas releases.
weather_hpwren_df["Date_Time_Snapped"] = weather_hpwren_df["Date_Time"][
    "Unnamed: 1_level_1"
].dt.ceil(freq="10min")
# Show original and snapped timestamps side by side as a spot check
weather_hpwren_df[["Date_Time", "Date_Time_Snapped"]].head()

Unnamed: 0_level_0,Date_Time,Date_Time_Snapped
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2016-06-03 00:07:00+00:00,2016-06-03 00:10:00+00:00
1,2016-06-03 00:17:00+00:00,2016-06-03 00:20:00+00:00
2,2016-06-03 00:27:00+00:00,2016-06-03 00:30:00+00:00
3,2016-06-03 00:37:00+00:00,2016-06-03 00:40:00+00:00
4,2016-06-03 00:47:00+00:00,2016-06-03 00:50:00+00:00


In [212]:
# Overwrite the original timestamps with the snapped values, then remove the
# temporary helper column so only "Date_Time" remains.
weather_hpwren_df["Date_Time"] = weather_hpwren_df["Date_Time_Snapped"]
weather_hpwren_df.drop(columns=["Date_Time_Snapped"], inplace=True)

  weather_hpwren_df.drop(columns=["Date_Time_Snapped"], inplace=True)


In [215]:
weather_hpwren_df[
    (weather_hpwren_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute % 10) != 0
].head(10)

Unnamed: 0_level_0,Station_ID,Date_Time,pressure_set_1,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,precip_accum_one_hour_set_1,precip_accum_set_1,...,altimeter_set_1d,precip_accum_five_minute_set_1,solar_radiation_set_1,precip_accum_one_minute_set_1,fuel_temp_set_1,fuel_moisture_set_1,wind_direction_math,wind_direction_math_r,u,v
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Pascals,Celsius,%,m/s,Degrees,m/s,Millimeters,Millimeters,...,Pascals,Millimeters,W/m**2,Millimeters,Celsius,gm,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [216]:
weather_hpwren_df["Date_Time"]["Unnamed: 1_level_1"].dt.minute.value_counts()

0     403204
50    403164
20    403160
30    403119
10    403068
40    403057
Name: Unnamed: 1_level_1, dtype: int64

In [217]:
get_misaligned_datetime_count(weather_hpwren_df)

0

### Write processed data to csv keeping only desired columns

In [219]:
# Top-level column names kept in the processed output files:
# station id, timestamp, the selected weather readings, and the computed
# u/v wind components.
columns = [
    "Station_ID",
    "Date_Time",
    "air_temp_set_1",
    "relative_humidity_set_1",
    "wind_speed_set_1",
    "wind_gust_set_1",
    "wind_direction_set_1",
    "dew_point_temperature_set_1d",
    "u",
    "v",
]

In [220]:
# Write one processed csv per network, restricted to the desired columns
processed_outputs = (
    (weather_sdge_df, "../../data/processed/weather_SDGE.csv"),
    (weather_hpwren_df, "../../data/processed/weather_HPWREN.csv"),
    (weather_sce_df, "../../data/processed/weather_SC-EDISON.csv"),
)
for network_df, processed_path in processed_outputs:
    network_df[columns].to_csv(processed_path, index=False)