# Exploratory: Check column null percentage

Summary: <br>
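This notebook explores the weather station data for the SDG&E, HPWREN and SC-Edison networks that was extracted via the Synoptic API, focusing on the percentage of null values per column and per station.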

Open Questions: <br>

In [1]:
import pandas as pd

# Lift pandas' display truncation so every row and every column of a
# displayed frame is shown (None means "no limit").
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

### Read in raw data

In [2]:
%%time
# Load the raw CSV for each weather network (SDGE, HPWREN, SC-EDISON).
# header=[0, 1] makes pandas build two-level column labels from the first two
# rows of each file, so columns are addressed as (name, second-level label).
weather_sdge_df = pd.read_csv("../../data/raw/weather_SDGE.csv", header=[0, 1])
weather_hpwren_df = pd.read_csv("../../data/raw/weather_HPWREN.csv", header=[0, 1])
weather_scedison_df = pd.read_csv("../../data/raw/weather_SC-EDISON.csv", header=[0, 1])

CPU times: user 30.9 s, sys: 7.02 s, total: 37.9 s
Wall time: 38 s


### Helper functions to calculate attribute null percentage and station null percentage

In [3]:
def calc_att_null_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the share of missing values in every column of a weather dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Weather observations; any column layout is accepted.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with a single ``null_percentage`` column
        holding the percentage (0-100) of null entries in that column.
    """
    total_rows = len(df)
    null_counts = df.isna().sum()
    null_share = null_counts.div(total_rows).mul(100)
    return null_share.to_frame(name="null_percentage")

In [4]:
def calc_station_null_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return dataframe with null percentage of each column for each station given a weather data dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Weather observations containing a ``Station_ID`` column. Columns may be
        flat or two-level (as produced by ``pd.read_csv(..., header=[0, 1])``);
        only the top level is used to locate ``Station_ID``.

    Returns
    -------
    pd.DataFrame
        Indexed by ``Station_ID`` (sorted), with one column per non-station
        column of ``df``. Each value is the percentage (0-100) of that
        station's rows in which the column is null.

    Raises
    ------
    KeyError
        If ``df`` has no ``Station_ID`` column.
    """
    # Locate the station column by its top-level label only, so the result does
    # not depend on the auto-generated second-level name ("Unnamed: N_level_1")
    # or on the position of any column in the CSV.
    top_level_labels = df.columns.get_level_values(0)
    is_station_col = top_level_labels == "Station_ID"
    if not is_station_col.any():
        raise KeyError("Station_ID")

    # Plain array of station ids: grouping by an array avoids index alignment
    # and yields scalar group keys (no 1-tuples to unwrap afterwards).
    station_ids = df.loc[:, is_station_col].iloc[:, 0].to_numpy()
    null_flags = df.loc[:, ~is_station_col].isna()

    grouped = null_flags.groupby(station_ids)
    # Null count of each column for each station.
    null_counts = grouped.sum()
    # Total number of rows per station. size() counts every row, whereas
    # counting a data column would skip rows where that column is null.
    row_counts = grouped.size()

    # Vectorised row-wise division; produces a float frame directly instead of
    # writing floats row by row into integer columns.
    station_null_per_df = null_counts.div(row_counts, axis=0) * 100
    return station_null_per_df.rename_axis(index="Station_ID")

### Calc null percentages for each network

#### SDGE

In [5]:
# (row count, column count) of the raw SDGE frame.
weather_sdge_df.shape

(19614476, 22)

In [6]:
%%time
# Null percentage of every column over the whole raw SDGE frame.
calc_att_null_percentage(weather_sdge_df)

CPU times: user 6.61 s, sys: 60.1 ms, total: 6.67 s
Wall time: 6.66 s


Unnamed: 0,Unnamed: 1,null_percentage
Station_ID,Unnamed: 0_level_1,0.0
Date_Time,Unnamed: 1_level_1,0.0
air_temp_set_1,Celsius,0.199113
relative_humidity_set_1,%,0.15641
wind_speed_set_1,m/s,0.027709
volt_set_1,volts,63.775025
wind_gust_set_1,m/s,0.003671
wind_direction_set_1,Degrees,3.977985
dew_point_temperature_set_1d,Celsius,0.204645
wind_chill_set_1d,Celsius,97.645413


In [None]:
%%time
# Null percentage of every column, broken down per SDGE station; the bare
# name on the last line displays the result.
sdge_station_null_per_df = calc_station_null_percentage(weather_sdge_df)
sdge_station_null_per_df

#### HPWREN

In [None]:
# (row count, column count) of the raw HPWREN frame.
weather_hpwren_df.shape

In [None]:
%%time
# Null percentage of every column over the whole raw HPWREN frame.
calc_att_null_percentage(weather_hpwren_df)

In [None]:
%%time
# Null percentage of every column, broken down per HPWREN station; the bare
# name on the last line displays the result.
hpwren_station_null_per_df = calc_station_null_percentage(weather_hpwren_df)
hpwren_station_null_per_df

# via trial and error there are 4 hpwren stations with > 20% null percentage for wind speed; these will be manually filtered out and reprocessed
# HP003, HP006 (100%), HP015, HP025

#### SC-EDISON

In [None]:
# (row count, column count) of the raw SC-EDISON frame.
weather_scedison_df.shape

In [None]:
%%time
# Null percentage of every column over the whole raw SC-EDISON frame.
calc_att_null_percentage(weather_scedison_df)

In [None]:
%%time
# Null percentage of every column, broken down per SC-EDISON station; the bare
# name on the last line displays the result.
scedison_station_null_per_df = calc_station_null_percentage(weather_scedison_df)
scedison_station_null_per_df

## Check wind direction nulls

Is wind direction null only when wind speed is 0? -> Primarily yes. Wind speed=0 accounts for 99.99% of null wind directions.

In [None]:
# Keep only SDGE rows whose wind direction is null, then show what percentage
# of those rows falls on each wind-speed value (normalize=True gives
# fractions; * 100 turns them into percentages).
weather_sdge_df[weather_sdge_df["wind_direction_set_1"]["Degrees"].isna()][
    "wind_speed_set_1"
]["m/s"].value_counts(normalize=True) * 100

In [None]:
# Keep only HPWREN rows whose wind direction is null, then show what
# percentage of those rows falls on each wind-speed value.
weather_hpwren_df[weather_hpwren_df["wind_direction_set_1"]["Degrees"].isna()][
    "wind_speed_set_1"
]["m/s"].value_counts(normalize=True) * 100

In [None]:
# Keep only SC-EDISON rows whose wind direction is null, then show what
# percentage of those rows falls on each wind-speed value.
weather_scedison_df[weather_scedison_df["wind_direction_set_1"]["Degrees"].isna()][
    "wind_speed_set_1"
]["m/s"].value_counts(normalize=True) * 100

### Read in processed data

In [None]:
%%time
# Load the processed CSV for each weather network, again with two-level
# column labels built from the first two rows (header=[0, 1]).
weather_sdge_df2 = pd.read_csv("../../data/processed/weather_SDGE.csv", header=[0, 1])
weather_hpwren_df2 = pd.read_csv(
    "../../data/processed/weather_HPWREN.csv", header=[0, 1]
)
weather_scedison_df2 = pd.read_csv(
    "../../data/processed/weather_SC-EDISON.csv", header=[0, 1]
)

### Concat to single weather dataframe

In [None]:
# Stack the three processed network frames row-wise into one frame.
# ignore_index is not set, so each source frame keeps its own row labels and
# the combined index repeats them.
weather_df2 = pd.concat([weather_sdge_df2, weather_hpwren_df2, weather_scedison_df2])

### Check interval counts

Are all stations on 10 minute intervals?

In [None]:
# Split every Date_Time string at "T"; expand=True puts the pieces into
# separate columns (0 = text before the first "T", 1 = text after it).
# NOTE(review): this reads the raw SDGE frame only, while the question above
# asks about all stations and the concatenated weather_df2 is not used
# anywhere in the visible cells - confirm which frame is intended.
interval_df = weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"].str.split(
    "T", expand=True
)

In [None]:
# Count how often each distinct value of column 1 (the text after "T",
# presumably the time of day) occurs, ordered by that value.
interval_df[[1]].value_counts().sort_index()