# Exploratory: Check column null percentage

Summary: <br>
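This notebook explores the raw weather station data for the SDG&E, HPWREN, and SC-Edison networks that was extracted via the Synoptic API. It checks per-column and per-station null percentages, when wind direction is null, and the reporting interval of the observations.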

Open Questions: <br>

In [1]:
import pandas as pd

# Show every row and every column when a frame is displayed (no truncation).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

### Read in raw data

In [2]:
%%time
# header=[0, 1] parses the first two CSV rows as a two-level column header, so
# columns are addressed as e.g. df["air_temp_set_1"]["Celsius"]; Station_ID and
# Date_Time get auto-generated "Unnamed: N_level_1" second-level labels.
weather_sdge_df = pd.read_csv("../../data/raw/weather_SDGE.csv", header=[0, 1])
weather_hpwren_df = pd.read_csv("../../data/raw/weather_HPWREN.csv", header=[0, 1])
weather_scedison_df = pd.read_csv("../../data/raw/weather_SC-EDISON.csv", header=[0, 1])

CPU times: user 28.9 s, sys: 6.62 s, total: 35.5 s
Wall time: 35.5 s


### Helper functions to calculate attribute null percentage and station null percentage

In [3]:
def calc_att_null_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the share of null values in every column of a weather dataframe.

    The result is indexed by the input's column labels and has a single
    column, "null_percentage", holding values on a 0-100 scale.
    """
    total_rows = len(df)
    null_counts = df.isna().sum()
    # nulls per column / total rows, expressed as a percentage
    return (null_counts / total_rows * 100).to_frame(name="null_percentage")

In [4]:
def calc_station_null_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return dataframe with null percentage of each column for each station given a weather data dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Weather observations with a "Station_ID" column. The columns may be
        flat or a MultiIndex (e.g. read with ``header=[0, 1]``), in which case
        "Station_ID" is looked up on the first level.

    Returns
    -------
    pd.DataFrame
        One row per station (index named "Station_ID", sorted by station) and
        one column per non-station input column; values are the percentage
        (0-100) of that station's rows in which the column is null.

    Raises
    ------
    KeyError
        If ``df`` has no "Station_ID" column.
    """
    station_ids = df["Station_ID"]
    if isinstance(station_ids, pd.DataFrame):
        # With multi-level headers, df["Station_ID"] is a one-column frame;
        # reduce it to the underlying Series of station labels.
        station_ids = station_ids.iloc[:, 0]
    station_ids = station_ids.rename("Station_ID")

    # Null flags for every column except the station identifier itself.
    # get_level_values(0) works for both flat and MultiIndex columns.
    keep_cols = df.columns.get_level_values(0) != "Station_ID"
    null_flags = df.isna().loc[:, keep_cols]

    grouped = null_flags.groupby(station_ids)
    # get null counts of each column for each station
    null_counts = grouped.sum()
    # Total rows per station. size() counts rows regardless of nulls, so a
    # missing Date_Time can no longer shrink the denominator.
    row_counts = grouped.size()

    # calc null percentage (row-wise division aligned on the station index)
    station_null_per_df = null_counts.div(row_counts, axis=0) * 100
    station_null_per_df.index.name = "Station_ID"
    return station_null_per_df

### Calc null percentages for each network

#### SDGE

In [5]:
# (rows, columns) of the raw SDGE frame; the row count is the denominator
# calc_att_null_percentage uses for the network-wide percentages.
weather_sdge_df.shape

(19614476, 22)

In [6]:
%%time
# Null percentage of each column over all SDGE rows (stations pooled together).
calc_att_null_percentage(weather_sdge_df)

CPU times: user 6.49 s, sys: 55.9 ms, total: 6.54 s
Wall time: 6.55 s


Unnamed: 0,Unnamed: 1,null_percentage
Station_ID,Unnamed: 0_level_1,0.0
Date_Time,Unnamed: 1_level_1,0.0
air_temp_set_1,Celsius,0.199113
relative_humidity_set_1,%,0.15641
wind_speed_set_1,m/s,0.027709
volt_set_1,volts,63.775025
wind_gust_set_1,m/s,0.003671
wind_direction_set_1,Degrees,3.977985
dew_point_temperature_set_1d,Celsius,0.204645
wind_chill_set_1d,Celsius,97.645413


In [7]:
%%time
# Null percentage of each column broken down per SDGE station; the bare name
# on the last line displays the resulting frame.
sdge_station_null_per_df = calc_station_null_percentage(weather_sdge_df)
sdge_station_null_per_df

CPU times: user 29.3 s, sys: 4.83 s, total: 34.2 s
Wall time: 34.2 s


Unnamed: 0_level_0,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,volt_set_1,wind_gust_set_1,wind_direction_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,wind_cardinal_direction_set_1d,heat_index_set_1d,altimeter_set_1,pressure_set_1,solar_radiation_set_1,pressure_set_1d,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,fuel_temp_set_1,precip_accum_ten_minute_set_1
Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,volts,m/s,Degrees,Celsius,Celsius,code,Celsius,Pascals,Pascals,W/m**2,Pascals,Pascals,Pascals.1,Celsius,Pascals,Celsius,Millimeters
Station_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
ARHSD,0,0.0,0.001135,0.027992,63.709818,0.0,8.559983,0.001135,99.47155,8.559983,88.066561,100,100,100.0,100,100,100,100,100,100,100
AVOSD,0,0.0,0.000374,0.021668,65.690494,0.0,8.577844,0.000374,99.982442,8.577844,91.244704,100,100,100.0,100,100,100,100,100,100,100
BFDSD,0,0.00114,0.00152,0.033061,63.703245,0.00114,0.762294,0.00152,99.336508,0.762294,98.798418,100,100,100.0,100,100,100,100,100,100,100
BLCSD,0,0.0,0.0,0.028577,65.804831,0.0,3.724844,0.0,99.943597,3.724844,86.330957,100,100,100.0,100,100,100,100,100,100,100
BLUSD,0,0.0,0.001026,0.06668,1.815757,0.0,1.880386,0.001026,98.778211,1.880386,92.055806,100,100,100.0,100,100,100,100,100,100,100
BMRSD,0,0.0,0.0,0.065547,1.812782,0.0,0.572511,0.0,98.611225,0.572511,96.568005,100,100,100.0,100,100,100,100,100,100,100
BMSSD,0,0.000376,0.001504,0.036859,63.906859,0.000752,2.219456,0.001504,98.025771,2.219456,85.216041,100,100,100.0,100,100,100,100,100,100,100
BOBSD,0,0.0,0.0,0.041058,0.0,0.0,0.246348,0.0,98.51165,0.246348,90.440346,100,100,100.0,100,100,100,100,100,100,100
BOCSD,0,0.0,0.000377,0.028281,66.226998,0.0,0.436652,0.000377,96.982278,0.436652,89.423831,100,100,100.0,100,100,100,100,100,100,100
BRGSD,0,0.0,0.001886,0.022259,63.867217,0.0,2.698216,0.001886,99.200193,2.698216,60.532551,100,100,100.0,100,100,100,100,100,100,100


#### HPWREN

In [8]:
# (rows, columns) of the raw HPWREN frame; the row count is the denominator
# calc_att_null_percentage uses for the network-wide percentages.
weather_hpwren_df.shape

(2418771, 23)

In [9]:
%%time
# Null percentage of each column over all HPWREN rows (stations pooled together).
calc_att_null_percentage(weather_hpwren_df)

CPU times: user 866 ms, sys: 12.7 ms, total: 878 ms
Wall time: 886 ms


Unnamed: 0,Unnamed: 1,null_percentage
Station_ID,Unnamed: 0_level_1,0.0
Date_Time,Unnamed: 1_level_1,0.0
pressure_set_1,Pascals,1.012126
air_temp_set_1,Celsius,1.01225
relative_humidity_set_1,%,1.012539
wind_speed_set_1,m/s,1.837007
wind_direction_set_1,Degrees,1.851353
wind_gust_set_1,m/s,1.838041
precip_accum_one_hour_set_1,Millimeters,100.0
precip_accum_set_1,Millimeters,84.761931


In [10]:
%%time
# Null percentage of each column broken down per HPWREN station.
hpwren_station_null_per_df = calc_station_null_percentage(weather_hpwren_df)
hpwren_station_null_per_df

# Finding (via trial and error): four HPWREN stations have a wind speed null
# percentage above 20%: HP003, HP006 (100%), HP015 and HP025.
# These will be manually filtered out and reprocessed.

CPU times: user 3.66 s, sys: 352 ms, total: 4.01 s
Wall time: 4.01 s


Unnamed: 0_level_0,Date_Time,pressure_set_1,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,precip_accum_one_hour_set_1,precip_accum_set_1,volt_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,wind_cardinal_direction_set_1d,sea_level_pressure_set_1d,heat_index_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,precip_accum_five_minute_set_1,solar_radiation_set_1,precip_accum_one_minute_set_1,fuel_temp_set_1,fuel_moisture_set_1
Unnamed: 0_level_1,Unnamed: 1_level_1,Pascals,Celsius,%,m/s,Degrees,m/s,Millimeters,Millimeters,volts,Celsius,Celsius,code,Pascals,Celsius,Celsius,Pascals,Millimeters,W/m**2,Millimeters,Celsius,gm
Station_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
HP001,0,1.303828,1.303828,1.303828,7.465208,7.470773,7.469978,100,85.481005,85.480608,1.303828,65.413985,7.470773,1.303828,98.10309,100,1.303828,100,100,100,100,100
HP002,0,0.63991,0.63991,0.64035,6.643691,6.645454,6.644132,100,84.769531,84.769531,0.64035,92.850816,6.645454,0.64035,93.455909,100,0.63991,100,100,100,100,100
HP004,0,1.308221,1.308623,1.308221,0.290224,0.30552,0.290224,100,87.713592,87.713592,1.309026,99.973031,0.30552,1.309428,89.520547,100,1.308221,100,100,100,100,100
HP005,0,0.857596,0.857596,0.857596,0.013425,0.020532,0.013425,100,83.935862,83.935467,0.857596,97.109363,0.020532,0.857596,92.339674,100,0.857596,100,100,100,100,100
HP007,0,1.192226,1.192226,1.192226,0.005503,0.022013,0.005896,100,85.096188,85.096188,1.192226,99.345907,0.022013,1.192226,91.625327,100,1.192226,100,100,100,100,100
HP009,0,0.912596,0.912596,0.912596,0.425801,0.513263,0.426377,100,82.673917,82.673917,0.912596,99.977559,0.513263,0.912596,94.555498,100,0.912596,100,100,100,100,100
HP010,0,0.779624,0.779624,0.779624,0.06657,0.080166,0.067039,100,83.085023,83.085491,0.779624,99.155212,0.080166,0.779624,88.803045,100,0.779624,100,100,100,100,100
HP014,0,1.0479,1.048326,1.05088,3.025376,3.025802,3.028355,100,83.605594,83.605594,1.051305,91.423561,3.025802,1.051305,97.295974,100,1.0479,100,100,100,100,100
HP016,0,1.136095,1.136095,1.136095,0.021581,0.02415,0.021581,100,91.066419,91.065391,1.136095,89.009013,0.02415,1.136095,93.401297,100,1.136095,100,100,100,100,100
HP019,0,0.554817,0.554817,0.557874,2.70072,2.70072,2.70072,100,82.096077,82.096077,0.557874,72.781879,2.70072,0.557874,97.308451,100,0.554817,100,100,100,100,100


#### SC-EDISON

In [11]:
# (rows, columns) of the raw SC-EDISON frame; the row count is the denominator
# calc_att_null_percentage uses for the network-wide percentages.
weather_scedison_df.shape

(2995134, 13)

In [12]:
%%time
# Null percentage of each column over all SC-EDISON rows (stations pooled together).
calc_att_null_percentage(weather_scedison_df)

CPU times: user 900 ms, sys: 11.8 ms, total: 912 ms
Wall time: 908 ms


Unnamed: 0,Unnamed: 1,null_percentage
Station_ID,Unnamed: 0_level_1,0.0
Date_Time,Unnamed: 1_level_1,0.0
air_temp_set_1,Celsius,0.112983
relative_humidity_set_1,%,0.151946
wind_speed_set_1,m/s,0.077793
wind_direction_set_1,Degrees,1.818082
wind_gust_set_1,m/s,0.081031
volt_set_1,volts,18.718194
solar_radiation_set_1,W/m**2,49.153126
dew_point_temperature_set_1d,Celsius,0.15218


In [13]:
%%time
# Null percentage of each column broken down per SC-EDISON station; the bare
# name on the last line displays the resulting frame.
scedison_station_null_per_df = calc_station_null_percentage(weather_scedison_df)
scedison_station_null_per_df

CPU times: user 4.28 s, sys: 201 ms, total: 4.48 s
Wall time: 4.54 s


Unnamed: 0_level_0,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,volt_set_1,solar_radiation_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,wind_cardinal_direction_set_1d,heat_index_set_1d
Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,Degrees,m/s,volts,W/m**2,Celsius,Celsius,code,Celsius
Station_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
094SE,0,0.0,0.0,0.037168,2.843338,0.0,0.0,100.0,0.0,100.0,2.843338,81.378926
152SE,0,0.0,0.0,0.030404,2.645181,0.0,0.0,100.0,0.0,100.0,2.645181,99.452721
SE003,0,0.0,0.004751,0.061763,0.043551,0.211418,28.448016,35.532505,0.004751,95.015441,0.043551,92.00095
SE007,0,0.0,0.0,0.087918,1.558379,0.0,40.643536,40.660252,0.0,100.0,1.558379,94.178214
SE024,0,0.0,0.000681,0.071552,0.08927,0.0,35.295681,35.325665,0.000681,97.516798,0.08927,97.784607
SE031,0,2.124435,2.840127,0.072953,7.359424,0.001887,39.78567,66.064387,2.840755,100.0,7.359424,93.221682
SE072,0,0.0,0.0,0.072141,1.497428,0.0,35.382043,35.423844,0.0,99.958199,1.497428,91.455694
SE073,0,0.0,0.0,0.086196,1.661302,0.000673,35.382295,67.109995,0.0,99.94074,1.661302,92.94738
SE076,0,0.0,0.001348,0.090335,1.101546,0.002022,35.316206,66.872055,0.001348,99.927867,1.101546,85.935404
SE121,0,0.0,0.000712,0.084749,10.345834,0.001424,32.273847,65.087527,0.000712,100.0,10.345834,92.445198


## Check wind direction nulls

Is wind direction null only when wind speed is 0? -> Primarily yes. Wind speed = 0 accounts for at least 99.9% of null wind directions in every network (SDGE 99.99%, HPWREN 100%, SC-EDISON 99.90%).

In [14]:
# Rows of the raw SDGE frame whose wind direction reading is missing.
sdge_direction_missing = weather_sdge_df["wind_direction_set_1"]["Degrees"].isna()
# Percentage distribution of the wind speed values recorded on those rows.
sdge_speed = weather_sdge_df["wind_speed_set_1"]["m/s"]
sdge_speed[sdge_direction_missing].value_counts(normalize=True) * 100

0.000    99.987056
0.468     0.000384
1.199     0.000256
1.235     0.000256
1.980     0.000256
0.580     0.000128
0.741     0.000128
0.273     0.000128
1.451     0.000128
0.980     0.000128
1.460     0.000128
0.607     0.000128
1.394     0.000128
1.020     0.000128
1.000     0.000128
1.152     0.000128
0.319     0.000128
2.420     0.000128
0.590     0.000128
0.082     0.000128
0.210     0.000128
0.905     0.000128
1.955     0.000128
1.327     0.000128
0.700     0.000128
0.360     0.000128
2.090     0.000128
2.550     0.000128
4.970     0.000128
4.522     0.000128
0.268     0.000128
7.259     0.000128
0.823     0.000128
4.491     0.000128
2.269     0.000128
0.561     0.000128
1.970     0.000128
2.768     0.000128
1.163     0.000128
0.535     0.000128
0.653     0.000128
0.952     0.000128
2.450     0.000128
2.710     0.000128
1.550     0.000128
3.730     0.000128
2.546     0.000128
1.759     0.000128
4.337     0.000128
1.307     0.000128
2.896     0.000128
2.480     0.000128
2.380     0.

In [15]:
# Rows of the raw HPWREN frame whose wind direction reading is missing.
hpwren_direction_missing = weather_hpwren_df["wind_direction_set_1"]["Degrees"].isna()
# Percentage distribution of the wind speed values recorded on those rows.
hpwren_speed = weather_hpwren_df["wind_speed_set_1"]["m/s"]
hpwren_speed[hpwren_direction_missing].value_counts(normalize=True) * 100

0.0    100.0
Name: m/s, dtype: float64

In [16]:
# Rows of the raw SC-EDISON frame whose wind direction reading is missing.
scedison_direction_missing = weather_scedison_df["wind_direction_set_1"]["Degrees"].isna()
# Percentage distribution of the wind speed values recorded on those rows.
scedison_speed = weather_scedison_df["wind_speed_set_1"]["m/s"]
scedison_speed[scedison_direction_missing].value_counts(normalize=True) * 100

0.000    99.904507
1.821     0.003673
1.510     0.001836
2.400     0.001836
0.350     0.001836
0.442     0.001836
0.977     0.001836
1.595     0.001836
1.929     0.001836
1.873     0.001836
0.777     0.001836
1.420     0.001836
0.314     0.001836
1.183     0.001836
0.823     0.001836
0.590     0.001836
0.770     0.001836
0.833     0.001836
0.895     0.001836
0.450     0.001836
0.130     0.001836
2.819     0.001836
0.406     0.001836
0.262     0.001836
0.320     0.001836
2.900     0.001836
1.710     0.001836
0.900     0.001836
0.800     0.001836
0.864     0.001836
0.268     0.001836
0.010     0.001836
0.210     0.001836
0.310     0.001836
0.920     0.001836
0.499     0.001836
1.214     0.001836
0.746     0.001836
0.952     0.001836
2.433     0.001836
1.940     0.001836
0.797     0.001836
4.040     0.001836
1.271     0.001836
2.670     0.001836
3.611     0.001836
0.422     0.001836
1.530     0.001836
1.620     0.001836
0.440     0.001836
0.093     0.001836
2.510     0.001836
Name: m/s, d

### Read in processed data

In [17]:
%%time
# Read with header=[0, 1], i.e. the first two CSV rows are parsed as a
# two-level column header (the same setting used for the raw files).
weather_sdge_df2 = pd.read_csv("../../data/processed/weather_SDGE.csv", header=[0, 1])
weather_hpwren_df2 = pd.read_csv(
    "../../data/processed/weather_HPWREN.csv", header=[0, 1]
)
weather_scedison_df2 = pd.read_csv(
    "../../data/processed/weather_SC-EDISON.csv", header=[0, 1]
)

CPU times: user 25 s, sys: 4.09 s, total: 29.1 s
Wall time: 29.2 s


### Concat to single weather dataframe

In [18]:
# Stack the three processed network frames row-wise into a single frame.
weather_df2 = pd.concat(
    objs=[weather_sdge_df2, weather_hpwren_df2, weather_scedison_df2],
    axis=0,
)

### Check interval counts

Are all stations on 10-minute intervals? (The check below looks at the raw SDGE data only.)

In [19]:
# Split each raw SDGE timestamp string on "T": column 0 is the part before
# the "T", column 1 the part after it (the time of day).
sdge_timestamps = weather_sdge_df["Date_Time"]["Unnamed: 1_level_1"]
interval_df = sdge_timestamps.str.split("T", expand=True)

In [20]:
# Tally rows per time-of-day stamp (column 1 of the split) and list the
# tallies in chronological order.
time_of_day_counts = interval_df[[1]].value_counts()
time_of_day_counts.sort_index()

1        
00:00:00Z    136258
00:10:00Z    136237
00:20:00Z    136329
00:30:00Z    136330
00:40:00Z    136224
00:50:00Z    136238
01:00:00Z    136193
01:10:00Z    136159
01:20:00Z    136201
01:30:00Z    136185
01:40:00Z    136322
01:50:00Z    136375
02:00:00Z    136209
02:10:00Z    136226
02:20:00Z    136255
02:30:00Z    136370
02:40:00Z    136290
02:50:00Z    136277
03:00:00Z    136142
03:10:00Z    136117
03:20:00Z    136232
03:30:00Z    136228
03:40:00Z    136207
03:50:00Z    136203
04:00:00Z    136085
04:10:00Z    136189
04:20:00Z    136181
04:30:00Z    136271
04:40:00Z    136238
04:50:00Z    136270
05:00:00Z    136196
05:10:00Z    136334
05:20:00Z    136285
05:30:00Z    136425
05:40:00Z    136360
05:50:00Z    136295
06:00:00Z    136431
06:10:00Z    136271
06:20:00Z    136345
06:30:00Z    136156
06:40:00Z    136401
06:50:00Z    136389
07:00:00Z    136382
07:10:00Z    136346
07:20:00Z    136237
07:30:00Z    136081
07:40:00Z    136310
07:50:00Z    136053
08:00:00Z    136077
08:10:00Z 