# Exploratory: Check column null percentage

Summary: <br>
This notebook explores the SDG&E weather station data that was extracted via the Synoptic API.

Open Questions: <br>

In [1]:
import pandas as pd

### Read in data

In [2]:
%%time
weather_df = pd.read_csv("../../data/raw/weather_sdge.csv", header=[0, 1])

CPU times: user 55.1 s, sys: 11.8 s, total: 1min 6s
Wall time: 1min 6s


### Calculate attribute null percentage across all stations

In [3]:
# Total number of observations; used as the denominator for the
# overall null percentages computed in the next cell.
rows = len(weather_df)
weather_df.shape

(45797111, 23)

In [4]:
%%time
pd.DataFrame(weather_df.isna().sum() / rows * 100, columns=["null_percentage"])

CPU times: user 14 s, sys: 276 ms, total: 14.3 s
Wall time: 14.3 s


Unnamed: 0,Unnamed: 1,null_percentage
Station_ID,Unnamed: 0_level_1,0.0
Date_Time,Unnamed: 1_level_1,0.0
air_temp_set_1,Celsius,0.318103
relative_humidity_set_1,%,0.280882
wind_speed_set_1,m/s,0.029757
wind_direction_set_1,Degrees,4.179626
wind_gust_set_1,m/s,0.024185
volt_set_1,volts,63.005819
solar_radiation_set_1,W/m**2,98.297144
dew_point_temperature_set_1d,Celsius,0.322815


## Calculate null percentage per station

### Generate null counts for each column for each station

In [5]:
# Move Station_ID into the row index so the groupby("Station_ID") calls in the
# following cells can group on it by name.
# NOTE(review): the columns are two-level, and the later cells unpack each index
# entry with item[0], so the index entries produced here appear to be sequences
# rather than plain strings -- confirm before using this index directly.
tmp = weather_df.set_index("Station_ID")

In [6]:
# Null count per attribute for every station: flag missing cells, then sum the
# flags within each Station_ID group.
null_flags = tmp.isna()
station_df = null_flags.groupby("Station_ID").sum()

# Each index entry is a sequence whose first element is the station ID;
# keep only that element so the index holds plain station IDs.
station_ids = [entry[0] for entry in station_df.index.to_list()]
station_df.index = pd.Index(station_ids, name="Station_ID")

station_df

Unnamed: 0_level_0,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,volt_set_1,solar_radiation_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,...,altimeter_set_1,pressure_set_1,pressure_set_1d,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,precip_accum_ten_minute_set_1,fuel_temp_set_1,precip_accum_one_hour_set_1
Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,Degrees,m/s,volts,W/m**2,Celsius,Celsius,...,Pascals,Pascals,Pascals,Pascals,Pascals.1,Celsius,Pascals,Millimeters,Celsius,Millimeters
Station_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AMOSD,0,684,595,70,10574,0,167787,261384,1279,261266,...,261384,261384,261384,261384,261384,261384,261384,261384,261384,261384
ANESD,0,0,0,52,57261,0,183399,255017,0,267099,...,267102,267102,267102,267102,267102,267102,267102,267102,267102,267102
ARHSD,0,0,3,74,22629,0,168422,264358,3,262961,...,264358,264358,264358,264358,264358,264358,264358,264358,264358,264358
AVOSD,0,0,1,58,22961,0,175839,267678,1,267631,...,267678,267678,267678,267678,267678,267678,267678,267678,267678,267678
AVYSD,0,3891,0,83,2957,0,176702,267714,3891,261746,...,267714,267714,267714,267714,267714,267714,267714,267714,267714,267714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSTSD,0,0,0,37,68,0,1282,96405,0,87232,...,96405,96405,96405,96405,96405,96405,96405,96405,96405,96405
WSYSD,0,0,1,36,5271,0,204646,266658,1,259500,...,266658,266658,266658,266658,266658,266658,266658,266658,266658,266658
WWYSD,0,0,1,66,6352,0,148045,238705,1,236712,...,238705,238705,238705,238705,238705,238705,238705,238705,238705,238705
WYNSD,0,0,0,72,3959,0,175698,267613,0,259486,...,267613,267613,267613,267613,267613,267613,267613,267613,267613,267613


### Generate total row counts for each station

In [7]:
# Rows recorded per station. count() tallies non-null Date_Time values, which
# equals the row count here because Date_Time showed 0% nulls above.
date_time_by_station = tmp["Date_Time"].groupby("Station_ID")
counts_df = date_time_by_station.count().rename(
    columns={"Unnamed: 1_level_1": "row_count"}
)

# Each index entry is a sequence whose first element is the station ID;
# keep only that element so the index holds plain station IDs.
station_ids = [entry[0] for entry in counts_df.index.to_list()]
counts_df.index = pd.Index(station_ids, name="Station_ID")

counts_df

Unnamed: 0_level_0,row_count
Station_ID,Unnamed: 1_level_1
AMOSD,261384
ANESD,267102
ARHSD,264358
AVOSD,267678
AVYSD,267714
...,...
WSTSD,96405
WSYSD,266658
WWYSD,238705
WYNSD,267613


### Calculate null percentage for each attribute of each station

In [8]:
# Work on an independent (deep) copy so the raw null counts in station_df stay
# untouched when the copy is converted to percentages.
station_null_per_df = station_df.copy(deep=True)

In [9]:
# station_null_per_df.loc["AMOSD"]

In [10]:
# Convert each station's null counts into a percentage of that station's rows.
#
# The division is done for the whole frame at once: axis=0 aligns the
# per-station row counts with the Station_ID index and broadcasts them across
# every attribute column. Compared with assigning one row at a time, this
#   * avoids writing float values into integer-count columns row by row,
#   * leaves every column as float, and
#   * always starts from the raw counts in station_df, so re-running this cell
#     cannot divide the values a second time.
station_null_per_df = station_df.div(counts_df["row_count"], axis=0).mul(100)

In [11]:
# Display the per-station null percentages computed in the previous cell.
station_null_per_df
# Optional export of the table above (left disabled):
# station_null_per_df.to_csv("station_null_per.csv")

Unnamed: 0_level_0,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,volt_set_1,solar_radiation_set_1,dew_point_temperature_set_1d,wind_chill_set_1d,...,altimeter_set_1,pressure_set_1,pressure_set_1d,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,precip_accum_ten_minute_set_1,fuel_temp_set_1,precip_accum_one_hour_set_1
Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,Degrees,m/s,volts,W/m**2,Celsius,Celsius,...,Pascals,Pascals,Pascals,Pascals,Pascals.1,Celsius,Pascals,Millimeters,Celsius,Millimeters
Station_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AMOSD,0,0.261684,0.227634,0.026781,4.045389,0.0,64.191764,100.000000,0.489318,99.954856,...,100,100,100,100,100,100,100,100,100,100
ANESD,0,0.000000,0.000000,0.019468,21.437878,0.0,68.662533,95.475511,0.000000,99.998877,...,100,100,100,100,100,100,100,100,100,100
ARHSD,0,0.000000,0.001135,0.027992,8.559983,0.0,63.709818,100.000000,0.001135,99.471550,...,100,100,100,100,100,100,100,100,100,100
AVOSD,0,0.000000,0.000374,0.021668,8.577844,0.0,65.690494,100.000000,0.000374,99.982442,...,100,100,100,100,100,100,100,100,100,100
AVYSD,0,1.453417,0.000000,0.031003,1.104537,0.0,66.004019,100.000000,1.453417,97.770755,...,100,100,100,100,100,100,100,100,100,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSTSD,0,0.000000,0.000000,0.038380,0.070536,0.0,1.329807,100.000000,0.000000,90.484933,...,100,100,100,100,100,100,100,100,100,100
WSYSD,0,0.000000,0.000375,0.013500,1.976689,0.0,76.744744,100.000000,0.000375,97.315663,...,100,100,100,100,100,100,100,100,100,100
WWYSD,0,0.000000,0.000419,0.027649,2.661025,0.0,62.020067,100.000000,0.000419,99.165078,...,100,100,100,100,100,100,100,100,100,100
WYNSD,0,0.000000,0.000000,0.026905,1.479375,0.0,65.653761,100.000000,0.000000,96.963152,...,100,100,100,100,100,100,100,100,100,100


## Check wind direction nulls

Is wind direction null only when wind speed is 0? -> Primarily yes. Wind speed=0 accounts for 99.99% of null wind directions.

In [12]:
# Rows whose wind direction reading is missing.
wind_direction_missing = weather_df[("wind_direction_set_1", "Degrees")].isna()
weather_df.loc[wind_direction_missing]

Unnamed: 0_level_0,Station_ID,Date_Time,air_temp_set_1,relative_humidity_set_1,wind_speed_set_1,wind_direction_set_1,wind_gust_set_1,volt_set_1,solar_radiation_set_1,dew_point_temperature_set_1d,...,altimeter_set_1,pressure_set_1,pressure_set_1d,sea_level_pressure_set_1d,sea_level_pressure_set_1d,wet_bulb_temperature_set_1d,altimeter_set_1d,precip_accum_ten_minute_set_1,fuel_temp_set_1,precip_accum_one_hour_set_1
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,Celsius,%,m/s,Degrees,m/s,volts,W/m**2,Celsius,...,Pascals,Pascals,Pascals,Pascals,Pascals.1,Celsius,Pascals,Millimeters,Celsius,Millimeters
17,ANESD,2016-06-03T02:50:00Z,26.110,43.00,0.0,,0.45,,,12.54,...,,,,,,,,,,
18,ANESD,2016-06-03T03:00:00Z,25.560,45.00,0.0,,0.45,,,12.74,...,,,,,,,,,,
19,ANESD,2016-06-03T03:10:00Z,25.000,47.00,0.0,,0.45,,,12.89,...,,,,,,,,,,
20,ANESD,2016-06-03T03:20:00Z,23.890,49.00,0.0,,0.45,,,12.50,...,,,,,,,,,,
21,ANESD,2016-06-03T03:30:00Z,23.890,52.00,0.0,,0.45,,,13.41,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45794384,BUBSD,2021-06-21T10:10:00Z,21.289,22.58,0.0,,0.00,12.74,,-0.97,...,,,,,,,,,,
45794510,BUBSD,2021-06-22T07:10:00Z,23.622,31.89,0.0,,0.00,12.81,,5.88,...,,,,,,,,,,
45795079,BUBSD,2021-06-26T07:10:00Z,23.044,38.86,0.0,,0.00,12.81,,8.25,...,,,,,,,,,,
45795931,BUBSD,2021-07-02T07:10:00Z,25.272,41.71,0.0,,0.00,12.81,,11.32,...,,,,,,,,,,


In [13]:
# Distribution (in percent) of wind speed values among the rows where wind
# direction is missing.
direction_is_null = weather_df["wind_direction_set_1"]["Degrees"].isna()
speed_when_direction_null = weather_df["wind_speed_set_1"]["m/s"][direction_is_null]
speed_when_direction_null.value_counts(normalize=True) * 100

0.000    99.989238
0.468     0.000157
0.980     0.000104
0.226     0.000104
0.911     0.000104
           ...    
1.550     0.000052
2.710     0.000052
0.093     0.000052
1.173     0.000052
2.768     0.000052
Name: m/s, Length: 183, dtype: float64