# NOAA Dataset: Handling Missing Values

## Setup and Data Loading

In [1]:
import pandas as pd

In [2]:
files = ['../data/source/NOAA_46041.csv',
         '../data/source/NOAA_46050.csv',
         '../data/source/NOAA_46243.csv'
]
dataframes = [pd.read_csv(file) for file in files]
df = pd.concat(dataframes, ignore_index=True)

df['date_time'] = pd.to_datetime(df['date_time'])

## 1. Standardize and Clean Columns

In [3]:
# Clean up column names
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(r'\s*\(C°\)', '_celsius', regex=True)
df.columns = df.columns.str.replace(r'\s*\(m/s\)', '_mps', regex=True)
df.columns = df.columns.str.replace(r'\s*\(hPa\)', '_hpa', regex=True)
df.columns = df.columns.str.replace(r'\s*\(s\)', '_s', regex=True)
df.columns = df.columns.str.replace(r'\s*\(m\)', '_m', regex=True)

df.columns = df.columns.str.replace(r'\s+', '_', regex=True)
df.columns = df.columns.str.replace(r'_{2,}', '_', regex=True)
df.columns = df.columns.str.lower()

print("Cleaned Column Names:")
df.columns.to_list()

Cleaned Column Names:


['station_id',
 'latitude_(degrees_north)',
 'longitude_(degrees_east)',
 'date_time',
 'air_temperature_celsius',
 'sea_level_pressure_hpa',
 'wind_speed_mps',
 'gust_speed_mps',
 'significant_wave_height_m',
 'dominant_wave_period_s',
 'sea_surface_temperature_celsius',
 'wind_speed_cwind_mps']

## 2. Merge Redundant Columns

In [4]:
# Use combine_first to create a single wind_speed column
df['wind_speed'] = df['wind_speed_mps'].combine_first(
    df['wind_speed_cwind_mps'])

# Drop the original, redundant columns
df.drop(columns=['wind_speed_mps', 'wind_speed_cwind_mps'], inplace=True)

print("DataFrame info after merging wind speed columns:")
df.info()

DataFrame info after merging wind speed columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7219 entries, 0 to 7218
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   station_id                       7219 non-null   int64              
 1   latitude_(degrees_north)         7219 non-null   float64            
 2   longitude_(degrees_east)         7219 non-null   float64            
 3   date_time                        7219 non-null   datetime64[ns, UTC]
 4   air_temperature_celsius          723 non-null    float64            
 5   sea_level_pressure_hpa           723 non-null    float64            
 6   gust_speed_mps                   723 non-null    float64            
 7   significant_wave_height_m        2167 non-null   float64            
 8   dominant_wave_period_s           2167 non-null   float64            
 9   sea_surface_temperature_c

## 3. Quantify Missing Data

In [5]:
missing_data = df.isnull().sum()
missing_percentage = (missing_data / len(df)) * 100
missing_info = pd.DataFrame(
    {'Missing Values': missing_data, 'Percentage': missing_percentage})
print(missing_info)

                                 Missing Values  Percentage
station_id                                    0    0.000000
latitude_(degrees_north)                      0    0.000000
longitude_(degrees_east)                      0    0.000000
date_time                                     0    0.000000
air_temperature_celsius                    6496   89.984762
sea_level_pressure_hpa                     6496   89.984762
gust_speed_mps                             6496   89.984762
significant_wave_height_m                  5052   69.981992
dominant_wave_period_s                     5052   69.981992
sea_surface_temperature_celsius            5051   69.968140
wind_speed                                 2176   30.142679


## 4. Imputation Strategy

 Since the `NaN` values are systematic, a single, broad imputation won't be performed. Instead, three separate dataframes will be made based on the station capabilities discovered in the EDA. This respects the "data silos" and only the data that is physically valid for a given task will be used.

- `df_complete_station`: For modeling tasks that require all variables (e.g., predicting waves from wind and pressure). This will only contain data from Station 2868187.

- `df_wind_analysis`: For analyzing regional wind patterns. This will contain data from the two stations that measure wind (2868187 and 2868934).

- `df_wave_analysis`: For analyzing regional wave patterns. This will contain data from the two stations that measure waves (2868187 and 2888997).

In [6]:
# DataFrame for the one station with complete data
df_complete_station = df[df['station_id'] == 2868187].copy()

# DataFrame for all stations that have wind data
wind_station_ids = [2868187, 2868934]
df_wind_analysis = df[df['station_id'].isin(wind_station_ids)].copy()

# DataFrame for all stations that have wave data
wave_station_ids = [2868187, 2888997]
df_wave_analysis = df[df['station_id'].isin(wave_station_ids)].copy()


print("--- Complete Station DataFrame ---")
df_complete_station.info()

print("\n--- Wind Analysis DataFrame ---")
df_wind_analysis.info()

print("\n--- Wave Analysis DataFrame ---")
df_wave_analysis.info()

--- Complete Station DataFrame ---
<class 'pandas.core.frame.DataFrame'>
Index: 723 entries, 0 to 722
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype              
---  ------                           --------------  -----              
 0   station_id                       723 non-null    int64              
 1   latitude_(degrees_north)         723 non-null    float64            
 2   longitude_(degrees_east)         723 non-null    float64            
 3   date_time                        723 non-null    datetime64[ns, UTC]
 4   air_temperature_celsius          723 non-null    float64            
 5   sea_level_pressure_hpa           723 non-null    float64            
 6   gust_speed_mps                   723 non-null    float64            
 7   significant_wave_height_m        722 non-null    float64            
 8   dominant_wave_period_s           722 non-null    float64            
 9   sea_surface_temperature_celsius  723 non-null 

## 5. Handle Incidental Missing Data with Time-Series Interpolation

In [7]:
# This cell fills small, random gaps within a single station's data using
# time-based linear interpolation, a better approach to mean-filling
# for time-series data.

# Set date_time as the index to enable time-based interpolation
df_complete_station.set_index('date_time', inplace=True)

# Sort the index to ensure chronological order, which is required for interpolation
df_complete_station.sort_index(inplace=True)

print("Missing values BEFORE interpolation:")
print(df_complete_station.isnull().sum())

# Apply time-based interpolation
df_complete_station.interpolate(method='time', inplace=True)

print("\nMissing values AFTER interpolation:")
print(df_complete_station.isnull().sum())

Missing values BEFORE interpolation:
station_id                         0
latitude_(degrees_north)           0
longitude_(degrees_east)           0
air_temperature_celsius            0
sea_level_pressure_hpa             0
gust_speed_mps                     0
significant_wave_height_m          1
dominant_wave_period_s             1
sea_surface_temperature_celsius    0
wind_speed                         0
dtype: int64

Missing values AFTER interpolation:
station_id                         0
latitude_(degrees_north)           0
longitude_(degrees_east)           0
air_temperature_celsius            0
sea_level_pressure_hpa             0
gust_speed_mps                     0
significant_wave_height_m          0
dominant_wave_period_s             0
sea_surface_temperature_celsius    0
wind_speed                         0
dtype: int64


In [8]:
# This cell fills small, random gaps within a single station's data using
# time-based linear interpolation, a better approach to mean-filling
# for time-series data.

# Set date_time as the index to enable time-based interpolation
df_wave_analysis.set_index('date_time', inplace=True)

# Sort the index to ensure chronological order, which is required for interpolation
df_wave_analysis.sort_index(inplace=True)

print("Missing values BEFORE interpolation:")
print(df_wave_analysis.isnull().sum())

# Apply time-based interpolation
df_wave_analysis.interpolate(method='time', inplace=True)

print("\nMissing values AFTER interpolation:")
print(df_wave_analysis.isnull().sum())

Missing values BEFORE interpolation:
station_id                            0
latitude_(degrees_north)              0
longitude_(degrees_east)              0
air_temperature_celsius            1445
sea_level_pressure_hpa             1445
gust_speed_mps                     1445
significant_wave_height_m             1
dominant_wave_period_s                1
sea_surface_temperature_celsius       0
wind_speed                         1445
dtype: int64

Missing values AFTER interpolation:
station_id                         0
latitude_(degrees_north)           0
longitude_(degrees_east)           0
air_temperature_celsius            2
sea_level_pressure_hpa             2
gust_speed_mps                     2
significant_wave_height_m          0
dominant_wave_period_s             0
sea_surface_temperature_celsius    0
wind_speed                         2
dtype: int64


In [9]:
print(f"Shape before dropping final NaNs: {df_wave_analysis.shape}")
df_wave_analysis.dropna(inplace=True)
print(f"Shape after dropping final NaNs:  {df_wave_analysis.shape}")

Shape before dropping final NaNs: (2168, 10)
Shape after dropping final NaNs:  (2166, 10)


In [10]:
# This cell fills small, random gaps within a single station's data using
# time-based linear interpolation, a better approach to mean-filling
# for time-series data.

# Set date_time as the index to enable time-based interpolation
df_wind_analysis.set_index('date_time', inplace=True)

# Sort the index to ensure chronological order, which is required for interpolation
df_wind_analysis.sort_index(inplace=True)

print("Missing values BEFORE interpolation:")
print(df_wind_analysis.isnull().sum())

# Apply time-based interpolation
df_wind_analysis.interpolate(method='time', inplace=True)

print("\nMissing values AFTER interpolation:")
print(df_wind_analysis.isnull().sum())

Missing values BEFORE interpolation:
station_id                            0
latitude_(degrees_north)              0
longitude_(degrees_east)              0
air_temperature_celsius            5051
sea_level_pressure_hpa             5051
gust_speed_mps                     5051
significant_wave_height_m          5052
dominant_wave_period_s             5052
sea_surface_temperature_celsius    5051
wind_speed                          731
dtype: int64

Missing values AFTER interpolation:
station_id                         0
latitude_(degrees_north)           0
longitude_(degrees_east)           0
air_temperature_celsius            7
sea_level_pressure_hpa             7
gust_speed_mps                     7
significant_wave_height_m          7
dominant_wave_period_s             7
sea_surface_temperature_celsius    7
wind_speed                         0
dtype: int64


In [11]:
print(f"Shape before dropping final NaNs: {df_wind_analysis.shape}")
df_wind_analysis.dropna(inplace=True)
print(f"Shape after dropping final NaNs:  {df_wind_analysis.shape}")

Shape before dropping final NaNs: (5774, 10)
Shape after dropping final NaNs:  (5767, 10)


## 6. Save Cleaned Data

In [12]:
df_complete_station.to_csv('../data/clean/NOAA_complete_station.csv', index=True)
df_wind_analysis.to_csv('../data/clean/NOAA_wind_analysis.csv', index=True)
df_wave_analysis.to_csv('../data/clean/NOAA_wave_analysis.csv', index=True)

print("--- Complete Station CSV ---")
df_complete_station.info()

print("\n--- Wind Analysis CSV ---")
df_wind_analysis.info()

print("\n--- Wave Analysis CSV ---")
df_wave_analysis.info()

--- Complete Station CSV ---
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 723 entries, 2013-04-18 00:50:00+00:00 to 2013-05-18 10:50:00+00:00
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   station_id                       723 non-null    int64  
 1   latitude_(degrees_north)         723 non-null    float64
 2   longitude_(degrees_east)         723 non-null    float64
 3   air_temperature_celsius          723 non-null    float64
 4   sea_level_pressure_hpa           723 non-null    float64
 5   gust_speed_mps                   723 non-null    float64
 6   significant_wave_height_m        723 non-null    float64
 7   dominant_wave_period_s           723 non-null    float64
 8   sea_surface_temperature_celsius  723 non-null    float64
 9   wind_speed                       723 non-null    float64
dtypes: float64(9), int64(1)
memory usage: 62.1 KB

--- Wind Analysis CSV