In [1]:
# Third-party numerical, tabular and plotting libraries used by the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to the figures drawn in this notebook.
sns.set(style='darkgrid')

# Project-local plotting helper (defined in utils, not shown in this notebook).
from utils import scatter_plot

# autoreload mode 2: re-import edited modules (e.g. utils) before each cell runs,
# so changes to local helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Load the raw IFMHA table.
# - site_no is routed through a str converter so it is kept as text rather than
#   parsed as a number (presumably to preserve leading zeros -- TODO confirm).
# - low_memory=False reads the file in one pass instead of in chunks.
ifmha = pd.read_csv(
    './dataset/IFMHA.csv',
    converters={'site_no': str},
    low_memory=False,
)
ifmha.shape

(2802532, 47)

In [7]:
# Drop every row that has a zero or a missing value in any required column.
# NOTE(review): site_no is loaded as text, so its "!= 0" test is presumably always
# true; the zero filter effectively applies to the numeric columns only -- confirm.
candidate_columns = [
    'site_no',
    'site_tp_cd',
    'chan_discharge',
    'chan_area',
    'chan_width',
    'SLOPE',
]

ifmha_subset = (
    ifmha[ifmha[candidate_columns].ne(0).all(axis=1)]      # no zeros in any candidate column
    .dropna(axis=0, how='any', subset=candidate_columns)   # no NaNs in any candidate column
)
print(f"dataset size: {ifmha_subset.shape}")

dataset size: (2248655, 47)


In [8]:
# Exclude USGS sites with negative discharge observations.
# A single negative chan_discharge reading removes *every* row of that site.
# The per-site count of negative readings uses the built-in 'sum' aggregation on a
# boolean series, which avoids calling a Python lambda once per site and yields
# the same row mask (count == 0).
negative_count_per_site = (
    ifmha_subset['chan_discharge'].lt(0)
    .groupby(ifmha_subset['site_no'])
    .transform('sum')
)
ifmha_subset = ifmha_subset[negative_count_per_site.eq(0)]
ifmha_subset.shape

(2107954, 47)

In [9]:
# Retain only rows whose site type code is 'ST' (the channel/stream category);
# every other site type is discarded.
ifmha_subset = ifmha_subset[ifmha_subset['site_tp_cd'] == 'ST']
print(ifmha_subset.shape)

(2093881, 47)


In [10]:
# Mean depth: cross-sectional area divided by channel width.
ifmha_subset['chan_mean_depth'] = ifmha_subset['chan_area'].div(ifmha_subset['chan_width'])

# Maximum depth: linear function of the mean depth (slope * mean_depth - offset).
# NOTE(review): the origin of these two coefficients is not shown in this
# notebook -- cite the regression/source they come from.
ifmha_subset['chan_max_depth'] = ifmha_subset['chan_mean_depth'].mul(1.47910570).sub(0.02600691)

In [12]:
# Physical plausibility filter: keep a row only if the channel area and both
# depth estimates are strictly positive (NaN compares as not positive and is dropped).
ifmha_subset = ifmha_subset[
    ifmha_subset[['chan_area', 'chan_mean_depth', 'chan_max_depth']]
    .gt(0)
    .all(axis=1)
]
print(ifmha_subset.shape)

(2093328, 49)
