In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn's global plotting style to 'darkgrid'
sns.set(style='darkgrid')

from utils import scatter_plot

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the IFMHA table. `site_no` is run through `str` so it is kept as text
# (presumably to preserve leading zeros in site identifiers — confirm).
ifmha = pd.read_csv(
    './dataset/IFMHA.csv',
    low_memory=False,
    converters={'site_no': str},
)
ifmha.shape

(2802532, 47)

In [3]:
# Drop rows that have a zero or a missing value in any of the required columns
candidate_columns = [
    'site_no',
    'site_tp_cd',
    'chan_discharge',
    'chan_area',
    'chan_width',
    'SLOPE',
]

# True for rows where none of the required columns equals zero
# (missing values pass this test and are removed by the dropna step below)
has_no_zero = ifmha[candidate_columns].ne(0).all(axis=1)

ifmha_subset = ifmha.loc[has_no_zero].dropna(subset=candidate_columns, how='any')
print(f"dataset size: {ifmha_subset.shape}")

dataset size: (2248655, 47)


In [4]:
# Exclude every USGS site that has at least one negative discharge observation:
# first collect the offending site numbers, then drop all of their rows.
negative_discharge = ifmha_subset['chan_discharge'] < 0
sites_with_negative_discharge = ifmha_subset.loc[negative_discharge, 'site_no'].unique()

ifmha_subset = ifmha_subset[~ifmha_subset['site_no'].isin(sites_with_negative_discharge)]
ifmha_subset.shape

(2107954, 47)

In [5]:
# Keep only sites categorized as channels or streams,
# i.e. rows whose `site_tp_cd` is one of the codes listed below.
stream_site_codes = ['ST']
is_stream_site = ifmha_subset['site_tp_cd'].isin(stream_site_codes)

ifmha_subset = ifmha_subset.loc[is_stream_site]
print(ifmha_subset.shape)

(2093881, 47)


In [6]:
# Derive the depth columns with `.assign` instead of `frame[col] = ...`:
# `ifmha_subset` is the result of boolean-mask row filters, and writing new
# columns into such a frame can trigger pandas' SettingWithCopyWarning.
# `.assign` returns a new frame, so the cell stays warning-free and idempotent.

# Coefficients of the linear mean-depth -> maximum-depth relation.
# NOTE(review): the origin of these fitted values is not shown here — confirm.
MAX_DEPTH_SLOPE = 1.47910570
MAX_DEPTH_INTERCEPT = -0.02600691

ifmha_subset = ifmha_subset.assign(
    # Mean depth estimation: channel area divided by channel width
    chan_mean_depth=lambda df: df['chan_area'] / df['chan_width'],
    # Maximum depth estimation: linear function of the mean depth computed above
    chan_max_depth=lambda df: MAX_DEPTH_SLOPE * df['chan_mean_depth'] + MAX_DEPTH_INTERCEPT,
)

In [7]:
# Sanity check: keep only rows whose channel area and both depth estimates
# are strictly positive, i.e. physically meaningful channel characteristics.
must_be_positive = ['chan_area', 'chan_mean_depth', 'chan_max_depth']
is_physical = ifmha_subset[must_be_positive].gt(0).all(axis=1)

ifmha_subset = ifmha_subset.loc[is_physical]
print(ifmha_subset.shape)

(2093328, 49)


### Statistics of IFMHA 

In [8]:
# Summary statistics of the number of observations recorded per site
obs_per_site = ifmha_subset['site_no'].value_counts()
obs_per_site.describe()

count    8431.000000
mean      248.289408
std       268.599205
min         1.000000
25%        62.000000
50%       194.000000
75%       340.000000
max      6580.000000
Name: count, dtype: float64

In [9]:
# Minimum number of observations a site needs in order to be kept for the
# statistics. Defined once so the filter and the printed message cannot drift.
MIN_OBS_PER_SITE = 50

# Observations per site, then the rows belonging to sufficiently sampled sites
ifmha_site_list = ifmha_subset['site_no'].value_counts()
well_sampled_sites = ifmha_site_list.index[ifmha_site_list >= MIN_OBS_PER_SITE]
ifmha_subset_stat = ifmha_subset[ifmha_subset['site_no'].isin(well_sampled_sites)]

print(f"site locations with minimum of {MIN_OBS_PER_SITE} observations: ")
print(f'Number of observations: {ifmha_subset_stat.shape}')
print('Number of sites:')
# The `count` row of this summary is the number of retained sites; the
# remaining rows describe the number of observations per retained site.
ifmha_subset_stat['site_no'].value_counts().describe()

site locations with minimum of 50 observations: 
Number of observations: (2064782, 49)
Number of sites:


count    6498.000000
mean      317.756540
std       269.268215
min        50.000000
25%       154.000000
50%       258.000000
75%       387.000000
max      6580.000000
Name: count, dtype: float64