In [1]:
import ee
import pandas as pd
import numpy as np
ee.Initialize()

In [11]:
# read in the nhd addendum file

nhd_stats = pd.read_csv("nhd_stats_AI.csv")

# read in csv file with SSURGO variables
df_m = pd.read_csv("combined_regular_clean_with_ssurgo_variables.csv")

In [4]:
# Load the NHD-extracted part files (suffixed 1, 501, 1001, ...) and stack
# them into a single frame.
# NOTE(review): read_pickle can execute arbitrary code while loading -- only
# read part files that this project produced itself.
part_frames = []
for i in range(10):
    part_path = ('NHD_extracted_vars_200mX200m/combined_regular_clean_with_ssurgo_nhd_variables_part'
                 + str(500 * i + 1))
    try:
        df_temp = pd.read_pickle(part_path)
    except FileNotFoundError:
        # The loop stops at the first missing part; any other failure
        # (e.g. a corrupt pickle) is allowed to surface instead of silently
        # truncating the data.
        break
    part_frames.append(df_temp)

if not part_frames:
    raise FileNotFoundError(
        "no part files found under NHD_extracted_vars_200mX200m/"
    )

df_merged = pd.concat(part_frames)

In [5]:
df_merged.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd', 'nhd_vars_wb', 'nhd_vars_fl'],
      dtype='object')

In [6]:
# Unpack the per-row NHD entries into one list-valued column per attribute.
# Each spec is (source column, position inside the stored entry, new column).
# NOTE(review): flowline length is read from position 4 whereas waterbody area
# uses position 3 -- confirm that position 3 of nhd_vars_fl is skipped on purpose.
NHD_LIST_COLUMNS = [
    ("nhd_vars_wb", 0, "wb_comid_list"),
    ("nhd_vars_wb", 1, "wb_ftype_str_list"),
    ("nhd_vars_wb", 2, "wb_gnis_id_list"),
    ("nhd_vars_wb", 3, "wb_area_list"),
    ("nhd_vars_fl", 0, "fl_comid_list"),
    ("nhd_vars_fl", 1, "fl_ftype_str_list"),
    ("nhd_vars_fl", 2, "fl_gnis_id_list"),
    ("nhd_vars_fl", 4, "fl_length_list"),
]

for source_col, position, target_col in NHD_LIST_COLUMNS:
    # Map over the single source column instead of applying row-wise over the
    # whole frame; list(...) makes the same per-row copy as the old comprehension.
    df_merged[target_col] = df_merged[source_col].map(
        lambda extracted: list(extracted[position])
    )

In [7]:
df_merged.fl_comid_list[9]

[21980217, 21978365]

In [179]:
# Number of flowline COMIDs stored at position 0 of nhd_vars_fl for each row.
df_merged["fl_comid_list_len"] = df_merged["nhd_vars_fl"].map(lambda extracted: len(extracted[0]))

In [191]:
# Print the labels (0-99) of the rows that have at least one flowline COMID,
# each followed by a comma and without a trailing newline.
rows_with_flowlines = [row for row in range(100) if df_merged.fl_comid_list_len[row] != 0]
print("".join(str(row) + "," for row in rows_with_flowlines), end="")

9,11,13,15,20,21,23,25,31,32,33,37,38,39,41,42,44,47,49,50,53,54,55,58,59,64,74,75,78,81,82,85,87,88,89,92,94,

In [188]:
df_merged.fl_comid_list_len[9]

2

In [152]:
df_merged.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd', 'nhd_vars_wb', 'nhd_vars_fl',
       'wb_comid_list', 'wb_ftype_str_list', 'wb_gnis_id_list', 'wb_area_list',
       'fl_comid_list', 'fl_ftype_str_list', 'fl_gnis_id_list',
       'fl_length_list'],
      dtype='object')

In [153]:
[num for num in df_merged.fl_length_list[1]]

[nan]

In [154]:
df_merged.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd', 'nhd_vars_wb', 'nhd_vars_fl',
       'wb_comid_list', 'wb_ftype_str_list', 'wb_gnis_id_list', 'wb_area_list',
       'fl_comid_list', 'fl_ftype_str_list', 'fl_gnis_id_list',
       'fl_length_list'],
      dtype='object')

In [188]:
# working version
# read in fl_comid_list, pull out comids and match with nhd_stats to sum
# areasqkm, totdasqkm and flow_type

def extract_feature(comid, feature):
    """Look up one column value for a single COMID in ``nhd_stats``.

    Every call builds a boolean mask over the whole ``nhd_stats`` table.

    Parameters
    ----------
    comid : int or None
        COMID to look up. ``None`` means no COMID was extracted for the point.
    feature : str
        Name of the ``nhd_stats`` column to read (converted with ``str``).

    Returns
    -------
    scalar
        The stored value as a Python scalar, or ``np.nan`` when ``comid`` is
        ``None`` or does not match exactly one row of ``nhd_stats``.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``nhd_stats``.
    """
    if comid is None:
        return np.nan  # if no comid's in GEE
    matches = nhd_stats.loc[nhd_stats["comid"] == comid, str(feature)]
    try:
        # .item() only succeeds for exactly one element; zero or several
        # matches raise ValueError.
        return matches.to_numpy().item()
    except ValueError:
        return np.nan  # if comid in GEE but not in nhd database


def fl_areasqkm(comid):
    """Return the ``areasqkm`` value for ``comid``, or ``np.nan`` if unavailable."""
    return extract_feature(comid, "areasqkm")


# The helpers below used ``np.float``, an alias that was removed in NumPy 1.24.
# They now share ``extract_feature`` and, like ``fl_areasqkm``, return NaN for
# a missing or unmatched COMID instead of raising. The result is still cast to
# the built-in float, as before.

def fl_length(comid):
    """Return the ``lengthkm`` value for ``comid`` as a float (NaN if unavailable)."""
    return float(extract_feature(comid, "lengthkm"))


def fl_totdasqkm(comid):
    """Return the ``totdasqkm`` value for ``comid`` as a float (NaN if unavailable)."""
    return float(extract_feature(comid, "totdasqkm"))


def fl_flow_type(comid):
    """Return the ``flow_type`` value for ``comid`` as a float (NaN if unavailable)."""
    return float(extract_feature(comid, "flow_type"))


def gnis_name_ind(comid):
    """Return the ``gnis_name_ind`` value for ``comid`` as a float (NaN if unavailable)."""
    return float(extract_feature(comid, "gnis_name_ind"))


def _valid_fl_areas(comids):
    """Return the non-NaN ``fl_areasqkm`` lookups for ``comids`` as an array."""
    areas = np.array([fl_areasqkm(comid) for comid in comids])
    return areas[~np.isnan(areas)]


# Look every COMID up once per row and derive both columns from that result.
# Previously each lookup ran twice for the sum (values + NaN mask) and twice
# more for the count.
valid_areas_per_row = [_valid_fl_areas(comids) for comids in df_merged["fl_comid_list"]]

# Sum of the areas that could be resolved (0.0 when none could).
df_merged["fl_areasqkm_sum"] = [np.sum(areas) for areas in valid_areas_per_row]
# Number of COMIDs per row whose area could be resolved.
df_merged["fl_areasqkm_count"] = [len(areas) for areas in valid_areas_per_row]

# temp = np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0])
# temp2 = np.array([fl_areasqkm(item) if len(x.fl_comid_list) != 0 else np.nan for item in x.fl_comid_list])

# df_merged["fl_areasqkm_mean"] = df_merged.apply(lambda x: temp[~np.isnan(temp)], axis=1)
# df_merged["fl_areasqkm_mean"] = df_merged.apply(lambda x: np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0])[~np.isnan(np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0]))], axis=1)
# df_merged["fl_areasqkm_mean"] = (df_merged.apply(lambda x: np.mean(np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0])[~np.isnan(np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0]))]), axis=1))
# df_merged["fl_areasqkm_mean"] = (df_merged.apply(lambda x: np.mean(np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0])[~np.isnan(np.array([fl_areasqkm(item) for item in x.fl_comid_list if len(x.fl_comid_list) != 0]))]), axis=1))

# df_merged["fl_areasqkm_mean"] = df_merged.apply(lambda x: np.mean(temp2[~isnan(temp2)]))
# df_merged["fl_areasqkm_mean"] = df_merged.apply(lambda x: np.mean(np.array([fl_areasqkm(item) if len(x.fl_comid_list) != 0 else np.nan for item in x.fl_comid_list])[~np.isnan(np.array([fl_areasqkm(item) if len(x.fl_comid_list) != 0 else np.nan for item in x.fl_comid_list]))]), axis=1)
# df_merged["fl_areasqkm_mean"] = df_merged.apply(lambda x: np.mean(fl_areasqkm(comid) for comid in np.array(x.fl_comid_list)[~np.isnan(np.array(x.fl_comid_list))]) if len(x.fl_comid_list) != 0 else np.nan, axis=1)



# filter out invalid comids to avoid nuisance issues downstream

In [189]:
nhd_stats.columns

Index(['comid', 'long_comid', 'lat_comid', 'startflag', 'intephem',
       'divergence', 'streamorde', 'lengthkm', 'gnis_name_ind', 'areasqkm',
       'totdasqkm', 'flow_type', 'distup_max', 'distdown_max'],
      dtype='object')

In [234]:
# Build the set of known COMIDs once. The previous version converted the whole
# nhd_stats.comid column to a NumPy array and scanned it for every single COMID.
known_comids = set(nhd_stats["comid"].tolist())


def _keep_known_comids(extracted):
    """Return the COMIDs in ``extracted[0]`` that also occur in ``nhd_stats.comid``."""
    return [comid for comid in extracted[0] if comid in known_comids]


df_merged["wb_comid_list_filtered"] = df_merged["nhd_vars_wb"].map(_keep_known_comids)

df_merged["fl_comid_list_filtered"] = df_merged["nhd_vars_fl"].map(_keep_known_comids)

In [203]:
# `in` on a pandas Series tests the index labels, not the stored values, so
# check membership against the underlying values array instead.
179 in nhd_stats.comid.values

True

In [237]:
df_merged.fl_comid_list_filtered[9:15]

9     [21980217, 21978365]
10                      []
11              [21635913]
12                      []
13              [15560261]
14                      []
Name: fl_comid_list_filtered, dtype: object

In [89]:
np.mean([])

nan

In [236]:
df_merged.fl_areasqkm_mean[9:15]

9     1.4490
10       NaN
11    8.5815
12       NaN
13       NaN
14       NaN
Name: fl_areasqkm_mean, dtype: float64

In [58]:
np.mean([np.nan])

nan

In [468]:
# df_merged.apply(lambda x: [fl_areasqkm(comid) for comid in x.fl_comid_list], axis=1)
def test(x):
    """Return ``x`` raised to the power of two (``x**2``).

    NOTE(review): not called anywhere in the visible cells -- looks like
    leftover scratch code.
    """
    return x**2
df_merged.apply(lambda x: np.sum([fl_areasqkm(comid) for comid in x.fl_comid_list]), axis=1)

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
4995    0.0
4996    0.0
4997    0.0
4998    0.0
4999    0.0
Length: 5000, dtype: float64

In [469]:
fl_areasqkm(21980217)

1.5831

In [470]:
np.sum([fl_areasqkm(comid) for comid in df_merged.fl_comid_list[9]])
# [fl_areasqkm(comid) for comid in df_merged.fl_comid_list[9]]
# df_merged.fl_comid_list[9]

2.8979999999999997

In [471]:
df_merged.fl_length_list[9]

[3.135, 3.557]

In [472]:
df_merged.columns
df_merged.fl_areasqkm_sum[9]

2.8979999999999997

In [33]:
def fl_areasqkm(comid):
    """Return the ``areasqkm`` value stored in ``nhd_stats`` for ``comid``.

    Returns ``np.nan`` unless exactly one row of ``nhd_stats`` has this COMID;
    that covers ``comid=None`` as well as unknown or duplicated COMIDs.

    NOTE(review): ``fl_areasqkm`` is defined in more than one cell of this
    notebook; whichever cell ran last is the definition in effect.
    """
    matches = nhd_stats.loc[nhd_stats["comid"] == comid, "areasqkm"]
    if len(matches) != 1:
        # Explicit size check instead of swallowing every Exception from .item().
        return np.nan
    return matches.to_numpy().item()


In [223]:
df_temp = df_merged.iloc[9:15].copy()
df_temp.apply(lambda x: [comid for comid in x.fl_comid_list], axis=1)

9     [21980217, 21978365]
10                      []
11    [21632389, 21635913]
12                      []
13              [15560261]
14                      []
dtype: object

In [233]:
nhd_stats[nhd_stats.comid == 21980217]
21980217 in np.array(nhd_stats.comid)

True

In [199]:
df_temp = df_merged.iloc[9:15].copy()
# df_temp

df_temp["fl_areasqkm_sum"] = df_temp.apply(lambda x: [fl_areasqkm(comid) for comid in np.array(x.fl_comid_list)[~np.isnan(np.array(x.fl_comid_list))]], axis=1)
df_temp["fl_areasqkm_sum"]

9     [1.5831, 1.3149]
10                  []
11       [nan, 8.5815]
12                  []
13               [nan]
14                  []
Name: fl_areasqkm_sum, dtype: object

In [219]:
# Take a small slice of rows for experimenting.
df_temp = df_merged.iloc[9:15].copy()
# df_temp

# df_temp.apply(lambda x: [fl_areasqkm(comid) for comid in np.array(x.fl_comid_list_filtered)[~np.isnan(np.array(x.fl_comid_list_filtered))]], axis=1)

# NOTE(review): `in` on a pandas Series tests the index labels, not the stored
# values, so this reports whether each COMID is a row label of nhd_stats.
# To test the COMID values themselves, compare against
# np.array(nhd_stats.comid) as done elsewhere in this notebook.
df_temp.apply(lambda x: [comid in nhd_stats.comid for comid in x.nhd_vars_fl[0]], axis=1)
# df_temp.apply(lambda x: len(x.nhd_vars_fl[0]), axis=1)


9     [False, False]
10                []
11    [False, False]
12                []
13           [False]
14                []
dtype: object

In [159]:
df_temp.apply(lambda x: np.mean(np.array([fl_areasqkm(item) if len(x.fl_comid_list) != 0 else np.nan for item in x.fl_comid_list])[~np.isnan(np.array([fl_areasqkm(item) if len(x.fl_comid_list) != 0 else np.nan for item in x.fl_comid_list]))]), axis=1)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


9     1.449
10      NaN
dtype: float64

In [160]:
(df_temp.apply(lambda x: 
               np.mean(np.array([fl_areasqkm(item) 
                                 if len(x.fl_comid_list) != 0 
                                 else np.nan for item in x.fl_comid_list])
                       [~np.isnan(np.array([fl_areasqkm(item) 
                                            if len(x.fl_comid_list) != 0 
                                            else np.nan for item in x.fl_comid_list]))]), axis=1))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


9     1.449
10      NaN
dtype: float64

In [168]:
(df_temp.apply(lambda x: 
               (np.sum(np.array([fl_areasqkm(item) 
                                 if len(x.fl_comid_list) != 0 
                                 else np.nan for item in x.fl_comid_list])
                       [~np.isnan(np.array([fl_areasqkm(item) 
                                            if len(x.fl_comid_list) != 0 
                                            else np.nan for item in x.fl_comid_list]))])) if len(x.fl_comid_list) != 0 else np.nan, axis=1))



9     2.8980
10       NaN
11    8.5815
12       NaN
13    0.0000
14       NaN
dtype: float64

In [185]:
df_temp.apply(lambda x: len(np.array([fl_areasqkm(item) for item in x.fl_comid_list])[~np.isnan(np.array([fl_areasqkm(item) for item in x.fl_comid_list]))]), axis=1)

9     2
10    0
11    1
12    0
13    0
14    0
dtype: int64

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


9     1.449
10      NaN
dtype: float64

In [None]:
np.

In [512]:
# Take a small slice of rows for experimenting.
df_temp = df_merged.iloc[9:15].copy()
# df_temp
# Rows without any flowline COMID get a float NaN here instead of an empty list.
df_temp["fl_areasqkm_nulls"] = df_temp.apply(lambda x: np.nan if len(x.fl_comid_list) == 0 else x.fl_comid_list, axis=1)
# NOTE(review): this line raises the TypeError recorded for this cell. The
# comprehension iterates x.fl_areasqkm_nulls, which is a float NaN for rows
# without COMIDs. The `!= np.nan` filter also never removes anything, because
# NaN compares unequal to every value (itself included); np.isnan is the
# usual test.
df_temp["fl_areasqkm_sum"] = df_temp.apply(lambda x:[fl_areasqkm(comid) for comid in x.fl_areasqkm_nulls if fl_areasqkm(comid) != np.nan], axis=1)
# df_temp["fl_areasqkm_sum"] = df_temp.apply(lambda x:x.fl_areasqkm_nulls, axis=1)

# df_temp["fl_areasqkm_nulls"]
# df_temp["fl_areasqkm_sum"]
df_temp.fl_areasqkm_nulls

TypeError: 'float' object is not iterable

In [494]:
df_temp = df_merged.iloc[9:15].copy()
# df_temp

df_temp.fl_comid_list

9     [21980217, 21978365]
10                      []
11    [21632389, 21635913]
12                      []
13              [15560261]
14                      []
Name: fl_comid_list, dtype: object

In [50]:
def fl_areasqkm(comid):
    """Return the ``areasqkm`` value stored in ``nhd_stats`` for ``comid``.

    ``np.nan`` is returned when the COMID does not select exactly one row
    (no match, several matches, or ``comid=None``).

    NOTE(review): this is one of several definitions of ``fl_areasqkm`` in the
    notebook; the most recently executed cell determines which one is used.
    """
    area = nhd_stats.loc[nhd_stats["comid"] == comid, "areasqkm"]
    try:
        # .item() raises ValueError unless the selection has exactly one element.
        return np.array(area).item()
    except ValueError:
        return np.nan


In [64]:
comid0 = None # ValueError: can only convert an array of size 1 to a Python scalar
comid1 = 21632389 # ValueError: can only convert an array of size 1 to a Python scalar
comid2 = 21635913 # 8.5815
comid3 = 15560261 # nan

In [66]:
comid_list = [fl_areasqkm(comid2), fl_areasqkm(comid3)]
# np.array(comid_list).__dir__()
np.array(comid_list)[~np.isnan(np.array(comid_list))]

array([8.5815])

In [51]:
fl_areasqkm(comid1)

nan

In [408]:
nhd_stats

Unnamed: 0,comid,long_comid,lat_comid,startflag,intephem,divergence,streamorde,lengthkm,gnis_name_ind,areasqkm,totdasqkm,flow_type,distup_max,distdown_max
0,179,-67.986409,46.022164,1.0,0,0.0,1.0,2.412,0,3.5550,3.5550,1,,243.972000
1,181,-67.998723,46.016490,0.0,0,0.0,1.0,0.442,0,0.2898,3.8448,1,1.364,242.608000
2,183,-67.998835,46.020847,0.0,0,0.0,2.0,0.112,1,0.2043,8.1954,1,3.278,243.296010
3,185,-67.998621,46.019712,0.0,0,0.0,2.0,0.170,1,0.0369,8.2323,1,3.438,243.136000
4,843,-68.378758,46.246067,1.0,0,0.0,1.0,1.889,0,2.7486,2.7486,1,,285.859010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2989308,948100736,-114.836484,32.899003,1.0,1,0.0,1.0,8.559,0,9.3933,9.3933,1,,10.409000
2989309,948100737,-116.804081,34.543096,0.0,1,0.0,4.0,6.019,0,23.6052,201.5145,1,13.660,5.613000
2989310,948100738,-116.779838,34.570815,0.0,1,0.0,2.0,2.449,0,7.3683,121.1004,1,9.481,9.794000
2989311,948100739,-114.956633,34.624972,0.0,1,0.0,3.0,0.282,1,4.6683,618.0561,1,28.018,40.571999


In [404]:
df_temp = df_merged.iloc[10:15].copy()
# df_temp

# df_temp["fl_areasqkm_sum"] = df_temp.apply(lambda x: [fl_areasqkm(comid) for comid in x.fl_comid_list], axis=1)
df_temp["fl_areasqkm_sum"] = df_temp.apply(lambda x: [np.array(fl_areasqkm(comid)) for comid in x.fl_comid_list], axis=1)
df_temp["fl_areasqkm_sum"]

ValueError: can only convert an array of size 1 to a Python scalar

In [405]:
nhd_stats[nhd_stats["comid"] == 21635913]["areasqkm"].item()
# nhd_stats[nhd_stats["comid"] == 21632389]["areasqkm"].item()


8.5815

In [406]:
nhd_stats

Unnamed: 0,comid,long_comid,lat_comid,startflag,intephem,divergence,streamorde,lengthkm,gnis_name_ind,areasqkm,totdasqkm,flow_type,distup_max,distdown_max
0,179,-67.986409,46.022164,1.0,0,0.0,1.0,2.412,0,3.5550,3.5550,1,,243.972000
1,181,-67.998723,46.016490,0.0,0,0.0,1.0,0.442,0,0.2898,3.8448,1,1.364,242.608000
2,183,-67.998835,46.020847,0.0,0,0.0,2.0,0.112,1,0.2043,8.1954,1,3.278,243.296010
3,185,-67.998621,46.019712,0.0,0,0.0,2.0,0.170,1,0.0369,8.2323,1,3.438,243.136000
4,843,-68.378758,46.246067,1.0,0,0.0,1.0,1.889,0,2.7486,2.7486,1,,285.859010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2989308,948100736,-114.836484,32.899003,1.0,1,0.0,1.0,8.559,0,9.3933,9.3933,1,,10.409000
2989309,948100737,-116.804081,34.543096,0.0,1,0.0,4.0,6.019,0,23.6052,201.5145,1,13.660,5.613000
2989310,948100738,-116.779838,34.570815,0.0,1,0.0,2.0,2.449,0,7.3683,121.1004,1,9.481,9.794000
2989311,948100739,-114.956633,34.624972,0.0,1,0.0,3.0,0.282,1,4.6683,618.0561,1,28.018,40.571999


In [407]:
fl_areasqkm(948100736)

9.3933


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [344]:
df_temp

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,jurisdiction_type,da_number,district,project_name,longitude,latitude,date_issued_or_denied,rha_determination,...,fl_length_sum,fl_length_sum_,fl_areasqkm_sum,fl_totdasqkm_sum,fl_ftype_str,fl_gnis_name_ind_count,wb_gnis_name_ind_count,fl_comid_list_len,fl_areasqkm,fl_areasqkm_mean
10,10,10,RAPANOS,LRB-1999-01857,Buffalo,"LAKEWOOD, CITY OF",-81.82489,41.47921,04/19/2016,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,
11,11,11,RAPANOS,LRB-1999-02061,Buffalo,"Fair Point Marina, (Snowden Properties LLC) fo...",-76.71328,43.34411,01/28/2020,0,...,12.004,0.0,"727546 NaN Name: areasqkm, dtype: float64",0.0,0.0,0.0,0.0,2,"727546 NaN Name: areasqkm, dtype: float64",
12,12,12,RAPANOS,LRB-2000-00809,Buffalo,Omni Property Companies - Hudson (Formerly: O...,-81.4788,41.2513,01/12/2016,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,


In [None]:
np.mean()

In [294]:
df_merged.fl_comid_list[9]

[21980217, 21978365]

In [203]:
# Pass the list itself: np.sum on a generator is deprecated and merely falls
# back to Python's built-in sum.
np.sum(df_merged.fl_length_list[9])

  np.sum(num for num in df_merged.fl_length_list[9])


6.692

In [157]:
df_merged.fl_length_sum_.describe()

count    5000.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: fl_length_sum_, dtype: float64

SyntaxError: invalid syntax (<ipython-input-102-2f9c250d1a9d>, line 1)