In [1]:
import pandas as pd
import glob
from pathlib import Path
from functools import reduce


In [2]:
# Dataset root folder and the sub-folder holding the per-basin attribute CSVs.
data_dir = Path("./Reorganized_Sesan_Data/")
attributes_path = data_dir.joinpath('attributes')
attributes_path

WindowsPath('Reorganized_Sesan_Data/attributes')

In [3]:
# Load every attributes CSV found in `attributes_path` (defined in the previous
# cell) into a list of DataFrames, `dfs`, each indexed by its first column.
#
# Raises:
#   FileNotFoundError: the folder does not exist or contains no *.csv file.
#   ValueError: a file has duplicate index values or duplicate column names.
if not attributes_path.exists():
    raise FileNotFoundError(f"Attributes folder not found at {attributes_path}")

# glob() returns entries in directory-listing order, which differs between
# machines; sort so the order of `dfs` (and of any later concat) is reproducible.
files = sorted(attributes_path.glob('*.csv'))
if not files:
    raise FileNotFoundError('No attributes files found')

# Read-in attributes into one big dataframe. Sort by both axes so we can check for identical axes.
# NOTE: the loop names `f` and `df` are read by later cells, so they are kept as-is.
dfs = []
for f in files:
    df = pd.read_csv(f, dtype={0: str})  # make sure we read the basin id as str
    df = df.set_index(df.columns[0]).sort_index(axis=0).sort_index(axis=1)
    if df.index.has_duplicates or df.columns.has_duplicates:
        raise ValueError(f'Attributes file {f} contains duplicate basin ids or features.')
    dfs.append(df)

In [4]:
# Display the frame left in `df` by the loading loop (the last file it processed).
df

Unnamed: 0_level_0,AREA,SLOPE_MEAN,mean_elevation
SUBID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8154,77500000.0,1.705177,610.960215


In [5]:
# # path = "./Reorganized_Sesan_Data/attributes/"

# dfs = []
# for f in attributes_path.glob("*.csv"):
#     df = pd.read_csv(f, dtype={0: str}).T  # make sure we read the basin id as str
#     df = df.set_index(df.columns[0])
#     df = df.sort_index(axis=0)
#     df = df.sort_index(axis=1)
#     if df.index.has_duplicates or df.columns.has_duplicates:
#         raise ValueError(f'Attributes file {f} contains duplicate basin ids or features.')
#     dfs.append(df)
#     # dfs.append(df)

In [6]:
# df = pd.read_csv(f, dtype={0: str})  # make sure we read the basin id as str
# df.set_index(df.columns[0])

In [7]:
# Display the list of per-file attribute frames collected by the loading loop.
dfs

[             AREA  SLOPE_MEAN  mean_elevation
 SUBID                                        
 10206  91500000.0    2.400893      787.998984,
                AREA  SLOPE_MEAN  mean_elevation
 SUBID                                          
 10305  1.210000e+09    3.243262      981.340993,
              AREA  SLOPE_MEAN  mean_elevation
 SUBID                                        
 8154   77500000.0    1.705177      610.960215]

In [8]:
# Re-read the file still referenced by the loop variable `f`, without setting an
# index, to inspect the raw column layout of an attributes CSV.
pd.read_csv(f, dtype={0: str}).sort_index(axis=0)


Unnamed: 0,SUBID,mean_elevation,SLOPE_MEAN,AREA
0,8154,610.960215,1.705177,77500000.0


In [9]:
# Number of index labels present in every frame of `dfs`
# (0 means the files have no basin id in common).
len(reduce(lambda shared, nxt: shared.intersection(nxt), [frame.index for frame in dfs]))

0

In [10]:
# The index of each loaded frame, in the same order as `dfs`.
seq = [frame.index for frame in dfs]
seq

[Index(['10206'], dtype='object', name='SUBID'),
 Index(['10305'], dtype='object', name='SUBID'),
 Index(['8154'], dtype='object', name='SUBID')]

In [11]:
# Merge the per-file frames in `dfs` into a single frame `df`.
#   * one file                 -> use it as is
#   * files share basin ids    -> join column-wise (axis=1), keep the common basins
#   * files share column names -> stack row-wise (axis=0), keep the common columns
#   * neither overlaps         -> ValueError
# A ValueError is also raised when the files share basins AND an attribute name
# occurs in more than one file.
if len(dfs) == 1:
    df = dfs[0]
else:
    if len(reduce(lambda idx, other_idx: idx.intersection(other_idx), (df.index for df in dfs))) > 0:
        # basin intersection is non-empty -> concatenate attributes, keep intersection of basins
        # The duplicate check uses pandas only: numpy is first imported in a later
        # cell, so referencing `np` here raised NameError on a fresh kernel.
        if pd.Index([col for frame in dfs for col in frame.columns]).has_duplicates:
            raise ValueError('If attributes dataframes refer to the same basins, no attribute name may occur '
                             'multiple times across the different attributes files.')
        concat_axis = 1
    elif len(reduce(lambda cols, other_cols: cols.intersection(other_cols), (df.columns for df in dfs))) > 0:
        # attributes intersection is non-empty -> concatenate basins, keep intersection of attributes
        # no need to check for basin duplicates, since then we'd have had a non-empty basin intersection.
        concat_axis = 0
    else:
        raise ValueError('Attribute files must overlap on either the index or the columns.')

    # join='inner' keeps only the labels common to all frames on the other axis.
    df = pd.concat(dfs, axis=concat_axis, join='inner')

In [12]:
# Add a constant test column so the zero-std detection below has something to find.
# A scalar is broadcast to every row, so this works for any number of basins
# (the previous literal [0,0,0] only fitted a frame with exactly three rows).
# NOTE: this modifies `df` in place.
df['std_0_column'] = 0

In [13]:
# Names of the columns whose standard deviation is exactly zero.
zero_std_cols = list(df.columns[df.std().eq(0)])
zero_std_cols

['std_0_column']

In [14]:
import numpy as np  # kept in this cell: the later std checks call np.isnan

# Mean of the AREA column across all rows of `df`.
df['AREA'].mean()

459666666.6666667

In [15]:
# Per-column standard deviation of `df` (pandas default, i.e. sample std).
df.std()

AREA              6.498454e+08
SLOPE_MEAN        7.702070e-01
mean_elevation    1.852502e+02
std_0_column      0.000000e+00
dtype: float64

In [16]:
# Flag the columns whose standard deviation came out as NaN.
df.std().isna()

AREA              False
SLOPE_MEAN        False
mean_elevation    False
std_0_column      False
dtype: bool

In [17]:
# True when at least one column has a standard deviation of zero or NaN.
bool((df.std().eq(0.0) | df.std().isna()).any())

True

In [18]:
# Collect the name of every column whose standard deviation is zero or NaN.
attributes = [
    name
    for name, spread in df.std().items()
    if spread == 0 or np.isnan(spread)
]

In [19]:
# Show the column names collected by the zero/NaN-std scan.
attributes

['std_0_column']

In [20]:
# Display the current `df`, which at this point includes `std_0_column`.
df

Unnamed: 0_level_0,AREA,SLOPE_MEAN,mean_elevation,std_0_column
SUBID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10206,91500000.0,2.400893,787.998984,0
10305,1210000000.0,3.243262,981.340993,0
8154,77500000.0,1.705177,610.960215,0


In [21]:
# df = self._load_attributes()

# Restrict `df` to the required static attributes, in this order.
# The list is named once so the presence check and the selection use the same names.
static_attributes = ['mean_elevation', 'SLOPE_MEAN', 'AREA']
missing_attrs = [name for name in static_attributes if name not in df.columns]
if missing_attrs:
    raise ValueError(f'Static attributes {missing_attrs} are missing.')
df = df[static_attributes]

In [22]:
# Display `df` after it has been restricted to the selected static attributes.
df

Unnamed: 0_level_0,mean_elevation,SLOPE_MEAN,AREA
SUBID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10206,787.998984,2.400893,91500000.0
10305,981.340993,3.243262,1210000000.0
8154,610.960215,1.705177,77500000.0


In [23]:
# Check the dtype of each remaining column.
df.dtypes

mean_elevation    float64
SLOPE_MEAN        float64
AREA              float64
dtype: object

In [24]:
# Abort when any attribute column has a standard deviation of zero or NaN,
# listing the offending column names in the error message.
attributes = [col for col, sigma in df.std().items() if sigma == 0 or np.isnan(sigma)]
if attributes:
    raise RuntimeError(
        "The following attributes have a std of zero or NaN, which results in NaN's "
        "when normalizing the features. Remove the attributes from the attribute feature list "
        "and restart the run. \n"
        f"Attributes: {attributes}"
    )



In [26]:
# True when at least one remaining column has a standard deviation of zero or NaN.
bool((df.std().eq(0.0) | df.std().isna()).any())

False