In [None]:
import os

import numpy as np
import pandas as pd

#from lib import datasets_path
#from .pd_dataset import PandasDataset
from ..utils.utils import disjoint_months, infer_mask, compute_mean, geographical_distance, thresholded_gaussian_kernel


class AirQuality(PandasDataset):
    SEED = 3210

    def __init__(self, impute_nans=False, small=False, freq='60T', masked_sensors=None):
        self.random = np.random.default_rng(self.SEED)
        self.test_months = [3, 6, 9, 12]
        self.infer_eval_from = 'next'
        self.eval_mask = None
        df, dist, mask = self.load(impute_nans=impute_nans, small=small, masked_sensors=masked_sensors)
        self.dist = dist
        if masked_sensors is None:
            self.masked_sensors = list()
        else:
            self.masked_sensors = list(masked_sensors)
        super().__init__(dataframe=df, u=None, mask=mask, name='air', freq=freq, aggr='nearest')

    def load_raw(self, small=False):
        if small:
            path = os.path.join(datasets_path['air'], 'small36.h5')
            eval_mask = pd.DataFrame(pd.read_hdf(path, 'eval_mask'))
        else:
            path = os.path.join(datasets_path['air'], 'full437.h5')
            eval_mask = None
        df = pd.DataFrame(pd.read_hdf(path, 'pm25'))
        stations = pd.DataFrame(pd.read_hdf(path, 'stations'))
        return df, stations, eval_mask

    def load(self, impute_nans=True, small=False, masked_sensors=None):
        # load readings and stations metadata
        df, stations, eval_mask = self.load_raw(small)
        # compute the masks
        mask = (~np.isnan(df.values)).astype('uint8')  # 1 if value is not nan else 0
        if eval_mask is None:
            eval_mask = infer_mask(df, infer_from=self.infer_eval_from)

        eval_mask = eval_mask.values.astype('uint8')
        if masked_sensors is not None:
            eval_mask[:, masked_sensors] = np.where(mask[:, masked_sensors], 1, 0)
        self.eval_mask = eval_mask  # 1 if value is ground-truth for imputation else 0
        # eventually replace nans with weekly mean by hour
        if impute_nans:
            df = df.fillna(compute_mean(df))
        # compute distances from latitude and longitude degrees
        st_coord = stations.loc[:, ['latitude', 'longitude']]
        dist = geographical_distance(st_coord, to_rad=True).values
        return df, dist, mask

    def splitter(self, dataset, val_len=1., in_sample=False, window=0):
        nontest_idxs, test_idxs = disjoint_months(dataset, months=self.test_months, synch_mode='horizon')
        if in_sample:
            train_idxs = np.arange(len(dataset))
            val_months = [(m - 1) % 12 for m in self.test_months]
            _, val_idxs = disjoint_months(dataset, months=val_months, synch_mode='horizon')
        else:
            # take equal number of samples before each month of testing
            val_len = (int(val_len * len(nontest_idxs)) if val_len < 1 else val_len) // len(self.test_months)
            # get indices of first day of each testing month
            delta_idxs = np.diff(test_idxs)
            end_month_idxs = test_idxs[1:][np.flatnonzero(delta_idxs > delta_idxs.min())]
            if len(end_month_idxs) < len(self.test_months):
                end_month_idxs = np.insert(end_month_idxs, 0, test_idxs[0])
            # expand month indices
            month_val_idxs = [np.arange(v_idx - val_len, v_idx) - window for v_idx in end_month_idxs]
            val_idxs = np.concatenate(month_val_idxs) % len(dataset)
            # remove overlapping indices from training set
            ovl_idxs, _ = dataset.overlapping_indices(nontest_idxs, val_idxs, synch_mode='horizon', as_mask=True)
            train_idxs = nontest_idxs[~ovl_idxs]
        return [train_idxs, val_idxs, test_idxs]

    def get_similarity(self, thr=0.1, include_self=False, force_symmetric=False, sparse=False, **kwargs):
        theta = np.std(self.dist[:36, :36])  # use same theta for both air and air36
        adj = thresholded_gaussian_kernel(self.dist, theta=theta, threshold=thr)
        if not include_self:
            adj[np.diag_indices_from(adj)] = 0.
        if force_symmetric:
            adj = np.maximum.reduce([adj, adj.T])
        if sparse:
            import scipy.sparse as sps
            adj = sps.coo_matrix(adj)
        return adj

    @property
    def mask(self):
        return self._mask

    @property
    def training_mask(self):
        return self._mask if self.eval_mask is None else (self._mask & (1 - self.eval_mask))

    def test_interval_mask(self, dtype=bool, squeeze=True):
        m = np.in1d(self.df.index.month, self.test_months).astype(dtype)
        if squeeze:
            return m
        return m[:, None]

In [1]:
import os

import numpy as np
import pandas as pd

import torch

In [23]:
from sklearn.metrics.pairwise import haversine_distances


In [2]:
datasets_path = {
    'air': 'datasets/air_quality',
    'la': 'datasets/metr_la',
    'bay': 'datasets/pems_bay',
    'synthetic': 'datasets/synthetic',
    
}

In [3]:
class PandasDataset:
    def __init__(self, dataframe: pd.DataFrame, u: pd.DataFrame = None, name='pd-dataset', mask=None, freq=None,
                 aggr='sum', **kwargs):
        """
        Initialize a tsl dataset from a pandas dataframe.


        :param dataframe: dataframe containing the data, shape: n_steps, n_nodes
        :param u: dataframe with exog variables
        :param name: optional name of the dataset
        :param mask: mask for valid data (1:valid, 0:not valid)
        :param freq: force a frequency (possibly by resampling)
        :param aggr: aggregation method after resampling
        """
        super().__init__()
        self.name = name

        # set dataset dataframe
        self.df = dataframe

        # set optional exog_variable dataframe
        # make sure to consider only the overlapping part of the two dataframes
        # assumption u.index \in df.index
        idx = sorted(self.df.index)
        self.start = idx[0]
        self.end = idx[-1]

        if u is not None:
            self.u = u[self.start:self.end]
        else:
            self.u = None

        if mask is not None:
            mask = np.asarray(mask).astype('uint8')
        self._mask = mask

        if freq is not None:
            self.resample_(freq=freq, aggr=aggr)
        else:
            self.freq = self.df.index.inferred_freq
            # make sure that all the dataframes are aligned
            self.resample_(self.freq, aggr=aggr)

        assert 'T' in self.freq
        self.samples_per_day = int(60 / int(self.freq[:-1]) * 24)

    def __repr__(self):
        return "{}(nodes={}, length={})".format(self.__class__.__name__, self.n_nodes, self.length)

    @property
    def has_mask(self):
        return self._mask is not None

    @property
    def has_u(self):
        return self.u is not None

    def resample_(self, freq, aggr):
        resampler = self.df.resample(freq)
        idx = self.df.index
        if aggr == 'sum':
            self.df = resampler.sum()
        elif aggr == 'mean':
            self.df = resampler.mean()
        elif aggr == 'nearest':
            self.df = resampler.nearest()
        else:
            raise ValueError(f'{aggr} if not a valid aggregation method.')

        if self.has_mask:
            resampler = pd.DataFrame(self._mask, index=idx).resample(freq)
            self._mask = resampler.min().to_numpy()

        if self.has_u:
            resampler = self.u.resample(freq)
            self.u = resampler.nearest()
        self.freq = freq

    def dataframe(self) -> pd.DataFrame:
        return self.df.copy()

    @property
    def length(self):
        return self.df.values.shape[0]

    @property
    def n_nodes(self):
        return self.df.values.shape[1]

    @property
    def mask(self):
        if self._mask is None:
            return np.ones_like(self.df.values).astype('uint8')
        return self._mask

    def numpy(self, return_idx=False):
        if return_idx:
            return self.numpy(), self.df.index
        return self.df.values

    def pytorch(self):
        data = self.numpy()
        return torch.FloatTensor(data)

    def __len__(self):
        return self.length

    @staticmethod
    def build():
        raise NotImplementedError

    def load_raw(self):
        raise NotImplementedError

    def load(self):
        raise NotImplementedError

In [20]:
def geographical_distance(x=None, to_rad=True):
    """
    Compute the as-the-crow-flies distance between every pair of samples in `x`. The first dimension of each point is
    assumed to be the latitude, the second is the longitude. The inputs is assumed to be in degrees. If it is not the
    case, `to_rad` must be set to False. The dimension of the data must be 2.

    Parameters
    ----------
    x : pd.DataFrame or np.ndarray
        array_like structure of shape (n_samples_2, 2).
    to_rad : bool
        whether to convert inputs to radians (provided that they are in degrees).

    Returns
    -------
    distances :
        The distance between the points in kilometers.
    """
    _AVG_EARTH_RADIUS_KM = 6371.0088

    # Extract values of X if it is a DataFrame, else assume it is 2-dim array of lat-lon pairs
    latlon_pairs = x.values if isinstance(x, pd.DataFrame) else x

    # If the input values are in degrees, convert them in radians
    if to_rad:
        latlon_pairs = np.vectorize(np.radians)(latlon_pairs)

    distances = haversine_distances(latlon_pairs) * _AVG_EARTH_RADIUS_KM

    # Cast response
    if isinstance(x, pd.DataFrame):
        res = pd.DataFrame(distances, x.index, x.index)
    else:
        res = distances

    return res

In [4]:
def compute_mean(x, index=None):
    """Compute the mean values for each datetime. The mean is first computed hourly over the week of the year.
    Further NaN values are computed using hourly mean over the same month through the years. If other NaN are present,
    they are removed using the mean of the sole hours. Hoping reasonably that there is at least a non-NaN entry of the
    same hour of the NaN datetime in all the dataset."""
    if isinstance(x, np.ndarray) and index is not None:
        shape = x.shape
        x = x.reshape((shape[0], -1))
        df_mean = pd.DataFrame(x, index=index)
    else:
        df_mean = x.copy()
    cond0 = [df_mean.index.year, df_mean.index.isocalendar().week, df_mean.index.hour]
    cond1 = [df_mean.index.year, df_mean.index.month, df_mean.index.hour]
    conditions = [cond0, cond1, cond1[1:], cond1[2:]]
    while df_mean.isna().values.sum() and len(conditions):
        nan_mean = df_mean.groupby(conditions[0]).transform(np.nanmean)
        df_mean = df_mean.fillna(nan_mean)
        conditions = conditions[1:]
    if df_mean.isna().values.sum():
        df_mean = df_mean.fillna(method='ffill')
        df_mean = df_mean.fillna(method='bfill')
    if isinstance(x, np.ndarray):
        df_mean = df_mean.values.reshape(shape)
    return df_mean

In [7]:
def load_raw(small=False):
    if small:
        path = os.path.join(datasets_path['air'], 'small36.h5')
        eval_mask = pd.DataFrame(pd.read_hdf(path, 'eval_mask'))
    else:
        path = os.path.join(datasets_path['air'], 'full437.h5')
        eval_mask = None
    df = pd.DataFrame(pd.read_hdf(path, 'pm25'))
    stations = pd.DataFrame(pd.read_hdf(path, 'stations'))
    return df, stations, eval_mask

In [21]:
def load(impute_nans=True, small=False, masked_sensors=None):
    # load readings and stations metadata
    df, stations, eval_mask = load_raw(small)
    # compute the masks
    mask = (~np.isnan(df.values)).astype('uint8')  # 1 if value is not nan else 0
    if eval_mask is None:
        eval_mask = infer_mask(df, infer_from='next')
    eval_mask = eval_mask.values.astype('uint8')
    if masked_sensors is not None:
        eval_mask[:, masked_sensors] = np.where(mask[:, masked_sensors], 1, 0)
    #self.eval_mask = eval_mask  # 1 if value is ground-truth for imputation else 0
        # eventually replace nans with weekly mean by hour
    if impute_nans:
        df = df.fillna(compute_mean(df))
        # compute distances from latitude and longitude degrees
    st_coord = stations.loc[:, ['latitude', 'longitude']]
    dist = geographical_distance(st_coord, to_rad=True).values
    return df, dist, mask

In [102]:
df, stations, eval_mask = load_raw(small=True)

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8759 entries, 2014-05-01 01:00:00 to 2015-04-30 23:00:00
Freq: H
Data columns (total 36 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1001    7388 non-null   float32
 1   1002    7441 non-null   float32
 2   1003    8202 non-null   float32
 3   1004    7499 non-null   float32
 4   1005    7512 non-null   float32
 5   1006    8140 non-null   float32
 6   1007    7832 non-null   float32
 7   1008    7002 non-null   float32
 8   1009    8147 non-null   float32
 9   1010    7503 non-null   float32
 10  1011    7240 non-null   float32
 11  1012    8113 non-null   float32
 12  1013    8210 non-null   float32
 13  1014    7306 non-null   float32
 14  1015    8104 non-null   float32
 15  1016    7306 non-null   float32
 16  1017    7455 non-null   float32
 17  1018    7405 non-null   float32
 18  1019    7146 non-null   float32
 19  1020    8026 non-null   float32
 20  1021    7282 non-null   float3

In [104]:
eval_mask

sensor_id,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1027,1028,1029,1030,1031,1032,1033,1034,1035,1036
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-05-01 01:00:00,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
2014-05-01 02:00:00,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,True,True
2014-05-01 03:00:00,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,True,True
2014-05-01 04:00:00,False,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,True,True
2014-05-01 05:00:00,True,False,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-04-30 19:00:00,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,True
2015-04-30 20:00:00,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,True
2015-04-30 21:00:00,False,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,True
2015-04-30 22:00:00,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [38]:
(eval_mask == 0).astype(int).sum()

sensor_id
1001    7579
1002    7672
1003    8240
1004    7685
1005    7689
1006    8180
1007    7958
1008    7332
1009    8187
1010    7673
1011    7474
1012    8174
1013    8250
1014    7534
1015    8143
1016    7538
1017    7625
1018    7582
1019    7482
1020    8105
1021    7550
1022    7962
1023    8162
1024    8132
1025    7694
1026    7646
1027    8059
1028    7660
1029    7705
1030    8253
1031    7638
1032    7474
1033    7606
1034    7317
1035    7207
1036    7420
dtype: int64

In [24]:
df, dist, mask = load(impute_nans=False, small=True, masked_sensors=None)

In [25]:
dist

array([[ 0.        , 10.01574128, 19.62147715, ..., 66.9006241 ,
        64.3695516 , 58.68719083],
       [10.01574128,  0.        , 10.11664623, ..., 59.03696884,
        54.41792626, 50.29874425],
       [19.62147715, 10.11664623,  0.        , ..., 55.87528003,
        44.96035584, 40.38356926],
       ...,
       [66.9006241 , 59.03696884, 55.87528003, ...,  0.        ,
        46.55543621, 68.62845597],
       [64.3695516 , 54.41792626, 44.96035584, ..., 46.55543621,
         0.        , 26.57355156],
       [58.68719083, 50.29874425, 40.38356926, ..., 68.62845597,
        26.57355156,  0.        ]])

In [29]:
def thresholded_gaussian_kernel(x, theta=None, threshold=None, threshold_on_input=False):
    if theta is None:
        theta = np.std(x)
    weights = np.exp(-np.square(x / theta))
    if threshold is not None:
        mask = x > threshold if threshold_on_input else weights < threshold
        weights[mask] = 0.
    return weights

In [30]:
    def get_similarity(dist, thr=0.1, include_self=False, force_symmetric=False, sparse=False, **kwargs):
        theta = np.std(dist[:36, :36])  # use same theta for both air and air36
        adj = thresholded_gaussian_kernel(dist, theta=theta, threshold=thr)
        if not include_self:
            adj[np.diag_indices_from(adj)] = 0.
        if force_symmetric:
            adj = np.maximum.reduce([adj, adj.T])
        if sparse:
            import scipy.sparse as sps
            adj = sps.coo_matrix(adj)
        return adj

In [31]:
adj=get_similarity(dist)

In [99]:
adj.shape

(36, 36)

In [None]:
class AirQuality(PandasDataset):
    SEED = 3210

    def __init__(self, impute_nans=False, small=False, freq='60T', masked_sensors=None):
        self.random = np.random.default_rng(self.SEED)
        self.test_months = [3, 6, 9, 12]
        self.infer_eval_from = 'next'
        self.eval_mask = None
        df, dist, mask = self.load(impute_nans=impute_nans, small=small, masked_sensors=masked_sensors)
        self.dist = dist
        if masked_sensors is None:
            self.masked_sensors = list()
        else:
            self.masked_sensors = list(masked_sensors)
        super().__init__(dataframe=df, u=None, mask=mask, name='air', freq=freq, aggr='nearest')

    def load_raw(self, small=False):
        if small:
            path = os.path.join(datasets_path['air'], 'small36.h5')
            eval_mask = pd.DataFrame(pd.read_hdf(path, 'eval_mask'))
        else:
            path = os.path.join(datasets_path['air'], 'full437.h5')
            eval_mask = None
        df = pd.DataFrame(pd.read_hdf(path, 'pm25'))
        stations = pd.DataFrame(pd.read_hdf(path, 'stations'))
        return df, stations, eval_mask

    def load(self, impute_nans=True, small=False, masked_sensors=None):
        # load readings and stations metadata
        df, stations, eval_mask = self.load_raw(small)
        # compute the masks
        mask = (~np.isnan(df.values)).astype('uint8')  # 1 if value is not nan else 0
        if eval_mask is None:
            eval_mask = infer_mask(df, infer_from=self.infer_eval_from)

        eval_mask = eval_mask.values.astype('uint8')
        if masked_sensors is not None:
            eval_mask[:, masked_sensors] = np.where(mask[:, masked_sensors], 1, 0)
        self.eval_mask = eval_mask  # 1 if value is ground-truth for imputation else 0
        # eventually replace nans with weekly mean by hour
        if impute_nans:
            df = df.fillna(compute_mean(df))
        # compute distances from latitude and longitude degrees
        st_coord = stations.loc[:, ['latitude', 'longitude']]
        dist = geographical_distance(st_coord, to_rad=True).values
        return df, dist, mask

    def splitter(self, dataset, val_len=1., in_sample=False, window=0):
        nontest_idxs, test_idxs = disjoint_months(dataset, months=self.test_months, synch_mode='horizon')
        if in_sample:
            train_idxs = np.arange(len(dataset))
            val_months = [(m - 1) % 12 for m in self.test_months]
            _, val_idxs = disjoint_months(dataset, months=val_months, synch_mode='horizon')
        else:
            # take equal number of samples before each month of testing
            val_len = (int(val_len * len(nontest_idxs)) if val_len < 1 else val_len) // len(self.test_months)
            # get indices of first day of each testing month
            delta_idxs = np.diff(test_idxs)
            end_month_idxs = test_idxs[1:][np.flatnonzero(delta_idxs > delta_idxs.min())]
            if len(end_month_idxs) < len(self.test_months):
                end_month_idxs = np.insert(end_month_idxs, 0, test_idxs[0])
            # expand month indices
            month_val_idxs = [np.arange(v_idx - val_len, v_idx) - window for v_idx in end_month_idxs]
            val_idxs = np.concatenate(month_val_idxs) % len(dataset)
            # remove overlapping indices from training set
            ovl_idxs, _ = dataset.overlapping_indices(nontest_idxs, val_idxs, synch_mode='horizon', as_mask=True)
            train_idxs = nontest_idxs[~ovl_idxs]
        return [train_idxs, val_idxs, test_idxs]

    def get_similarity(self, thr=0.1, include_self=False, force_symmetric=False, sparse=False, **kwargs):
        theta = np.std(self.dist[:36, :36])  # use same theta for both air and air36
        adj = thresholded_gaussian_kernel(self.dist, theta=theta, threshold=thr)
        if not include_self:
            adj[np.diag_indices_from(adj)] = 0.
        if force_symmetric:
            adj = np.maximum.reduce([adj, adj.T])
        if sparse:
            import scipy.sparse as sps
            adj = sps.coo_matrix(adj)
        return adj

    @property
    def mask(self):
        return self._mask

    @property
    def training_mask(self):
        return self._mask if self.eval_mask is None else (self._mask & (1 - self.eval_mask))

    def test_interval_mask(self, dtype=bool, squeeze=True):
        m = np.in1d(self.df.index.month, self.test_months).astype(dtype)
        if squeeze:
            return m
        return m[:, None]

In [None]:
dataset = datasets.AirQuality(impute_nans=True, small=True)

In [44]:
new_eval_mask=eval_mask.iloc[2300:4100,1:36]

In [45]:
new_eval_mask.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1800 entries, 2014-08-04 21:00:00 to 2014-10-18 20:00:00
Freq: H
Data columns (total 35 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   1002    1800 non-null   bool 
 1   1003    1800 non-null   bool 
 2   1004    1800 non-null   bool 
 3   1005    1800 non-null   bool 
 4   1006    1800 non-null   bool 
 5   1007    1800 non-null   bool 
 6   1008    1800 non-null   bool 
 7   1009    1800 non-null   bool 
 8   1010    1800 non-null   bool 
 9   1011    1800 non-null   bool 
 10  1012    1800 non-null   bool 
 11  1013    1800 non-null   bool 
 12  1014    1800 non-null   bool 
 13  1015    1800 non-null   bool 
 14  1016    1800 non-null   bool 
 15  1017    1800 non-null   bool 
 16  1018    1800 non-null   bool 
 17  1019    1800 non-null   bool 
 18  1020    1800 non-null   bool 
 19  1021    1800 non-null   bool 
 20  1022    1800 non-null   bool 
 21  1023    1800 non-null   bool 
 22  1024

In [56]:
new_eval_mask

sensor_id,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,...,1027,1028,1029,1030,1031,1032,1033,1034,1035,1036
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-08-04 21:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2014-08-04 22:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2014-08-04 23:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2014-08-05 00:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2014-08-05 01:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-10-18 16:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2014-10-18 17:00:00,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2014-10-18 18:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2014-10-18 19:00:00,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


discharge data read in and apply mask, then export h5 dataset

In [46]:
import json
import csv
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [47]:
relationship = pd.read_csv('discharge/relationshipset2.csv',dtype=str)

In [48]:
discharge = pd.read_csv('discharge/set2.csv')

In [49]:
discharge=discharge.set_index('datetime')#将date作为标签
discharge.index=pd.DatetimeIndex(discharge.index)#将标签转为时间索引
discharge.axes#查看

[DatetimeIndex(['2011-10-01 00:00:00', '2011-10-01 01:00:00',
                '2011-10-01 02:00:00', '2011-10-01 03:00:00',
                '2011-10-01 04:00:00', '2011-10-01 05:00:00',
                '2011-10-01 06:00:00', '2011-10-01 07:00:00',
                '2011-10-01 08:00:00', '2011-10-01 09:00:00',
                ...
                '2018-09-30 14:00:00', '2018-09-30 15:00:00',
                '2018-09-30 16:00:00', '2018-09-30 17:00:00',
                '2018-09-30 18:00:00', '2018-09-30 19:00:00',
                '2018-09-30 20:00:00', '2018-09-30 21:00:00',
                '2018-09-30 22:00:00', '2018-09-30 23:00:00'],
               dtype='datetime64[ns]', name='datetime', length=61368, freq=None),
 Index(['642', '631', '640', '636', '626', '569', '641', '624', '627', '571',
        '572', '643', '644', '570', '645', '630', '574', '625', '566', '565',
        '562', '568', '638', '573', '563', '635', '628', '634', '637', '646',
        '639', '629', '564', '632'],
      

In [50]:
discharge1800=discharge[-1800:]

In [88]:
discharge1800

Unnamed: 0_level_0,642,631,640,636,626,569,641,624,627,571,...,563,635,628,634,637,646,639,629,564,632
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-18 00:00:00,11.200,276.50,153.00,2800.0,2020.0,37.500,133.5,7250.0,2515.0,23875.0,...,20300.0,633.50,12550.0,1515.0,3192.5,24000.0,22950.0,600.25,731.0,20875.0
2018-07-18 01:00:00,11.050,277.00,152.25,2782.5,2020.0,37.500,133.0,7227.5,2520.0,23800.0,...,20300.0,633.50,12500.0,1510.0,3172.5,24025.0,22925.0,599.00,725.5,20875.0
2018-07-18 02:00:00,10.900,273.75,151.00,2765.0,2020.0,37.100,132.0,7210.0,2500.0,23800.0,...,20300.0,628.00,12475.0,1497.5,3167.5,24075.0,22900.0,594.00,721.0,20825.0
2018-07-18 03:00:00,10.900,272.00,150.00,2747.5,2010.0,36.475,132.0,7185.0,2487.5,23800.0,...,20300.0,628.00,12475.0,1495.0,3157.5,24100.0,22900.0,591.50,717.0,20900.0
2018-07-18 04:00:00,10.900,271.00,149.25,2732.5,2010.0,35.800,132.0,7175.0,2497.5,23800.0,...,20300.0,625.00,12400.0,1477.5,3145.0,24100.0,22900.0,587.75,715.5,20925.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-09-30 19:00:00,18.725,481.75,155.75,5597.5,2897.5,67.300,161.0,4620.0,4872.5,25475.0,...,16600.0,892.75,11425.0,3167.5,5867.5,24200.0,22400.0,1965.00,1367.5,17200.0
2018-09-30 20:00:00,18.750,483.25,155.75,5565.0,2885.0,70.175,162.0,4620.0,4867.5,25425.0,...,16675.0,868.25,11450.0,3150.0,5857.5,24175.0,22425.0,1947.50,1360.0,17150.0
2018-09-30 21:00:00,16.800,484.75,156.00,5557.5,2880.0,77.400,162.0,4615.0,4822.5,25375.0,...,16700.0,854.75,11350.0,3135.0,5835.0,24200.0,22400.0,1937.50,1360.0,17175.0
2018-09-30 22:00:00,18.375,486.25,156.00,5552.5,2867.5,79.100,161.5,4612.5,4825.0,25350.0,...,16700.0,844.75,11325.0,3125.0,5802.5,24200.0,22400.0,1922.50,1357.5,17200.0


In [51]:
mask=discharge1800.copy()

In [53]:
mask.shape[1]

34

In [58]:
new_eval_mask=pd.DataFrame(new_eval_mask).values

In [61]:
for i in range(mask.shape[1]):
    mask.iloc[:,i]=new_eval_mask[:,i]

  mask.iloc[:,i]=new_eval_mask[:,i]


In [62]:
mask

Unnamed: 0_level_0,642,631,640,636,626,569,641,624,627,571,...,563,635,628,634,637,646,639,629,564,632
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-18 00:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2018-07-18 01:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2018-07-18 02:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2018-07-18 03:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2018-07-18 04:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-09-30 19:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2018-09-30 20:00:00,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2018-09-30 21:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2018-09-30 22:00:00,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [63]:
(mask == 0).astype(int).sum()

642    1275
631    1728
640    1302
636    1308
626    1655
569    1718
641    1354
624    1680
627    1305
571    1302
572    1694
643    1724
644    1249
570    1705
645    1292
630    1311
574    1298
625    1281
566    1710
565    1287
562    1645
568    1725
638    1722
573    1309
563    1288
635    1694
628    1295
634    1311
637    1721
646    1294
639    1292
629    1309
564    1266
632    1286
dtype: int64

In [70]:
mask.shape[1]

34

In [81]:
dist=pd.DataFrame(np.zeros([mask.shape[1],mask.shape[1]]),columns=mask.columns,index=mask.columns)

In [82]:
dist

Unnamed: 0,642,631,640,636,626,569,641,624,627,571,...,563,635,628,634,637,646,639,629,564,632
642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
for i in range(relationship.shape[0]):
    dist.loc[relationship.iloc[i,1],relationship.iloc[i,0]]=1

In [84]:
dist

Unnamed: 0,642,631,640,636,626,569,641,624,627,571,...,563,635,628,634,637,646,639,629,564,632
642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
(dist == 1).astype(int).sum().sum()

33

In [87]:
store = pd.HDFStore('water.h5')

In [89]:
store.put(key='eval_mask',value=mask);
store.put(key='discharge',value=discharge1800);
store.put(key='dist',value=dist);

In [92]:
store.close()

In [93]:
def load_raw_water():
    path='water.h5'
    eval_mask = pd.DataFrame(pd.read_hdf(path, 'eval_mask'))
    df = pd.DataFrame(pd.read_hdf(path, 'discharge'))
    dist = pd.DataFrame(pd.read_hdf(path, 'dist'))
    return df, dist, eval_mask

In [94]:
df, dist, eval_mask = load_raw_water()

In [95]:
df

Unnamed: 0_level_0,642,631,640,636,626,569,641,624,627,571,...,563,635,628,634,637,646,639,629,564,632
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-18 00:00:00,11.200,276.50,153.00,2800.0,2020.0,37.500,133.5,7250.0,2515.0,23875.0,...,20300.0,633.50,12550.0,1515.0,3192.5,24000.0,22950.0,600.25,731.0,20875.0
2018-07-18 01:00:00,11.050,277.00,152.25,2782.5,2020.0,37.500,133.0,7227.5,2520.0,23800.0,...,20300.0,633.50,12500.0,1510.0,3172.5,24025.0,22925.0,599.00,725.5,20875.0
2018-07-18 02:00:00,10.900,273.75,151.00,2765.0,2020.0,37.100,132.0,7210.0,2500.0,23800.0,...,20300.0,628.00,12475.0,1497.5,3167.5,24075.0,22900.0,594.00,721.0,20825.0
2018-07-18 03:00:00,10.900,272.00,150.00,2747.5,2010.0,36.475,132.0,7185.0,2487.5,23800.0,...,20300.0,628.00,12475.0,1495.0,3157.5,24100.0,22900.0,591.50,717.0,20900.0
2018-07-18 04:00:00,10.900,271.00,149.25,2732.5,2010.0,35.800,132.0,7175.0,2497.5,23800.0,...,20300.0,625.00,12400.0,1477.5,3145.0,24100.0,22900.0,587.75,715.5,20925.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-09-30 19:00:00,18.725,481.75,155.75,5597.5,2897.5,67.300,161.0,4620.0,4872.5,25475.0,...,16600.0,892.75,11425.0,3167.5,5867.5,24200.0,22400.0,1965.00,1367.5,17200.0
2018-09-30 20:00:00,18.750,483.25,155.75,5565.0,2885.0,70.175,162.0,4620.0,4867.5,25425.0,...,16675.0,868.25,11450.0,3150.0,5857.5,24175.0,22425.0,1947.50,1360.0,17150.0
2018-09-30 21:00:00,16.800,484.75,156.00,5557.5,2880.0,77.400,162.0,4615.0,4822.5,25375.0,...,16700.0,854.75,11350.0,3135.0,5835.0,24200.0,22400.0,1937.50,1360.0,17175.0
2018-09-30 22:00:00,18.375,486.25,156.00,5552.5,2867.5,79.100,161.5,4612.5,4825.0,25350.0,...,16700.0,844.75,11325.0,3125.0,5802.5,24200.0,22400.0,1922.50,1357.5,17200.0


In [96]:
dist

Unnamed: 0,642,631,640,636,626,569,641,624,627,571,...,563,635,628,634,637,646,639,629,564,632
642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
dist_array=dist.values

In [98]:
dist_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])