In [1]:
%matplotlib widget

In [2]:
import pandas as pd
import numpy as np
from glob import glob
import os
import datetime
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Input, Concatenate, Dot, Add, ReLU, Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [3]:
folder = 'data'
file_names = ['가평_2019.xlsx', '의암호_2019.xlsx']

# Period lengths in seconds, used for the cyclic time-of-day / time-of-year features.
day = 24*60*60
year = (365.2425)*day

# The timestamps in the sheets carry no zone. Converting them with the kernel's
# local zone made the features machine-dependent; the recorded outputs
# (Day sin = Day cos = -0.7071 on the first row) are consistent with UTC+9,
# so that zone is pinned explicitly to keep the same numbers on every machine.
station_tz = 'Asia/Seoul'
epoch = pd.Timestamp('1970-01-01', tz='UTC')

df_full = []
df = []

for file_name in file_names:
    raw = pd.read_excel(os.path.join(folder, file_name))
    df_full.append(raw)

    # Copy the measurement columns so the new feature columns are written to an
    # independent frame rather than to a slice of `raw`.
    features = raw.iloc[:, 2:11].copy()

    date_time = pd.to_datetime(raw.iloc[:, 0], format='%Y.%m.%d %H:%M')
    # Seconds since the Unix epoch, computed vectorised (NaT becomes NaN).
    timestamp_s = (date_time.dt.tz_localize(station_tz) - epoch).dt.total_seconds()

    features['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
    features['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
    features['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
    features['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))
    df.append(features)

In [4]:
df[1]


Unnamed: 0,수온,수소이온농도,전기전도도,용존산소,탁도,총유기탄소,총질소,총인,클로로필-a,Day sin,Day cos,Year sin,Year cos
0,3.0,7.2,91.0,12.2,0.7,1.3,1.469,0.001,8.0,-7.071068e-01,-7.071068e-01,-0.004430,0.999990
1,3.0,7.2,91.0,12.1,0.8,1.3,1.526,0.002,8.1,-8.660254e-01,-5.000000e-01,-0.003713,0.999993
2,3.0,7.2,91.0,12.1,0.8,1.3,1.623,0.002,8.2,-9.659258e-01,-2.588190e-01,-0.002996,0.999996
3,3.1,7.2,90.0,12.1,0.8,1.3,1.637,0.003,8.4,-1.000000e+00,-2.466750e-12,-0.002279,0.999997
4,3.1,7.2,90.0,12.1,0.8,1.3,1.563,0.002,8.5,-9.659258e-01,2.588190e-01,-0.001563,0.999999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,4.8,7.7,99.0,12.3,0.7,1.5,1.430,0.003,,5.000000e-01,-8.660254e-01,-0.012185,0.999926
8756,4.8,7.7,100.0,12.4,0.7,1.5,1.421,0.003,,2.588190e-01,-9.659258e-01,-0.011468,0.999934
8757,4.7,7.7,100.0,12.4,0.7,1.5,1.457,0.002,,2.543654e-12,-1.000000e+00,-0.010752,0.999942
8758,4.7,7.7,101.0,12.4,0.7,1.5,1.447,0.003,,-2.588190e-01,-9.659258e-01,-0.010035,0.999950


In [5]:
# normalize data

# Pool every station so that all frames are standardised with the same statistics.
df_all = pd.concat(df)
train_mean, train_std = df_all.mean(), df_all.std()

# NOTE: this overwrites the entries of `df`, so running the cell twice
# standardises the data twice. `df_all` keeps the un-normalised values.
df[:len(file_names)] = [(df[i] - train_mean) / train_std
                        for i in range(len(file_names))]

In [6]:
df[0]

Unnamed: 0,수온,수소이온농도,전기전도도,용존산소,탁도,총유기탄소,총질소,총인,클로로필-a,Day sin,Day cos,Year sin,Year cos
0,-1.496853,-0.945645,-1.065617,0.998777,-0.669890,-0.875013,0.872656,-0.515611,-1.026082,-9.999715e-01,-9.999715e-01,-0.006269,1.415570
1,-1.496853,-0.945645,-1.065617,0.998777,-0.591824,-0.875013,1.020773,-0.249286,-0.988738,-1.224710e+00,-7.070866e-01,-0.005255,1.415574
2,-1.496853,-0.945645,-1.065617,0.998777,-0.747956,-0.875013,0.850980,-0.781935,-1.026082,-1.365986e+00,-3.660150e-01,-0.004242,1.415577
3,-1.510410,-0.945645,-1.065617,0.998777,-0.669890,-0.875013,0.887106,-0.249286,-1.013634,-1.414173e+00,-3.519073e-12,-0.003229,1.415580
4,-1.496853,-0.945645,-1.065617,0.947057,-0.669890,-0.875013,0.883494,-0.515611,-1.013634,-1.365986e+00,3.660150e-01,-0.002215,1.415582
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,-1.388397,-0.289409,0.698099,0.998777,-0.201494,0.704985,0.681187,-1.048260,1.326626,7.070866e-01,-1.224710e+00,-0.017232,1.415479
8756,-1.401954,-0.289409,0.698099,0.998777,-0.201494,0.309986,0.710088,-1.314584,1.227040,3.660150e-01,-1.365986e+00,-0.016219,1.415491
8757,-1.415511,-0.289409,0.698099,0.998777,-0.201494,0.309986,0.753440,-1.048260,1.152351,3.576374e-12,-1.414173e+00,-0.015206,1.415502
8758,-1.415511,-0.289409,0.698099,0.998777,-0.201494,0.309986,0.699250,-1.048260,1.177248,-3.660150e-01,-1.365986e+00,-0.014193,1.415512


In [7]:
# NOTE(review): all three splits are the same normalised frame (first station),
# so nothing is held out for validation or testing.
train_df = val_df = test_df = df[0]

In [8]:
class WindowGenerator():
  """Describes how a time series is cut into (inputs, labels) windows.

  A window spans `input_width + shift` consecutive rows. The first
  `input_width` rows are the inputs and the last `label_width` rows are the
  labels.

  NOTE: the default `train_df` / `val_df` / `test_df` values are the objects
  those module-level names referred to when this class statement ran;
  rebinding the names afterwards does not change the defaults.
  """

  def __init__(self, input_width, label_width, shift,
               train_df=train_df, val_df=val_df, test_df=test_df,
               label_columns=None):
    # Keep references to the three splits.
    self.train_df, self.val_df, self.test_df = train_df, val_df, test_df

    # Name -> position lookups for the label columns (only when a subset was
    # requested) and for every column of the training frame.
    self.label_columns = label_columns
    if label_columns is not None:
      self.label_columns_indices = dict(
          (name, position) for position, name in enumerate(label_columns))
    self.column_indices = dict(
        (name, position) for position, name in enumerate(train_df.columns))

    # Window geometry.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift
    self.total_window_size = input_width + shift

    # Inputs occupy the head of the window ...
    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # ... and labels occupy its tail.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

  def __repr__(self):
    summary_lines = (
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}',
    )
    return '\n'.join(summary_lines)

In [9]:
def split_window(self, features):
  """Split a batch of full windows into its input part and its label part.

  Args:
    features: tensor indexed as (batch, time, features) that covers the whole
      window.

  Returns:
    `(inputs, labels)`; when `label_columns` is set, `labels` only keeps those
    columns, in the order given.
  """
  inputs = features[:, self.input_slice, :]
  labels = features[:, self.labels_slice, :]

  if self.label_columns is not None:
    selected_columns = []
    for name in self.label_columns:
      selected_columns.append(labels[:, :, self.column_indices[name]])
    labels = tf.stack(selected_columns, axis=-1)

  # Slicing loses the static shape, so restore the known time dimension to keep
  # the resulting `tf.data.Dataset` objects easy to inspect.
  inputs.set_shape([None, self.input_width, None])
  labels.set_shape([None, self.label_width, None])
  return inputs, labels

WindowGenerator.split_window = split_window

In [10]:
import matplotlib
import matplotlib.font_manager as fm
# Font with Hangul glyphs, used for axis labels that show the Korean column names.
# (The former `fm.get_fontconfig_fonts()` call was dropped: its result was never
# used and the function is no longer available in newer Matplotlib releases.)
font_location = '/usr/share/fonts/truetype/nanum/NanumGothicCoding.ttf'
#font_location = '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc'
# font_location = 'C:/Windows/Fonts/NanumGothic.ttf' # For Windows
if os.path.isfile(font_location):
    fprop = fm.FontProperties(fname=font_location)
else:
    # Without the file Matplotlib would only fail later, while drawing.
    # Fall back to the default font so the plots still render.
    print(f'font file not found: {font_location}; using the default font')
    fprop = fm.FontProperties()

In [11]:
def plot(self, model=None, plot_col='T (degC)', max_subplots=3):
  """Plot inputs, labels and (optionally) model predictions for example windows.

  Args:
    model: optional callable; `model(inputs)` must return predictions indexed
      like the labels.
    plot_col: name of the column to draw; must be a key of `column_indices`.
    max_subplots: maximum number of example windows to draw, one per row.
  """
  inputs, labels = self.example
  plt.figure(figsize=(10, 8))
  plot_col_index = self.column_indices[plot_col]
  max_n = min(max_subplots, len(inputs))

  # Position of the plotted column inside the label tensor (None when the
  # column is not one of the label columns).
  if self.label_columns:
    label_col_index = self.label_columns_indices.get(plot_col, None)
  else:
    label_col_index = plot_col_index

  # Run the model once for the whole batch instead of once per subplot.
  predictions = None
  if model is not None and label_col_index is not None and max_n > 0:
    predictions = model(inputs)

  for n in range(max_n):
    # One row per example; the grid follows max_n so that values of
    # max_subplots above three work as well.
    plt.subplot(max_n, 1, n+1)
    plt.ylabel(f'{plot_col} [normed]', fontproperties=fprop)
    plt.plot(self.input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)

    if label_col_index is None:
      continue

    plt.scatter(self.label_indices, labels[n, :, label_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
    if predictions is not None:
      plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)

    if n == 0:
      plt.legend()

  plt.xlabel('Time [h]')

WindowGenerator.plot = plot

In [12]:
# not used
# original make_dataset code
def make_dataset(self, data):
  """Build a shuffled, batched dataset of (inputs, labels) windows from `data`.

  Every run of `total_window_size` consecutive rows becomes one window;
  windows are grouped in batches of 32 and split by `split_window`.
  """
  windows = tf.keras.preprocessing.timeseries_dataset_from_array(
      data=np.array(data, dtype=np.float32),
      targets=None,
      sequence_length=self.total_window_size,
      sequence_stride=1,
      shuffle=True,
      batch_size=32,)
  return windows.map(self.split_window)

#WindowGenerator.make_dataset = make_dataset

In [13]:
w2 = WindowGenerator(input_width=6, label_width=1, shift=1,
                     label_columns=None)
w2

Total window size: 7
Input indices: [0 1 2 3 4 5]
Label indices: [6]
Label column name(s): None

In [14]:
# Stack three slices, each as long as the total window, taken at rows 0, 100 and 200:
example_window = tf.stack([
    np.array(train_df[first_row:first_row + w2.total_window_size])
    for first_row in (0, 100, 200)
])

example_inputs, example_labels = w2.split_window(example_window)

print('All shapes are: (batch, time, features)')
print('\n'.join([
    f'Window shape: {example_window.shape}',
    f'Inputs shape: {example_inputs.shape}',
    f'labels shape: {example_labels.shape}',
]))

All shapes are: (batch, time, features)
Window shape: (3, 7, 13)
Inputs shape: (3, 6, 13)
labels shape: (3, 1, 13)


In [15]:
w2.example = example_inputs, example_labels

In [16]:
w2.plot(plot_col='수온')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
@property
def train(self):
  """Training dataset; rebuilt by `make_dataset` on every access."""
  return self.make_dataset(self.train_df)

@property
def val(self):
  """Validation dataset; rebuilt by `make_dataset` on every access."""
  return self.make_dataset(self.val_df)

@property
def test(self):
  """Test dataset; rebuilt by `make_dataset` on every access."""
  return self.make_dataset(self.test_df)

@property
def example(self):
  """Get and cache an example batch of `inputs, labels` for plotting."""
  result = getattr(self, '_example', None)
  if result is None:
    # An instance that was given `.example` before this property was attached
    # to the class still holds that value in its instance dict; honour it.
    result = self.__dict__.get('example')
  if result is None:
    # No example batch was found, so get one from the `.train` dataset
    result = next(iter(self.train))
  # And cache it for next time
  self._example = result
  return result

@example.setter
def example(self, value):
  """Allow `window.example = (inputs, labels)` once the property is installed."""
  self._example = value

WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example

In [18]:
def sample_batch_index(total, batch_size):
    '''Draw distinct sample indices for one mini-batch.

    Args:
        - total: total number of samples
        - batch_size: number of indices to return

    Returns:
        - batch_idx: the first `batch_size` entries of a random permutation
          of `range(total)`
    '''
    return np.random.permutation(total)[:batch_size]

In [19]:
def binary_sampler(p, shape):
  '''Draw a matrix of independent 0/1 values.

  Args:
    - p: probability that an entry equals 1
    - shape: shape of the matrix to generate

  Returns:
    - binary_random_matrix: integer matrix holding 1 where a uniform draw
      from [0, 1) fell below p and 0 elsewhere.
  '''
  below_threshold = np.random.uniform(0., 1., size = shape) < p
  return 1*below_threshold

In [20]:
def uniform_sampler(low, high, shape):
  '''Draw a matrix of independent uniform random numbers.

  Args:
    - low: lower bound of the sampling interval
    - high: upper bound of the sampling interval
    - shape: shape of the matrix to generate

  Returns:
    - uniform_random_matrix: matrix of the requested shape filled with
      uniform draws.
  '''
  uniform_random_matrix = np.random.uniform(low=low, high=high, size=shape)
  return uniform_random_matrix

In [21]:
def normalization (data, parameters=None):
  '''Min-max scale every column of a 2-D table into (approximately) [0, 1].

  NaN entries are ignored when the statistics are computed and remain NaN in
  the result. Non-floating input (e.g. integers) is converted to float first,
  so that the scaled values are not truncated.

  Args:
    - data: original data, 2-D (rows x features)
    - parameters: optional dict with 'min_val' and 'max_val' as returned by a
      previous call; when given, these are applied instead of being
      recomputed from `data`

  Returns:
    - norm_data: normalized copy of the data (the input is not modified)
    - norm_parameters: dict with 'min_val' (column minimum) and 'max_val'
      (column maximum after the minimum was subtracted, i.e. the range),
      for renormalization
  '''
  norm_data = np.array(data, copy=True)
  if not np.issubdtype(norm_data.dtype, np.floating):
    norm_data = norm_data.astype(float)
  _, dim = norm_data.shape

  if parameters is None:
    # Min-max normalization, all columns at once
    min_val = np.nanmin(norm_data, axis=0).astype(float)
    norm_data -= min_val
    max_val = np.nanmax(norm_data, axis=0).astype(float)
    # The small constant avoids dividing by zero for constant columns.
    norm_data /= (max_val + 1e-6)

    # Return norm_parameters for renormalization
    norm_parameters = {'min_val': min_val,
                       'max_val': max_val}
  else:
    # Re-use previously computed statistics (one entry per column).
    min_val = np.asarray(parameters['min_val'], dtype=float)[:dim]
    max_val = np.asarray(parameters['max_val'], dtype=float)[:dim]
    norm_data -= min_val
    norm_data /= (max_val + 1e-6)

    norm_parameters = parameters

  return norm_data, norm_parameters

In [22]:
class MissData(object):
    """Stores missing-value patterns harvested from real data and replays them.

    `save` scans a table for runs of consecutive rows that contain NaN and
    writes two files into a directory:

    - 'miss.npy': the 0/1 NaN indicators of all kept runs, stacked row-wise
    - 'idx.npy' : one row per kept run with
      [first row inside miss.npy, number of NaN cells, number of rows,
       number of NaN cells in all earlier runs]

    `make_missdata` draws runs from such a directory and stamps them at random
    row offsets onto a copy of another table.
    """

    def __init__(self, load_dir=None):
        # Both stay None until a pattern directory has been loaded.
        self.missarr = None
        self.idxarr = None
        if load_dir:
            self.missarr = np.load(os.path.join(load_dir, 'miss.npy'))
            self.idxarr = np.load(os.path.join(load_dir, 'idx.npy'))

    def make_missdata(self, data_x, missrate=0.2):
        """Return a copy of `data_x` with stored NaN patterns stamped onto it.

        Patterns are drawn until the accumulated number of NaN cells of the
        drawn patterns would exceed `missrate` times the number of cells.

        Args:
            data_x: 2-D array whose column count matches the stored patterns.
            missrate: target fraction of cells to blank out.

        Returns:
            Float copy of `data_x` with NaN where patterns were applied.

        Raises:
            ValueError: if no patterns have been loaded.
        """
        if self.idxarr is None or self.missarr is None or len(self.idxarr) == 0:
            raise ValueError('no missing-data patterns loaded; '
                             'pass a load_dir written by MissData.save')

        data = np.array(data_x, copy=True)
        if not np.issubdtype(data.dtype, np.floating):
            # NaN cannot be stored in a non-float array.
            data = data.astype(float)
        rows, cols = data.shape
        total_miss_no = np.round(rows*cols*missrate).astype(int)

        idxarr = self.idxarr
        missarr = self.missarr
        # Cumulative NaN count before each pattern; drawing a uniform number
        # on this scale picks patterns roughly in proportion to their size.
        cum_no = idxarr[:, 3]
        cum_sum = np.max(cum_no)

        miss_no = 0
        while True:
            loc_count = np.around(np.random.random()*cum_sum)
            idx = np.count_nonzero(cum_no <= loc_count) - 1
            startnan, pattern_miss_no, nanlen = idxarr[idx][:3]
            # Random row offset; clamped so that a pattern longer than the
            # data starts at row 0 instead of at a negative offset.
            loc = np.around(np.random.random()*max(rows-nanlen, 0)).astype(int)

            miss_no += pattern_miss_no
            if (miss_no > total_miss_no):
                break

            # `target` is a view, so the assignment below edits `data`.
            target = data[loc:loc+nanlen]
            isnan = missarr[startnan:startnan+len(target)]
            target[isnan == 1] = np.nan
        return data

    @staticmethod
    def save(data, max_tseq, save_dir='save'):
        """Extract NaN-run patterns from `data` and write them to `save_dir`.

        Only runs of at most `max_tseq` consecutive rows are kept. A run that
        starts on the very first row is never registered as a start (the
        predecessor of row 0 is treated as missing) and is therefore skipped.

        Args:
            data: 2-D numeric table.
            max_tseq: maximum run length (in rows) to keep.
            save_dir: output directory, created when absent.
        """
        data = np.asarray(data, dtype=float)
        _, dim = data.shape
        isnan = np.isnan(data).astype(int)
        isany = np.any(isnan, axis=1).astype(int)

        # A run starts where a row has NaN and the previous row has none.
        shifted = np.roll(isany, 1)
        if shifted.size:
            shifted[0] = 1
        startnan = ((isany == 1) & (shifted == 0)).astype(int)
        # Number the runs 1..n; rows without NaN get 0.
        group = startnan.cumsum()*isany
        n_groups = int(np.max(group)) if group.size else 0

        patterns = []
        records = []
        cum_no = 0
        rowidx = 0
        for group_id in range(1, n_groups+1):
            member = group == group_id
            rows = int(member.sum())
            if rows > max_tseq:
                continue
            first_row = int(np.argmax(member))
            nanseq = isnan[first_row:first_row+rows, :]
            no = int(np.sum(nanseq))
            patterns.append(nanseq)
            records.append([rowidx, no, rows, cum_no])
            cum_no += no
            rowidx += rows

        # Empty results are still written (with consistent shapes) so that the
        # files always exist after a call.
        if patterns:
            missarr = np.concatenate(patterns)
            idxarr = np.array(records, dtype=int)
        else:
            missarr = np.empty((0, dim), dtype=int)
            idxarr = np.empty((0, 4), dtype=int)

        os.makedirs(save_dir, exist_ok=True)
        np.save(os.path.join(save_dir, 'miss.npy'), missarr)
        np.save(os.path.join(save_dir, 'idx.npy'), idxarr)
        print('miss_data file saved')

In [23]:
# Harvest the NaN patterns (runs of at most 10 rows) from all normalised frames.
norm_df = pd.concat(df, axis=0)
n_data = norm_df.to_numpy()
MissData.save(n_data, max_tseq=10)

# Replay the stored patterns on the first 100 rows and show where NaN landed.
n_data = n_data[:100]
isnan = np.isnan(n_data).astype(int)
miss = MissData(load_dir='save')
tt = np.isnan(miss.make_missdata(n_data)).astype(int)
tt[:50]

miss_data file saved


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,

**miss data 준비**

In [24]:
norm_df = pd.concat(df,axis=0)
norm_data = norm_df.to_numpy()
MissData.save(norm_data, max_tseq = 12)

miss_data file saved


In [25]:
def interpolate(np_data, max_gap=3):
    """Fill short runs of missing values column by column; blank out the rest.

    Args:
        np_data: 2-D table accepted by ``pd.DataFrame`` (array or DataFrame).
        max_gap: run-length threshold. A missing cell keeps its filled value
            only when the run of missing rows it belongs to has fewer than
            ``max_gap`` rows.

    Returns:
        DataFrame in which originally observed cells and short gaps hold
        values and every other cell is NaN.
    """
    # NOTE(review): pandas documents method='polynomial' as SciPy-backed, so
    # SciPy presumably has to be installed - confirm in the target environment.
    data = pd.DataFrame(np_data)
    
    # create mask
    mask = data.copy()
    # Per column: a run id that increases whenever the null / non-null status
    # differs from the previous row.
    grp = ((mask.notnull() != mask.shift().notnull()).cumsum())
    # Helper column so that groupby(...).transform('count') yields run lengths.
    # NOTE(review): an input column literally named 'ones' would be overwritten here.
    grp['ones'] = 1
    for i in data.columns:
        # Keep a cell when it was observed, or when its run has fewer than max_gap rows.
        mask[i] = (grp.groupby(i)['ones'].transform('count') < max_gap) | data[i].notnull()
    # Interpolate (order-5 polynomial, at most max_gap consecutive NaN per gap),
    # back-fill what is still missing, then index with the boolean mask so that
    # every cell not marked True becomes NaN again.
    data = data.interpolate(method='polynomial', order=5, limit=max_gap, axis=0).bfill()[mask]
    return data
    
#filled_data = interpolate(norm_data, max_gap=3)
#np.arange(0, 5, dtype=int)
#['%d'%val for val in range(0,5)]

In [26]:
from tensorflow import keras

class GainDataGenerator(keras.utils.Sequence):
    """Serves batches of (masked windows, complete windows) for GAIN training.

    Each source table is gap-filled with `interpolate`, all tables are
    concatenated, and every stretch of consecutive NaN-free rows is cut into
    sliding windows of `input_width` rows. A fixed mask (random, or replayed
    from stored patterns) decides which cells are hidden (NaN) in the input
    copy of a window.

    Attributes set here and read elsewhere in the notebook:
        data:     concatenated (optionally min-max normalised) table
        data_m:   mask with the shape of `data`; 0 marks a hidden cell
        data_idx: first row of every usable window
        shape:    (batch_size, input_width, n_features)
    """

    def __init__(self,
                 data_list,
                 batch_size=32,
                 input_width=24*3,
                 label_width=24*3,
                 shift=0,
                 fill_no=4,
                 miss_rate=0.2,
                 hint_rate=0.9,
                 normalize=True,
                 miss_pattern=None,
                 alpha=100.):
        """Prepare the table, the mask and the window start indices.

        Args:
            data_list: list of 2-D tables (arrays or DataFrames).
            batch_size: number of windows per batch.
            input_width: number of rows per window.
            label_width, shift, hint_rate, alpha: accepted but not used here.
            fill_no: passed to `interpolate` as `max_gap`.
            miss_rate: fraction of cells to hide.
            normalize: min-max normalise the concatenated table when True.
            miss_pattern: None draws an i.i.d. random mask; any other value
                replays the patterns stored in the 'save' directory.
        """
        super().__init__()
        window_size = input_width

        # interpolation of short gaps, one source table at a time
        filled_data = [interpolate(data, max_gap=fill_no) for data in data_list]

        # whole data
        self.data = np.concatenate(
            [np.asarray(frame, dtype=float) for frame in filled_data])

        # Number every stretch of consecutive NaN-free rows 1, 2, ... across
        # all tables. Rows that contain NaN get id 0 so that they never match
        # a sequence id. Plain NumPy is used so that the result stays an
        # integer array.
        last_cum = 0
        cums = []
        for frame in filled_data:
            isany = np.isnan(np.asarray(frame, dtype=float)).any(axis=1)
            if isany.size == 0:
                continue
            shifted = np.roll(isany, 1)
            shifted[0] = True  # treat the row before the table as missing
            start_seq = ~isany & shifted
            cum = start_seq.cumsum() + last_cum
            last_cum = int(cum.max())
            cum[isany] = 0
            cums.append(cum)

        # normalize (used for the spam data)
        self.norm_param = None
        if normalize:
            self.data, self.norm_param = normalization(self.data)

        # Define mask matrix
        self.miss_rate = miss_rate
        self.miss = None
        if miss_pattern is None:
            self.data_m = binary_sampler(1-miss_rate, self.data.shape)
        else:
            self.miss = MissData(load_dir='save')
            miss_data = self.miss.make_missdata(self.data, self.miss_rate)
            self.data_m = 1. - np.isnan(miss_data).astype(float)

        # sequence data: collect the start row of every window that fits
        # entirely inside one NaN-free stretch
        self.ids = np.concatenate(cums) if cums else np.empty(0, dtype=int)
        start_chunks = []
        for i in range(1, last_cum+1):
            member = self.ids == i
            seq_len = int(member.sum())
            start_id = int(np.argmax(member))
            time_len = seq_len - window_size + 1
            # np.arange is empty when the stretch is shorter than a window.
            start_chunks.append(np.arange(start_id, start_id+time_len))
        if start_chunks:
            data_idx = np.concatenate(start_chunks)
        else:
            data_idx = np.empty((0), dtype=int)

        # start index set for sequence data
        self.data_idx = data_idx
        self.input_width = input_width
        self.no = len(data_idx)

        self.batch_size = batch_size

        # random shuffling index and the cursor into it
        self.batch_idx = sample_batch_index(self.no, self.no)
        self.batch_id = 0
        self.shape = (batch_size,self.input_width)+self.data.shape[1:]

    def __len__(self):
        'Denotes the number of batches per epoch'
        # NOTE(review): one batch per epoch; looks deliberate (the training
        # cells rely on steps_per_epoch / repeat()) - confirm before changing.
        return 1

    def __getitem__(self, index):
        """Generate one batch; `index` is ignored, a shuffled cursor is used.

        Returns:
            (x, y): both (batch_size, input_width, n_features); `y` holds the
            complete windows and `x` the same windows with hidden cells as NaN.
        """
        if self.no == 0:
            raise ValueError('no window of length input_width fits into the data')

        n_features = self.data.shape[1]
        x = np.empty((self.batch_size, self.input_width, n_features))
        y = np.empty((self.batch_size, self.input_width, n_features))
        for cnt in range(0, self.batch_size):
            i = self.batch_idx[self.batch_id]
            self.batch_id += 1
            self.batch_id %= self.no
            if (self.batch_id == 0):
                # every window was served once: reshuffle
                self.batch_idx = sample_batch_index(self.no, self.no)
            idx1 = self.data_idx[i]
            idx2 = self.data_idx[i]+self.input_width

            Y_mb = self.data[idx1:idx2]
            M_mb = self.data_m[idx1:idx2]
            y[cnt] = Y_mb
            # Hidden cells are handed over as NaN; the model derives its mask
            # from them and injects its own noise.
            x[cnt] = np.where(M_mb == 0, np.nan, Y_mb)

        return x, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        return

dgen = GainDataGenerator(df)

In [27]:
it = iter(dgen)

In [28]:
x,y = next(it)

In [29]:
x.shape

(32, 72, 13)

In [30]:
class GAIN(keras.Model):
    """Generative Adversarial Imputation Network as a Keras model.

    Inputs are windows in which missing cells are NaN. Windows are flattened
    to vectors of `dim = prod(shape)` values. The generator receives the data
    (NaN replaced by small noise) together with the observed-mask; the
    discriminator receives the completed data together with a hint.

    `call` returns the generator output at the originally missing positions
    and NaN at the observed positions; `RMSE_loss` relies on that convention.

    NOTE: `save(save_dir)` below replaces `keras.Model.save` with a
    weights-only variant that has a different signature.
    """

    def __init__(self, shape, alpha=100., load=False, hint_rate=0.9, gen_sigmoid=True, **kwargs):
        """
        Args:
            shape: shape of one window without the batch axis.
            alpha: weight of the reconstruction term in the generator loss.
            load: accepted but not used.
            hint_rate: probability that an observed cell is revealed in the hint.
            gen_sigmoid: use a sigmoid on the generator output when True.
        """
        super(GAIN, self).__init__(**kwargs)
        self.shape = shape
        self.dim = np.prod(shape).astype(int)
        self.h_dim = self.dim
        self.gen_sigmoid = gen_sigmoid
        # Weight-free shape helpers, created once instead of on every call.
        self.flatten_layer = keras.layers.Flatten()
        self.reshape_layer = keras.layers.Reshape(tuple(shape))
        self.build_generator()
        self.build_discriminator()
        self.hint_rate = hint_rate
        self.alpha = alpha
        self.generator_optimizer = Adam()
        self.discriminator_optimizer = Adam()

    ## GAIN models
    def build_generator(self):
        """Create `self.generator`: (data, mask) -> imputed data, three dense layers."""
        last_activation = 'sigmoid' if self.gen_sigmoid else None
        xavier_initializer = tf.keras.initializers.GlorotNormal()

        x = Input(shape=(self.dim,), name='generator_input_x')
        m = Input(shape=(self.dim,), name='generator_input_m')

        a = Concatenate()([x, m])

        a = Dense(self.h_dim, activation='relu', kernel_initializer=xavier_initializer)(a)
        a = Dense(self.h_dim, activation='relu', kernel_initializer=xavier_initializer)(a)
        G_prob = Dense(self.dim, activation=last_activation, kernel_initializer=xavier_initializer)(a)
        self.generator = keras.models.Model([x, m], G_prob, name='generator')

    def build_discriminator(self):
        """Create `self.discriminator`: (data, hint) -> per-cell probability of being observed."""
        xavier_initializer = tf.keras.initializers.GlorotNormal()

        x = Input(shape=(self.dim,), name='discriminator_input_x')
        h = Input(shape=(self.dim,), name='discriminator_input_h')

        a = Concatenate()([x, h])

        a = Dense(self.h_dim, activation='relu', kernel_initializer=xavier_initializer)(a)
        a = Dense(self.h_dim, activation='relu', kernel_initializer=xavier_initializer)(a)
        D_prob = Dense(self.dim, activation='sigmoid', kernel_initializer=xavier_initializer)(a)
        self.discriminator = keras.models.Model([x, h], D_prob, name='discriminator')

    def call(self, inputs):
        """Impute the NaN cells of `inputs`.

        Returns a tensor shaped like one batch of windows that holds the
        generator output where `inputs` was NaN and NaN everywhere else.
        """
        if isinstance(inputs, tuple):
            inputs = inputs[0]
        x = self.flatten_layer(inputs)

        isnan = tf.math.is_nan(x)
        m = tf.where(isnan, 0., 1.)
        # Missing cells are fed to the generator as small uniform noise.
        z = keras.backend.random_uniform(shape=tf.shape(x), minval=0.0, maxval=0.01)
        x = tf.where(isnan, z, x)
        imputed_data = self.generator([x, m], training=False)
        # Keep only the imputed cells; observed cells are reported as NaN.
        imputed_data = tf.where(isnan, imputed_data, np.nan)
        return self.reshape_layer(imputed_data)

    @staticmethod
    def D_loss(M, D_prob):
        """Discriminator loss: cross-entropy between mask `M` and `D_prob`."""
        ## GAIN loss
        return -tf.reduce_mean(M * tf.keras.backend.log(D_prob + 1e-8) \
                         + (1-M) * tf.keras.backend.log(1. - D_prob + 1e-8))

    def G_loss(self, M, D_prob, X, G_sample):
        """Generator loss: adversarial term on missing cells plus `alpha` times
        the squared reconstruction error on observed cells."""
        G_loss_temp = -tf.reduce_mean((1-M) * tf.keras.backend.log(D_prob + 1e-8))
        MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / (tf.reduce_mean(M) + 1e-8)
        return G_loss_temp + self.alpha * MSE_loss

    @staticmethod
    def RMSE_loss(y_true, y_pred):
        """RMSE over the cells where `y_pred` is not NaN (the imputed cells).

        Returns 0 instead of NaN when there is no such cell.
        """
        isnan = tf.math.is_nan(y_pred)
        M = tf.where(isnan, 1., 0.)
        squared_error = tf.reduce_sum(tf.where(isnan, 0., y_pred-y_true)**2)
        return tf.sqrt(tf.math.divide_no_nan(squared_error, tf.reduce_sum(1-M)))

    def train_step(self, data):
        """One simultaneous update of generator and discriminator.

        Args:
            data: `(x, y)`; `x` holds NaN at hidden cells, `y` the full values.

        Returns:
            dict with 'gen_loss', 'disc_loss' and 'rmse' (imputation error on
            the hidden cells of this batch).
        """
        x, y = data
        X = self.flatten_layer(x)
        Y = self.flatten_layer(y)
        isnan = tf.math.is_nan(X)
        M = tf.where(isnan, 0., 1.)
        Z = keras.backend.random_uniform(shape=tf.shape(X), minval=0.0, maxval=0.01)
        # Hint: reveal each observed cell to the discriminator with
        # probability hint_rate.
        H_rand = keras.backend.random_uniform(shape=tf.shape(X), minval=0.0, maxval=1.)
        H_temp = tf.where(H_rand < self.hint_rate, 1., 0.)
        H = M * H_temp
        X = tf.where(isnan, Z, X)

        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            G_sample = self.generator([X, M], training=True)

            # Combine with observed data
            Hat_X = X * M + G_sample * (1-M)
            D_prob = self.discriminator([Hat_X, H], training=True)
            gen_loss = self.G_loss(M, D_prob, X, G_sample)
            disc_loss = tf.keras.backend.mean(tf.keras.losses.binary_crossentropy(M, D_prob))

        gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

        self.generator_optimizer.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
        self.discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))

        squared_error = tf.reduce_sum(tf.where(isnan, G_sample - Y, 0.)**2)
        rmse = tf.sqrt(tf.math.divide_no_nan(squared_error, tf.reduce_sum(1-M)))
        return {
                 'gen_loss':     gen_loss,
                 'disc_loss':    disc_loss,
                 'rmse':         rmse,
               }

    def save(self, save_dir='savedata'):
        """Write generator and discriminator weights into `save_dir`.

        The default directory now matches the default of `load` (it used to
        be 'savedta', so a default save followed by a default load failed).
        """
        os.makedirs(save_dir, exist_ok=True)
        disc_savefile = os.path.join(save_dir, 'discriminator.h5')
        gen_savefile = os.path.join(save_dir, 'generator.h5')
        self.discriminator.save_weights(disc_savefile)
        self.generator.save_weights(gen_savefile)

    def load(self, save_dir='savedata'):
        """Load weights written by `save`; report failure instead of raising.

        Returns:
            True when both weight files were loaded, False otherwise.
        """
        disc_savefile = os.path.join(save_dir, 'discriminator.h5')
        gen_savefile = os.path.join(save_dir, 'generator.h5')
        try:
            self.discriminator.load_weights(disc_savefile)
            self.generator.load_weights(gen_savefile)
        except (OSError, ValueError) as err:
            # Missing/unreadable file or incompatible weight shapes.
            print('model loading error:', err)
            return False
        print('model weights loaded')
        return True

## spam data gain 학습 테스트

In [31]:
# Load the spam table and wrap it in a generator that serves single-row windows.
df_spam = pd.read_csv('data/spam.csv')
dg_spam = GainDataGenerator(
    [df_spam],
    batch_size=128,
    input_width=1,
    label_width=1,
)

# Pull one batch as a smoke test and show the shapes.
it = iter(dg_spam)
x, y = next(it)
print(dg_spam.shape)
(x.shape, y.shape)

(128, 1, 57)


((128, 1, 57), (128, 1, 57))

In [41]:
model = GAIN(shape=dg_spam.shape[1:])
model.compile(loss=GAIN.RMSE_loss)

In [42]:
model.fit(dg_spam, batch_size=128, epochs=10)
#model.fit(x, y, batch_size=128)
#model.fit(dg_spam, batch_size=4601, epochs=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f770683ebd0>

In [43]:
# Whole spam table as one evaluation set, shaped (rows, 1, features):
# y keeps every value, x has NaN wherever the generator's mask hides a cell.
m = dg_spam.data_m
y = dg_spam.data.reshape(dg_spam.data.shape[0], 1, dg_spam.data.shape[1])
x = np.where(m == 0, np.nan, dg_spam.data)
x = x.reshape(x.shape[0], 1, x.shape[1])
x.shape


#model.fit(x,y)

(4601, 1, 57)

In [44]:
#model.load()

**spam data rmse 측정**

In [45]:
print(y.shape)
ret = model.evaluate(x, y)
print(ret)

(4601, 1, 57)
0.4257941246032715


In [46]:
# Use every row of the evaluation arrays; the previous slice hard-coded the
# spam table's row count (4601).
x_input = x
y_true = y
y_pred = model.predict(x_input)

# The model returns NaN at observed cells, so the non-NaN cells are exactly
# the imputed ones; only those contribute to the error.
isnan = np.isnan(y_pred)
diff = np.where(isnan, 0., y_pred - y_true)
m = isnan.astype(int)
n = np.sum(1-m)
rmse = np.sqrt(np.sum(diff**2)/float(n))
print('rmse =', rmse)

rmse = 0.42585734478225756


In [47]:
model.summary()

Model: "gain_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
generator (Functional)       (None, 57)                13167     
_________________________________________________________________
discriminator (Functional)   (None, 57)                13167     
Total params: 26,334
Trainable params: 26,334
Non-trainable params: 0
_________________________________________________________________


**spam data dataset으로 학습하기**

In [48]:
# Wrap the generator in an endlessly repeating, prefetching tf.data pipeline.
# Inputs and targets share the same dtype and batch shape.
ds = (
    tf.data.Dataset.from_generator(
        lambda: dg_spam,
        output_types=(tf.float32, tf.float32),
        output_shapes=(dg_spam.shape, dg_spam.shape),
    )
    .repeat(-1)
    .prefetch(10)
)

In [49]:
it = iter(ds)
x,y = next(it)
x.shape, y.shape

(TensorShape([128, 1, 57]), TensorShape([128, 1, 57]))

In [50]:
history = model.fit(ds, steps_per_epoch=10, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


**학습성능 측정(rmse)**

In [51]:
model.evaluate(ds, steps=50)



0.05563908442854881

**학습 그래프**

In [52]:
# Losses on the left axis, imputation RMSE on a second (right) axis.
fig = plt.figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()

ax.plot(history.history['gen_loss'], label='gen_loss')
ax.plot(history.history['disc_loss'], label='disc_loss')
ax.set_xlabel("epochs")
ax.set_ylabel("loss")
ax.legend(loc='upper center')

ax2.plot(history.history['rmse'], label='rmse', color='green')
ax2.set_ylabel("rmse")
ax2.legend(loc='upper right')

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# 수질 GAIN 데이터

**데이터 준비**

In [53]:
def make_dataset_gain(self, data):
  """Build a tf.data.Dataset backed by a GainDataGenerator.

  The generator is created with this window's input/label widths and is also
  stored on the window as ``self.dg`` so later cells can inspect it.

  NOTE(review): the ``data`` argument is not used; the generator is built from
  the module-level ``df`` instead. Confirm that this is intentional.

  Returns:
    A dataset yielding pairs of float32 tensors, both declared with the
    generator's ``shape``.
  """
  generator = GainDataGenerator(
      df,
      input_width=self.input_width,
      label_width=self.label_width,
      batch_size=128,
      normalize=False,
      miss_pattern=True,
      miss_rate=0.2,
      fill_no=2,
  )
  self.dg = generator

  pair_shapes = (generator.shape, generator.shape)
  return tf.data.Dataset.from_generator(
      lambda: generator,
      output_types=(tf.float32, tf.float32),
      output_shapes=pair_shapes,
  )

# Monkey-patch WindowGenerator so its make_dataset is the GAIN-based builder.
WindowGenerator.make_dataset = make_dataset_gain

In [54]:
# The train, validation and test names all point at the same frame (df_all).
# NOTE(review): validation/test figures are therefore not measured on held-out data.
train_df = df_all
val_df = df_all
test_df = df_all

In [55]:
# 72-step window (24*3) with shift=0, so inputs and labels cover the same indices.
wide_window = WindowGenerator(
    input_width=24*3, label_width=24*3, shift=0,
    #label_columns=['T (degC)']
)

wide_window

Total window size: 72
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
Label indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71]
Label column name(s): None

In [56]:
def plot2(self, model=None, plot_col='T (degC)', max_subplots=3):
  """Plot inputs, labels and optional model output for the example batch.

  One subplot row is drawn per batch sample, up to ``max_subplots`` rows.

  Args:
    model: optional callable applied to the example inputs; its output is
      drawn as 'Predictions'. Skipped when None.
    plot_col: name of the column to draw, looked up in ``self.column_indices``.
    max_subplots: upper bound on the number of samples (rows) drawn.
  """
  inputs, labels = self.example
  plt.figure(figsize=(9, 8))
  plot_col_index = self.column_indices[plot_col]
  max_n = min(max_subplots, len(inputs))

  # The label column does not depend on the sample, so resolve it once.
  if self.label_columns:
    label_col_index = self.label_columns_indices.get(plot_col, None)
  else:
    label_col_index = plot_col_index

  # Run the model once for the whole batch, and only if its output is drawn.
  predictions = None
  if model is not None and label_col_index is not None and max_n > 0:
    predictions = model(inputs)

  for n in range(max_n):
    # Size the grid by the rows actually drawn; a fixed 3-row grid would
    # reject the 4th subplot whenever max_subplots > 3.
    plt.subplot(max_n, 1, n+1)
    plt.ylabel(f'{plot_col} [normed]')
    plt.plot(self.input_indices, inputs[n, :, plot_col_index],
             label='Inputs', marker='.', zorder=-10)

    if label_col_index is None:
      continue

    plt.scatter(self.label_indices, labels[n, :, label_col_index],
                edgecolors='k', label='Labels', c='#2ca02c', s=64)
    if predictions is not None:
      plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                  marker='X', edgecolors='k', label='Predictions',
                  c='#ff7f0e', s=64)

    if n == 0:
      plt.legend()

  plt.xlabel('Time [h]')

#WindowGenerator.plot = plot2

In [57]:
df[1]

Unnamed: 0,수온,수소이온농도,전기전도도,용존산소,탁도,총유기탄소,총질소,총인,클로로필-a,Day sin,Day cos,Year sin,Year cos
0,-1.469739,-0.070664,-1.653523,0.791895,-0.513758,-1.270012,-0.218354,-1.314584,-0.503258,-9.999715e-01,-9.999715e-01,-0.006269,1.415570
1,-1.469739,-0.070664,-1.653523,0.740174,-0.435692,-1.270012,-0.012435,-1.048260,-0.490810,-1.224710e+00,-7.070866e-01,-0.005255,1.415574
2,-1.469739,-0.070664,-1.653523,0.740174,-0.435692,-1.270012,0.337989,-1.048260,-0.478362,-1.365986e+00,-3.660150e-01,-0.004242,1.415577
3,-1.456182,-0.070664,-1.751507,0.740174,-0.435692,-1.270012,0.388566,-0.781935,-0.453465,-1.414173e+00,-3.519073e-12,-0.003229,1.415580
4,-1.456182,-0.070664,-1.751507,0.740174,-0.435692,-1.270012,0.121232,-1.048260,-0.441017,-1.365986e+00,3.660150e-01,-0.002215,1.415582
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,-1.225713,1.023062,-0.869649,0.843615,-0.513758,-0.480013,-0.359246,-0.781935,,7.070866e-01,-1.224710e+00,-0.017232,1.415479
8756,-1.225713,1.023062,-0.771665,0.895336,-0.513758,-0.480013,-0.391759,-0.781935,,3.660150e-01,-1.365986e+00,-0.016219,1.415491
8757,-1.239270,1.023062,-0.771665,0.895336,-0.513758,-0.480013,-0.261705,-1.048260,,3.576374e-12,-1.414173e+00,-0.015206,1.415502
8758,-1.239270,1.023062,-0.673680,0.895336,-0.513758,-0.480013,-0.297831,-0.781935,,-3.660150e-01,-1.365986e+00,-0.014193,1.415512


In [58]:
# Plot the '총질소' column of the example window; no model is passed.
wide_window.plot(plot_col='총질소')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [59]:
# Map of missing values (NaN -> 1) for the first 7 columns of norm_data.
# The rows are cut into 8 consecutive segments drawn side by side.
isnan = np.isnan(norm_data).astype(int)
data = isnan
n = len(data)
seq_len = n // 8

plt.figure(figsize=(9, 10))
for i in range(8):
    plt.subplot(1, 8, i + 1)
    plt.imshow(data[seq_len * i:seq_len * (i + 1), :7], aspect='auto')
    plt.yticks([])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [60]:
# Same 8-segment layout for the generator's data_m array (first 7 columns).
# NOTE(review): data_m looks like the observed/missing mask; confirm against GainDataGenerator.
n = wide_window.dg.data_m.shape[0]
train = n // 8

plt.figure(figsize=(9, 10))
for i in range(8):
    plt.subplot(1, 8, i + 1)
    plt.imshow(wide_window.dg.data_m[train * i:train * (i + 1), :7], aspect='auto')
    plt.yticks([])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 컴파일 및 학습

In [61]:
# Containers for evaluation results, filled in after training.
val_performance = {}
performance = {}

In [62]:
# Build the GAIN model from the generator's shape without its leading axis,
# then compile it with the GAIN RMSE loss.
gain = GAIN(shape=wide_window.dg.shape[1:], gen_sigmoid=False)
gain.compile(loss=GAIN.RMSE_loss)

In [63]:
MAX_EPOCHS = 300

def compile_and_fit(model, window, patience=10):
  """Compile ``model`` with the GAIN RMSE loss and fit it on ``window``.

  Training runs for at most MAX_EPOCHS epochs on ``window.train`` with
  ``window.val`` as validation data, and stops early once ``val_loss`` has not
  improved for ``patience`` epochs.

  Returns:
    The History object returned by ``model.fit``.
  """
  model.compile(loss=GAIN.RMSE_loss)

  stop_on_plateau = tf.keras.callbacks.EarlyStopping(
      monitor='val_loss',
      patience=patience,
      mode='min',
  )

  return model.fit(
      window.train,
      epochs=MAX_EPOCHS,
      validation_data=window.val,
      callbacks=[stop_on_plateau],
  )

In [64]:
# Train with early-stopping patience set to a fifth of MAX_EPOCHS, then store
# the validation and test evaluation results under the 'Gain' key.
history = compile_and_fit(gain, wide_window, patience=MAX_EPOCHS//5)


val_performance['Gain'] = gain.evaluate(wide_window.val)
performance['Gain'] = gain.evaluate(wide_window.test, verbose=0)


#early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
#                                                    patience=2,
#                                                    mode='min')
#gain.compile()


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300


Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300


Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300


Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300


**학습 loss history 출력**

In [65]:
# Training curves: generator/discriminator losses on the left axis,
# training RMSE and validation loss on a twin axis on the right.
fig, ax = plt.subplots()
ax2 = ax.twinx()

ax.set_xlabel("epochs")
ax.set_ylabel("loss")
ax.plot(history.history['gen_loss'], label='gen_loss')
ax.plot(history.history['disc_loss'], label='disc_loss')
ax.legend(loc='upper center')

ax2.set_ylabel("rmse")
ax2.plot(history.history['rmse'], label='rmse', color='green')
ax2.plot(history.history['val_loss'], label='val_loss', color='red')
ax2.legend(loc='upper right')

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

성능 측정

In [66]:
# Evaluate over 100 batches drawn from the repeated test dataset.
gain.evaluate(wide_window.test.repeat(), steps=100)



0.21983663737773895

샘플 prediction 출력

In [67]:
# Plot the '클로로필-a' column of the example window together with the GAIN model's output.
wide_window.plot(gain, plot_col='클로로필-a')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 학습데이터 테스트

In [68]:
# Rebuild model inputs from the generator's own arrays: entries whose mask value
# is 0 are replaced by NaN in x, while y keeps the unmasked values.
total_n = wide_window.dg.data.shape[0]
print(total_n)
unit_shape = wide_window.dg.shape[1:]
print(unit_shape)
dim = np.prod(wide_window.dg.shape[1:]).astype(int)
print(dim)
# NOTE(review): n is rounded down to a multiple of dim (all elements of one
# window) although it is used as a row count below. A multiple of the window
# length unit_shape[0] would already make the reshape valid, so this drops more
# trailing rows than necessary. Confirm intent.
n = (total_n//dim)*dim
print(n)
x = wide_window.dg.data[0:n].copy()
y = wide_window.dg.data[0:n].copy()
m = wide_window.dg.data_m[0:n]
x[m == 0] = np.nan
print('x.shape =', x.shape)
# Cut the flat (rows, features) arrays into consecutive, non-overlapping windows.
x = x.reshape((-1,)+unit_shape)
y_true = y.reshape((-1,)+unit_shape)
print('x.shape =', x.shape)

17520
(72, 13)
936
16848
x.shape = (16848, 13)
x.shape = (234, 72, 13)


In [69]:
# Run the GAIN model on the windowed input x.
y_pred = gain.predict(x)

In [70]:
# Flatten the windows back to one row per time step.
# NOTE(review): 13 is a hard-coded column count; it presumably equals unit_shape[1].
y_pred = y_pred.reshape((n, 13))
x = x.reshape((n, 13))

In [71]:
x.shape

(16848, 13)

In [72]:
# Overlay the model input and the model output for column index 8.
# NOTE(review): index 8 is presumably '클로로필-a' if the column order matches df; confirm.
plt.figure()
plt.plot(x[:, 8])
plt.plot(y_pred[:, 8])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 원본 데이터 테스트

In [73]:
# Stack the frames in df row-wise into a single frame.
norm_df = pd.concat(df,axis=0)

In [74]:
# Prepare windows from the concatenated frame (norm_df) rather than from the
# generator's masked copy; x keeps whatever NaNs the frame already contains.
total_n = wide_window.dg.data.shape[0]
print(total_n)
unit_shape = wide_window.dg.shape[1:]
print(unit_shape)
dim = np.prod(wide_window.dg.shape[1:]).astype(int)
print(dim)
# Derive n before it is used for slicing, so this cell does not depend on a
# value of n left behind by an earlier cell.
n = (total_n//dim)*dim

data = norm_df.to_numpy()
x = data[0:n].copy()
y_true = data[0:n].copy()
# Missing readings are already NaN in x, so no re-assignment is needed; the
# boolean mask is kept for inspection.
isnan = np.isnan(x)

print('x.shape =', x.shape)
# Cut the flat (rows, features) array into consecutive, non-overlapping windows.
x_reshape = x.reshape((-1,)+unit_shape)
print('x_reshape.shape =', x_reshape.shape)

17520
(72, 13)
936
x.shape = (16848, 13)
x_reshape.shape = (234, 72, 13)


In [75]:
# Run the GAIN model on the windows cut from the original data.
y_pred = gain.predict(x_reshape)

In [76]:
# Flatten the predictions back to the (rows, features) layout of y_true.
y_pred = y_pred.reshape(y_true.shape)
y_pred.shape

(16848, 13)

In [77]:
# One stacked panel per column for the first 8 columns:
# original series (with gaps) against the model output.
n = 8
plt.figure(figsize=(9, 20))
for i in range(n):
    plt.subplot(n, 1, i + 1)
    plt.plot(x[:, i])
    plt.plot(y_pred[:, i])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## 연습 섹션

In [78]:
# Take one (x, y) batch from the validation dataset.
it = iter(wide_window.val)
x,y = next(it)

In [79]:
x.shape, y.shape

(TensorShape([128, 72, 13]), TensorShape([128, 72, 13]))

In [80]:
# Fit the gain model for 20 epochs with no callbacks (no early stopping).
history = gain.fit(wide_window.train, epochs=20,
                      validation_data=wide_window.val,
                      callbacks=[])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [81]:
# Take one (x, y) batch from the validation dataset and show its shapes.
it = iter(wide_window.val)
x,y = next(it)
x.shape, y.shape

(TensorShape([128, 72, 13]), TensorShape([128, 72, 13]))

In [82]:
# Predict on the validation batch.
# NOTE(review): the recorded output contains NaN entries; confirm whether the
# model is expected to emit NaN here.
gain.predict(x)

array([[[        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        ...,
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan, -0.70629   , ...,         nan,
                 nan,         nan],
        [        nan,         nan, -0.5684247 , ...,         nan,
                 nan,         nan]],

       [[-0.47959065, -1.3098589 ,  0.50830597, ...,         nan,
                 nan,         nan],
        [-0.59616673, -1.345786  ,  0.6714608 , ...,         nan,
                 nan,         nan],
        [-0.46581385, -1.5985745 ,  0.6721713 , ...,         nan,
                 nan,         nan],
        ...,
        [        nan,         nan,         nan, ...,  

In [83]:
# Count missing values per column in the first frame.
df[0].isna().astype(int).sum()

수온           577
수소이온농도       575
전기전도도        580
용존산소         603
탁도           860
총유기탄소        745
총질소          756
총인          1831
클로로필-a       507
Day sin        0
Day cos        0
Year sin       0
Year cos       0
dtype: int64

In [84]:
# Parse the timestamp column (first column) of each raw frame.
date_time1 = pd.to_datetime(df_full[0].iloc[:, 0], format='%Y.%m.%d %H:%M')
# The second series must come from the second frame; it previously re-read
# df_full[0], so df[1] received the first file's timestamps.
date_time2 = pd.to_datetime(df_full[1].iloc[:, 0], format='%Y.%m.%d %H:%M')

In [85]:
# Convert every parsed datetime to a POSIX timestamp in seconds.
# NOTE(review): for naive datetimes, datetime.timestamp() interprets the value
# in the machine's local time zone.
timestamp_s1 = date_time1.map(datetime.datetime.timestamp)
timestamp_s2 = date_time2.map(datetime.datetime.timestamp)

In [86]:
# Period lengths in seconds.
day = 24*60*60
year = (365.2425)*day

# Encode time of day and time of year as sin/cos pairs on each frame.
for frame, seconds in ((df[0], timestamp_s1), (df[1], timestamp_s2)):
    for label, period in (('Day', day), ('Year', year)):
        angle = seconds * (2 * np.pi / period)
        frame[label + ' sin'] = np.sin(angle)
        frame[label + ' cos'] = np.cos(angle)

In [87]:
class CustomModel(tf.keras.Model):
    """Keras model with a hand-written ``train_step``.

    The step mirrors the default supervised update (forward pass, compiled
    loss, gradient step, metric update) and additionally prints the shape of
    the incoming inputs, which is what this notebook section inspects.

    The base class is referenced through ``tf.keras`` so the definition does not
    depend on a separate ``keras`` name being imported beforehand.
    """

    def train_step(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: the ``(x, y)`` pair handed over by ``fit()``.

        Returns:
            Dict mapping each metric name to its current value.
        """
        # Show the input shape as seen inside train_step.
        print(data[0].shape)
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

In [88]:
import numpy as np

# Construct and compile an instance of CustomModel.
# Keras is reached through `tf.keras` so this cell does not rely on a bare
# `keras` name being imported elsewhere.
inputs = tf.keras.Input(shape=(32,))
outputs = tf.keras.layers.Dense(1)(inputs)
model = CustomModel(inputs, outputs)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Just use `fit` as usual
x = np.random.random((1000, 32))
y = np.random.random((1000, 1))
model.fit(x, y, epochs=3)

Epoch 1/3
(None, 32)
(None, 32)
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7674278750>

In [89]:
# Build a dataset of individual (x, y) samples sliced from the arrays.
ds = tf.data.Dataset.from_tensor_slices((x,y))

In [90]:
ds.element_spec


(TensorSpec(shape=(32,), dtype=tf.float64, name=None),
 TensorSpec(shape=(1,), dtype=tf.float64, name=None))

In [91]:
# Group samples into batches of 5 and show the resulting element spec.
ds = ds.batch(5)
ds.element_spec

(TensorSpec(shape=(None, 32), dtype=tf.float64, name=None),
 TensorSpec(shape=(None, 1), dtype=tf.float64, name=None))

In [92]:
# Fit on the batched dataset with fit()'s default settings.
model.fit(ds)

(None, 32)


<tensorflow.python.keras.callbacks.History at 0x7f76e40d9a50>

# MNIST with data generator

https://towardsdatascience.com/keras-custom-data-generators-example-with-mnist-dataset-2a7a2d2b0360


In [93]:
import tensorflow as tf
import os
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
import numpy as np
import math

In [94]:
# Load the MNIST train/test arrays through the Keras dataset helper.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [95]:
class DataGenerator(tf.compat.v2.keras.utils.Sequence):
    """Keras Sequence that serves image batches scaled to [0, 1].

    Each batch is built from ``X_data`` (pixel values divided by 255, cast to
    float32, with a trailing channel axis added) and, when ``to_fit`` is true,
    one-hot labels built from ``y_data``.

    Args:
        X_data: indexable collection of samples, each of shape ``dim``.
        y_data: indexable collection of integer class labels.
        batch_size: number of samples per batch; the last batch may be smaller.
        dim: shape of a single sample (without batch/channel axes).
        n_classes: number of classes used for one-hot encoding.
        to_fit: when true, batches are ``(X, y)`` pairs; otherwise only ``X``.
        shuffle: when true, sample order is reshuffled by ``on_epoch_end``.
    """

    def __init__(self, X_data , y_data, batch_size, dim, n_classes,
                 to_fit, shuffle = True):
        self.batch_size = batch_size
        self.X_data = X_data
        self.labels = y_data
        self.y_data = y_data
        self.to_fit = to_fit
        self.n_classes = n_classes
        self.dim = dim
        self.shuffle = shuffle
        self.n = 0  # index of the next batch returned by __next__
        self.list_IDs = np.arange(len(self.X_data))
        self.on_epoch_end()

    def __next__(self):
        """Return the next batch, wrapping around after the last one."""
        # Get one batch of data
        data = self.__getitem__(self.n)
        # Batch index
        self.n += 1

        # If we have processed the entire dataset then start a new pass.
        if self.n >= self.__len__():
            # Must be called; a bare attribute reference never reshuffles.
            self.on_epoch_end()
            self.n = 0

        return data

    def __len__(self):
        """Return the number of batches per pass (last one may be partial)."""
        return math.ceil(len(self.indexes)/self.batch_size)

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)`` or just ``X``."""
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:
            (index+1)*self.batch_size]
        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        X = self._generate_x(list_IDs_temp)

        if self.to_fit:
            y = self._generate_y(list_IDs_temp)
            return X, y
        else:
            return X

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.X_data))

        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _generate_x(self, list_IDs_temp):
        """Stack the selected samples and scale them to float32 in [0, 1]."""
        # Size by the IDs actually requested so a short last batch carries no
        # uninitialised rows.
        X = np.empty((len(list_IDs_temp), *self.dim))

        for i, ID in enumerate(list_IDs_temp):
            X[i,] = self.X_data[ID]

        # Normalize once, after all samples are in place. Doing this inside the
        # loop divided already-stored samples by 255 again on every iteration.
        X = (X/255).astype('float32')

        # Append the channel axis.
        return X[..., np.newaxis]

    def _generate_y(self, list_IDs_temp):
        """Return one-hot encoded labels for the selected samples."""
        # Same sizing rule as _generate_x: no uninitialised labels.
        y = np.empty(len(list_IDs_temp))

        for i, ID in enumerate(list_IDs_temp):
            y[i] = self.y_data[ID]

        return keras.utils.to_categorical(
                y,num_classes=self.n_classes)

In [96]:
n_classes = 10
input_shape = (28, 28)

# Small CNN classifier: two conv layers, max-pooling, dropout, then a dense head.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28 , 1)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax'),
])
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adadelta(),
    metrics=['accuracy'],
)

In [97]:
# Batch generators over the MNIST train and test arrays (64 samples per batch,
# shuffled, yielding (X, y) pairs).
train_generator = DataGenerator(
    x_train, y_train,
    batch_size=64, dim=input_shape, n_classes=10,
    to_fit=True, shuffle=True,
)
val_generator = DataGenerator(
    x_test, y_test,
    batch_size=64, dim=input_shape, n_classes=n_classes,
    to_fit=True, shuffle=True,
)

In [98]:
# Number of batches per pass, as reported by each generator's __len__.
steps_per_epoch = len(train_generator)
validation_steps = len(val_generator)

In [99]:
# Train for one epoch on the training generator, validating on val_generator.
model.fit(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=1,
        validation_data=val_generator,
        validation_steps=validation_steps)





<tensorflow.python.keras.callbacks.History at 0x7f7674178c90>

In [100]:
# Iterate the generator directly to inspect what a batch looks like.
it = iter(train_generator)

In [101]:
# Fetch one (X, y) batch from the generator.
x,y = next(it)

In [102]:
x.shape

(64, 28, 28, 1)

In [103]:
y.shape

(64, 10)

## MNIST with custom model

In [104]:
# Same CNN as the Sequential version, written with the functional API so it can
# be wrapped in CustomModel (whose train_step prints the input shape).
n_classes = 10
input_shape = (28, 28, 1)
input_data = keras.layers.Input(shape=input_shape)
# NOTE: `x` is reused as the running tensor through the layer chain.
x = Conv2D(32, kernel_size=(3, 3),
                 activation='relu')(input_data)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output_data = Dense(n_classes, activation='softmax')(x)
model = CustomModel(input_data, output_data)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [105]:
# Train the CustomModel for one epoch on the generators; the shapes printed by
# train_step appear in the output.
model.fit(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=1,
        validation_data=val_generator,
        validation_steps=validation_steps)

(None, None, None, None)
(None, None, None, None)


<tensorflow.python.keras.callbacks.History at 0x7f767410a6d0>

결론: DataGenerator만으로는 train_step에 들어오는 input data의 shape이 None으로 전달된다.

```py
        X = keras.layers.Reshape((tf.reduce_sum(x.shape[1:]),))(x)
        Y = keras.layers.Reshape((tf.reduce_sum(x.shape[1:]),))(y)
        X = tf.reshape(x, shape=(x.shape[0], -1))
        Y = tf.reshape(y, shape=(x.shape[0], -1))
```

이런 함수들은 train_step 내에서 사용할 수 없다.

# 한글 폰트

In [106]:


import matplotlib
import matplotlib.font_manager

# List the file path of every TrueType font known to matplotlib's font manager.
[f.fname for f in matplotlib.font_manager.fontManager.ttflist]



['/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/cmsy10.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans-Bold.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSansMono-Bold.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/STIXSizFourSymReg.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/STIXSizThreeSymReg.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/STIXSizThreeSymBol.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/STIXGeneral.ttf',
 '/home/kotech/workspace/venv-tensor2n-gpu/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/STIXNonUniBol.ttf',
 '/home/kotech/work

In [107]:
! fc-list :lang=ko

/usr/share/fonts/truetype/nanum/NanumSquareRoundB.ttf: 나눔스퀘어라운드,NanumSquareRound,NanumSquareRound Bold,나눔스퀘어라운드 Bold:style=Bold,Regular
/usr/share/fonts/opentype/noto/NotoSerifCJK-Bold.ttc: Noto Serif CJK SC:style=Bold
/usr/share/fonts/opentype/noto/NotoSerifCJK-Bold.ttc: Noto Serif CJK TC:style=Bold
/usr/share/fonts/opentype/noto/NotoSansCJK-Black.ttc: Noto Sans CJK HK,Noto Sans CJK HK Black:style=Black,Regular
/usr/share/fonts/opentype/noto/NotoSerifCJK-Bold.ttc: Noto Serif CJK JP:style=Bold
/usr/share/fonts/opentype/noto/NotoSerifCJK-Bold.ttc: Noto Serif CJK KR:style=Bold
/usr/share/fonts/truetype/nanum/NanumSquareRoundR.ttf: 나눔스퀘어라운드,NanumSquareRound,NanumSquareRound Regular,나눔스퀘어라운드 Regular:style=Regular
/usr/share/fonts/truetype/nanum/NanumSquareB.ttf: 나눔스퀘어,NanumSquare,NanumSquare Bold,나눔스퀘어 Bold:style=Bold
/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc: Noto Sans CJK JP:style=Regular
/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc: Noto Sans CJK HK:style

In [108]:
import matplotlib
import matplotlib.font_manager as fm
# The former `fm.get_fontconfig_fonts()` call was dropped: its result was
# discarded, and the function is deprecated/removed in newer matplotlib.
# Path of a Korean-capable font file; adjust for the local machine.
font_location = '/usr/share/fonts/truetype/nanum/NanumGothicCoding.ttf'
#font_location = '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc'
# font_location = 'C:/Windows/Fonts/NanumGothic.ttf' # For Windows
# Font properties object passed explicitly to titles/legends that show Korean text.
fprop = fm.FontProperties(fname=font_location)

In [109]:
# Smoke test for Korean glyph rendering in the title and legend via fprop.
fig, ax = plt.subplots()
ax.plot((1,1), label='가-가가')
ax.set_title('가가가', fontproperties=fprop)
ax.legend(prop=fprop)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …