# Data Exploration


In [1]:
# Clone the TimeGAN repository; later cells read the CSV data from the resulting TimeGAN/ directory
!git clone https://github.com/jsyoon0823/TimeGAN

Cloning into 'TimeGAN'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 77 (delta 29), reused 25 (delta 25), pack-reused 37[K
Receiving objects: 100% (77/77), 1.29 MiB | 3.21 MiB/s, done.
Resolving deltas: 100% (32/32), done.


In [2]:
import pandas as pd

# Path to the stock price CSV inside the cloned TimeGAN repository
# (absolute path; the /content prefix suggests Google Colab — adjust if the repo lives elsewhere)
file_path = '/content/TimeGAN/data/stock_data.csv'

# Read the CSV file into a DataFrame
stock = pd.read_csv(file_path)

# Print the first rows as a quick sanity check of the column names and values
print(stock.head())



        Open       High        Low      Close  Adj_Close    Volume
0  49.676899  51.693783  47.669952  49.845802  49.845802  44994500
1  50.178635  54.187561  49.925285  53.805050  53.805050  23005800
2  55.017166  56.373344  54.172661  54.346527  54.346527  18393200
3  55.260582  55.439419  51.450363  52.096165  52.096165  15361800
4  52.140873  53.651051  51.604362  52.657513  52.657513   9257400


In [3]:
# stock.shape is a (rows, columns) tuple
print("The number of rows and columns in the data are:", stock.shape)

The number of rows and columns in the data are: (3685, 6)


In [5]:
# Boolean mask flagging every missing cell of the frame
null_values = stock.isnull()

# Per-column number of missing entries, obtained by summing the mask
null_count = null_values.sum()

# Report the per-column totals
print("\nCount of null values in each column:")
print(null_count)




Count of null values in each column:
Open         0
High         0
Low          0
Close        0
Adj_Close    0
Volume       0
dtype: int64


# Preprocess: Dataloading.py

In [6]:

import numpy as np


Normalisation of data

In [7]:
def MinMaxScaler(data):  # used to normalise the data
  """Rescale every column of ``data`` with min-max normalisation.

  Args:
    - data: original data; minimum and maximum are taken along axis 0

  Returns:
    - norm_data: (data - column minimum) / (column range + 1e-7)
  """
  column_min = np.min(data, 0)
  column_range = np.max(data, 0) - column_min
  # The small constant keeps the division finite when a column is constant.
  return (data - column_min) / (column_range + 1e-7)



In [8]:
# Apply the min-max scaling to every column of the stock frame and preview the first rows
normalised_stock= MinMaxScaler(stock)
print(normalised_stock.head())

       Open      High       Low     Close  Adj_Close    Volume
0  0.000329  0.000942  0.000000  0.000135   0.000135  0.543578
1  0.000740  0.002981  0.001877  0.003383   0.003383  0.277886
2  0.004700  0.004767  0.005413  0.003828   0.003828  0.222151
3  0.004900  0.004004  0.003147  0.001981   0.001981  0.185523
4  0.002346  0.002542  0.003275  0.002442   0.002442  0.111763


### Slice and shuffle data before loading
**Independent and Identically Distributed** (i.i.d.) Assumption: Many machine learning models assume that the training data is i.i.d. Shuffling the data helps to approximate this condition.

**Prevent Overfitting:** Shuffling ensures that the model sees a diverse set of patterns in each batch, which helps in preventing overfitting to specific sequences.

**Better Generalization:** By mixing the sequences, the model is less likely to memorize the order of the data and more likely to learn generalizable patterns.

In [9]:
def real_data_loading (data_name, seq_len, data_dir = '/content/TimeGAN/data'): # to shuffle the dataset
  """Load a CSV dataset, normalise it and cut it into shuffled windows.

  Args:
    - data_name: stock or energy
    - seq_len: sequence length (number of consecutive rows in every window)
    - data_dir: directory that holds '<data_name>_data.csv'; the default is
      the data folder of the TimeGAN clone used by this notebook

  Returns:
    - data: list of row-slices of length seq_len, in random order

  Raises:
    - AssertionError: if data_name is neither 'stock' nor 'energy'
  """
  assert data_name in ['stock','energy']

  # Both datasets follow the same '<name>_data.csv' naming scheme, so a single
  # path template resolves them from the same directory.
  ori_data = np.loadtxt(f'{data_dir}/{data_name}_data.csv', delimiter = ",", skiprows = 1)

  # Reverse the row order (assumes the CSV is stored newest-first — TODO confirm)
  ori_data = ori_data[::-1]
  # Normalize the data
  ori_data = MinMaxScaler(ori_data)

  # Cut data by sequence length: overlapping windows with stride 1.
  # The range stops at len(ori_data) - seq_len, so the window that would end
  # on the very last row is not produced.
  temp_data = [ori_data[i:i + seq_len] for i in range(0, len(ori_data) - seq_len)]

  # Mix the windows (to make it similar to i.i.d)
  idx = np.random.permutation(len(temp_data))
  data = [temp_data[i] for i in idx]

  return data

In [10]:

# Build the shuffled windows of 24 time steps from the stock dataset
original_data = real_data_loading("stock", 24)



In [11]:
# Column index 1 of the first window in the shuffled list
original_data[0][:,1]

array([0.47147531, 0.47279136, 0.47166331, 0.47897112, 0.4818403 ,
       0.49075029, 0.48016458, 0.48077765, 0.47028185, 0.47021649,
       0.46889221, 0.47082138, 0.46247542, 0.45197962, 0.46525469,
       0.46086511, 0.45965528, 0.47840709, 0.47928993, 0.48477489,
       0.47506384, 0.4634073 , 0.4485955 , 0.48188116])

In [12]:
# Hard-coded list of 24 floats; it is not computed from the cells above
# (presumably pasted from the output of an earlier run — TODO confirm)
data = [0.19997616, 0.20317213, 0.20302168, 0.19481623, 0.18826976,
       0.18364251, 0.1816298 , 0.18290655, 0.18382956, 0.18653354,
       0.17825083, 0.17899088, 0.17729123, 0.14493301, 0.14036674,
       0.14594953, 0.14555512, 0.14836481, 0.14872669, 0.15042227,
       0.15201213, 0.15363451, 0.14549412, 0.14741334]

### Extracting the number of time points in the sliced sequences
Returns the number of time points in each sequence of the data, together with the maximum number of time points found in any single sequence.


In [14]:
def extract_time (data):
  """Measure the length of every sequence and find the longest one.

  Args:
    - data: original data; every element is indexed as [time, feature]

  Returns:
    - time: list holding the number of time steps of each sequence
    - max_seq_len: largest entry of time (0 when data is empty)
  """
  # The length of the first feature column is the number of time steps.
  time = [len(sequence[:,0]) for sequence in data]
  max_seq_len = max(time, default=0)
  return time, max_seq_len

In [16]:
import tensorflow as tf


In [18]:
def rnn_cell(module_name, hidden_dim):
  """Build one recurrent cell of the requested type.

  Args:
    - module_name: gru, lstm, or lstmLN
    - hidden_dim: number of units in the cell

  Returns:
    - rnn_cell: RNN Cell
  """
  assert module_name in ['gru','lstm','lstmLN']

  # NOTE(review): tf.nn.rnn_cell and tf.contrib look like TensorFlow 1.x
  # namespaces — confirm the TensorFlow version installed in the runtime.
  # Every variant uses tanh as activation; only the cell class differs.
  if module_name == 'gru':
    return tf.nn.rnn_cell.GRUCell(num_units=hidden_dim, activation=tf.nn.tanh)
  if module_name == 'lstm':
    return tf.contrib.rnn.BasicLSTMCell(num_units=hidden_dim, activation=tf.nn.tanh)
  # After the assert only 'lstmLN' (LSTM with layer normalization) remains.
  return tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=hidden_dim, activation=tf.nn.tanh)

The function below generates a batch of random vectors, where each vector has a specified sequence length (taken from T_mb) and a fixed dimensionality (z_dim). The total number of vectors generated is batch_size.

In [19]:
def random_generator (batch_size, z_dim, T_mb, max_seq_len):
  """Random vector generation.

  Args:
    - batch_size: number of random matrices to generate
    - z_dim: dimension of random vector
    - T_mb: time information for the random vector; T_mb[i] is the number of
      time steps filled with noise for sample i (must not exceed max_seq_len)
    - max_seq_len: maximum sequence length

  Returns:
    - Z_mb: list of batch_size arrays of shape (max_seq_len, z_dim); the first
      T_mb[i] rows of entry i hold uniform noise in [0, 1), the rest are zero
  """
  # Accumulate into one named list so the generated samples are actually returned.
  Z_mb = list()
  for i in range(batch_size):
    # Zero-padded container so every sample has the same shape.
    temp = np.zeros([max_seq_len, z_dim])
    temp_Z = np.random.uniform(0., 1, [T_mb[i], z_dim])
    temp[:T_mb[i],:] = temp_Z
    Z_mb.append(temp)
  return Z_mb

In [23]:
def batch_generator(data, time, batch_size):
  """Mini-batch generator.

  Args:
    - data: time-series data
    - time: time information
    - batch_size: the number of samples in each batch

  Returns:
    - X_mb: time-series data in each batch
    - T_mb: time information in each batch

  Samples are drawn without replacement; when batch_size exceeds len(data)
  the batch simply contains every sample once.
  """
  no = len(data)
  idx = np.random.permutation(no)
  train_idx = idx[:batch_size]

  # Use the same indices for both lists so each sample keeps its own length.
  X_mb = [data[i] for i in train_idx]
  T_mb = [time[i] for i in train_idx]

  return X_mb, T_mb

In [37]:
# Toy walk-through of the index-sampling steps used for mini-batches, on small lists.
# NOTE: this cell rebinds the global names data, time and batch_size.
data = [2, 4, 3, 4, 5]
time = [1,2,4,6,7]
batch_size = 2
no = len(data)
# Random ordering of the indices 0 .. no-1
idx = np.random.permutation(no)
print("idx", idx)
# The first batch_size shuffled indices select the batch
train_idx = idx[:batch_size]
print("train_idx", train_idx)
# Pick the same positions from both lists so values and times stay paired
X_mb= list(data[i] for i in train_idx)
T_mb = list(time[i] for i in train_idx)
X_mb, T_mb

idx [1 3 0 4 2]
train_idx [1 3]


([4, 4], [2, 6])