In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder
# Location of the HDF5 dataset inside the mounted Google Drive.
file_path = "/content/drive/MyDrive/dataset_ts_light_version.hdf5"
# Mount Google Drive (Colab) so that file_path can be opened by later cells.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Open the HDF5 file read-only, print every top-level entry with its shape,
# then copy the train / validation / test splits into memory while it is open.
with h5py.File(file_path, 'r') as f:
  for key, dataset in f.items():
    print(f"{key}: shape={dataset.shape}")

  # `[:]` reads the whole dataset, so the arrays stay usable after the
  # file is closed at the end of the `with` block.
  X_train, y_train = f['x_train'][:], f['y_train'][:]
  X_val, y_val = f['x_validation'][:], f['y_validation'][:]
  X_test, y_test = f['x_test'][:], f['y_test'][:]

x_test: shape=(7766, 48, 48, 3)
x_train: shape=(90601, 48, 48, 3)
x_validation: shape=(31063, 48, 48, 3)
y_test: shape=(7766,)
y_train: shape=(90601,)
y_validation: shape=(31063,)


## **Data Cleaning**

In [3]:
#function to check if any null values exist in labels or pixels
def is_valid(x, y, num_classes=43):
  """Scan a split for NaN pixels and out-of-range labels.

  Args:
    x: sequence of image arrays, one entry per sample.
    y: sequence of class labels, paired element-wise with x.
    num_classes: number of classes; a label is valid when
      0 <= label <= num_classes - 1. Defaults to 43, which reproduces the
      previously hard-coded upper bound of 42.

  Returns:
    Tuple (null_img, null_label) of Python bools: null_img is True if any
    image contains a NaN, null_label is True if any label is out of range.
    Despite the names, True means a problem WAS found.
  """
  max_label = num_classes - 1
  null_label = False
  null_img = False
  for img, label in zip(x, y):
    # A flag never resets once set, so skip its check afterwards; this avoids
    # running np.isnan over every remaining image after the first NaN is found.
    if not null_label and (label < 0 or label > max_label):
      null_label = True
    if not null_img and np.isnan(img).any():
      null_img = True
    if null_img and null_label:
      break  # both kinds of problem found; nothing more to learn
  return null_img, null_label


In [4]:
# Run the sanity check on every split; each printed line is the
# (null_img, null_label) pair returned by is_valid for train, val and test.
print(
  *(is_valid(images, labels)
    for images, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))),
  sep="\n",
)
# All three splits print (False, False): no NaN pixels, no out-of-range labels.

(False, False)
(False, False)
(False, False)


## **Pre-Processing**

In [5]:
#standardizing X's
# Training images are flattened to one row per image and each row is shifted
# to zero mean and scaled by its own standard deviation.
# This is done inline rather than through standardize() because X_train is the
# array that was too large to process via the function.
X = X_train.reshape(X_train.shape[0], -1)
del X_train
means = X.mean(axis=1, keepdims=True)
stds = X.std(axis=1, keepdims=True) + 1e-8  # epsilon avoids division by 0
# Subtract into a new float array, then divide in place: same values as
# (X - means) / stds, but only one full-size temporary is ever allocated.
X_standardized = X - means
X_standardized /= stds
# deleting numpy arrays to save RAM
del X, means, stds
def standardize(X):
  """Standardize every sample of X to zero mean and unit standard deviation.

  The mean and standard deviation are computed per sample over ALL axes except
  the first (batch) axis. For a 2-D array this is the usual row-wise
  standardization; for an image batch such as (N, H, W, C) each image is
  normalized as a whole, matching what is done to the flattened training rows.

  Args:
    X: NumPy array with the sample index on axis 0 and at least 2 dimensions.

  Returns:
    A new floating-point array with the same shape as X. X is not modified.

  Raises:
    ValueError: if X has fewer than 2 dimensions.
  """
  if X.ndim < 2:
    raise ValueError("standardize expects an array with at least 2 dimensions")
  sample_axes = tuple(range(1, X.ndim))  # every axis except the batch axis
  means = X.mean(axis=sample_axes, keepdims=True)
  stds = X.std(axis=sample_axes, keepdims=True) + 1e-8 # to avoid division by 0
  # Subtract into a fresh float array, then divide in place so only one
  # full-size temporary is allocated.
  X_standardized = X - means
  X_standardized /= stds
  return X_standardized
# Standardize the validation and test splits with standardize(); each result
# replaces the raw array under the same name, so the raw data is released
# as soon as its standardized version has been computed.
X_val = standardize(X_val)
X_test = standardize(X_test)

In [6]:
#one hot encoding y's
# The encoder is fitted ONCE, on the training labels. Validation and test
# labels are only transformed with that fitted mapping, so all three splits
# share the same columns in the same order. Re-fitting per split would let the
# column count and order depend on which classes happen to occur in that split.
# With the default handle_unknown='error', a val/test label that never occurs
# in the training labels raises instead of being silently mis-encoded.
encoder = OneHotEncoder(sparse_output = False)

y_train_encoded = encoder.fit_transform(y_train.reshape(-1,1))

y_val = encoder.transform(y_val.reshape(-1,1))

y_test = encoder.transform(y_test.reshape(-1,1))

In [7]:
# converting to tensors for the convolutional NN
# torch.from_numpy shares memory with the source array, so each `del` only
# drops the NumPy name; the data stays alive through the tensor.
# NOTE(review): `target` is built from the raw y_train labels, whereas
# target_val / target_test come from the one-hot encoded y_val / y_test, and
# y_train_encoded is not used here. Confirm this asymmetry is intended.
train, target = torch.from_numpy(X_standardized), torch.from_numpy(y_train)
del X_standardized, y_train

val, target_val = torch.from_numpy(X_val), torch.from_numpy(y_val)
del X_val, y_val

test, target_test = torch.from_numpy(X_test), torch.from_numpy(y_test)
del X_test, y_test