<a href="https://colab.research.google.com/github/rohaan2614/var_from_scratch/blob/master/var_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Prep

In [1]:
import numpy as np
import gzip
from urllib import request
from tqdm import tqdm
import os

In [2]:
# Constants
url = "https://ossci-datasets.s3.amazonaws.com/mnist/"
filenames = ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz']

In [3]:
# Create the "data" directory if it doesn't exist
os.makedirs("data", exist_ok=True)

In [4]:
# Download the dataset
for filename in tqdm(filenames, desc="Downloading Dataset"):
    request.urlretrieve(url + filename, os.path.join("data", filename))

Downloading Dataset: 100%|██████████| 4/4 [00:00<00:00,  4.02it/s]


In [5]:
# Read dataset
data = []
for filename in os.listdir("data"):
    filepath = os.path.join("data", filename)
    with gzip.open(filepath, 'rb') as f:
        if 'labels' in filename:
            # Load the labels as a one-dimensional array of integers
            data.append(np.frombuffer(f.read(), np.uint8, offset=8))
        else:
            # Load the images as a two-dimensional array of pixels
            data.append(np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28*28))

In [6]:
print("Data Length: ", len(data), "\ndata: ", data)

Data Length:  4 
data:  [array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), array([7, 2, 1, ..., 4, 5, 6], dtype=uint8), array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)]


In [7]:
# Split into training and testing sets
X_train, y_train, X_test, y_test = data

In [8]:
print("X_Train: ", X_train)
print("X_Test: ", X_test)

X_Train:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
X_Test:  [7 2 1 ... 4 5 6]


In [9]:
# Normalize the pixel values
X_train = X_train.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

In [10]:
print("X_Train: ", X_train)
print("X_Test: ", X_test)

X_Train:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
X_Test:  [0.02745098 0.00784314 0.00392157 ... 0.01568628 0.01960784 0.02352941]


In [11]:
print("y_Train: ", y_train)
print("y_Test: ", y_test)

y_Train:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
y_Test:  [5 0 4 ... 5 6 8]


In [12]:
# Convert labels to integers
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)

In [13]:
print("y_Train: ", y_train)
print("y_Test: ", y_test)

y_Train:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
y_Test:  [5 0 4 ... 5 6 8]
