In [6]:
import os, ssl
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm #tqdm is for displaying progress bar
import tarfile

# Folder whose presence is checked further down to decide whether the archive
# still needs to be extracted; also the name callers pass to the batch helpers.
cifar10_dataset_folder_path = 'cifar-10-batches-py'

class DownloadProgress(tqdm):
    """tqdm progress bar that can be driven by a ``urlretrieve`` report hook."""

    # Block index seen on the previous hook call; the bar advances by the
    # difference between the current index and this one.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """Advance the bar according to one progress report.

        block_num  -- index of the block reported by the caller.
        block_size -- size of a single block (same unit as the bar).
        total_size -- overall size; stored as the bar's total on every call.
        """
        self.total = total_size
        blocks_since_last_call = block_num - self.last_block
        self.update(blocks_since_last_call * block_size)
        self.last_block = block_num

# Download the CIFAR-10 archive once; later runs reuse the local copy.
if not isfile('cifar-10-python.tar.gz'):
    # NOTE(review): when PYTHONHTTPSVERIFY is unset this turns off TLS certificate
    # verification for every HTTPS request made by this kernel, not just this
    # download. Kept as-is because it looks like a deliberate workaround for
    # machines without a usable CA bundle -- confirm it is still needed.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    # Download to a temporary name and rename only on success. Writing directly
    # to the final name would leave a truncated archive behind after an
    # interrupted download, and the isfile() guard above would then skip the
    # download forever.
    partial_path = 'cifar-10-python.tar.gz.part'
    try:
        with DownloadProgress(unit='B',unit_scale=True,miniters=1,desc='CIFAR-10 DATASET') as pbar:
            urlretrieve(
                'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
                partial_path,
                pbar.hook)
        os.replace(partial_path, 'cifar-10-python.tar.gz')
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete download.
        if isfile(partial_path):
            os.remove(partial_path)
        
            
# Unpack the archive only when the dataset folder is not already present.
if not isdir(cifar10_dataset_folder_path):
    with tarfile.open('cifar-10-python.tar.gz') as tar:
        # Use the 'data' extraction filter where this Python version provides
        # it: the archive comes from the network, and the filter refuses
        # members that would land outside the destination directory.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(filter='data')
        else:
            tar.extractall()
    # No explicit tar.close(): leaving the with-block closes the archive.

CIFAR-10 DATASET: 171MB [02:48, 1.01MB/s]                              


Python's `pickle` module implements binary protocols for serializing and deserializing Python object structures.

In [7]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

### Understanding Data

The original batch data is a 10,000 x 3,072 tensor stored as a NumPy array, where the number of rows (10,000) is the number of samples in the batch, as stated in the CIFAR-10 dataset description. <br>
Each row vector of 3,072 values represents one color image of 32 x 32 pixels. <br>
Since this project uses a convolutional neural network for the classification task, a flat row vector of 3,072 values is not an appropriate form of image data to feed to the model. <br>
In order to feed image data into a CNN model, the tensor representing an image should have the dimensions width x height x number of channels.


In [8]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [11]:
def load_label_names():
    """Return a new list of the ten class names; the list index is the numeric label."""
    class_names = (
        'airplane',
        'automobile',
        'bird',
        'cat',
        'deer',
        'dog',
        'frog',
        'horse',
        'ship',
        'truck',
    )
    # Hand back a fresh list on every call so callers may mutate their copy.
    return list(class_names)

The label data is simply a list of 10,000 numbers in the range 0 to 9, each of which corresponds to one of the 10 classes. <br>
The image data, in contrast, needs to be reshaped into a new shape (described below). <br>
We also need to define the label names so that every numeric label can be mapped to its class name. <br>
The row vector of 3,072 values has exactly as many elements as 32 x 32 x 3 (image width x image height x number of channels).


In order to reshape the vector, 2 steps are required:
1. Divide the row vector into 3 pieces. Each piece corresponds to one channel. This results in a 3 x 1024 tensor.
2. Divide each piece from the previous step into rows of 32 values, where 32 is the width of the image. The image is now represented as 3 x 32 x 32 (number of channels x width x height). This is the shape to pass to the `reshape` function, because it follows the order in which the values are laid out in the row vector. However, number of channels x width x height is not the layout that TensorFlow and matplotlib expect. They expect width x height x number of channels, so we reorder the axes using the `transpose` function.


In [13]:
def display_stats(cifar10_dataset_folder_path,batch_id,sample_id):
    """Print summary statistics for one batch and show one sample image from it.

    cifar10_dataset_folder_path -- passed through to the batch loader.
    batch_id                    -- passed through to the batch loader; also echoed in the output.
    sample_id                   -- index of the sample to display; must satisfy
                                   0 <= sample_id < number of samples in the batch.

    Returns None. When sample_id is out of range a message is printed and
    nothing else is shown.
    """
    # NOTE(review): the loader is spelled `load_cfar10_batch` (no "i") -- confirm
    # that this matches the name of its definition elsewhere in the notebook.
    features,labels = load_cfar10_batch(cifar10_dataset_folder_path,batch_id)

    if not (0 <= sample_id < len(features)):
        print('{} samples in batch {}. {} is out of range.'.format(len(features),batch_id,sample_id))
        return None

    print('\n Stats of batch #{}:'.format(batch_id))
    # .format() must be applied to the string, not to print()'s return value
    # (None), otherwise this line raises AttributeError.
    print('# of Samples: {}\n'.format(len(features)))

    label_names = load_label_names()
    # Map each distinct label value to the number of times it occurs.
    label_counts = dict(zip(*np.unique(labels,return_counts=True)))
    for key, value in label_counts.items():
        print('Label counts of [{}]({}):{}'.format(key,label_names[key].upper(),value))

    sample_image = features[sample_id]
    sample_label = labels[sample_id]

    print('\n Example of Image {}:'.format(sample_id))
    print('Image - Min Value: {} Max Value: {}'.format(sample_image.min(),sample_image.max()))
    print('Image - Shape: {}'.format(sample_image.shape))
    print('Label - Label Id: {} Name: {}'.format(sample_label,label_names[sample_label]))

    plt.imshow(sample_image)