In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Preparing Data
+ Download
+ Labeling
+ Splitting

In [2]:
import sys
import os
import urllib.request
import tarfile
import zipfile

In [3]:
# URL of the gzipped tarball that is downloaded and unpacked below.
data_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

### Data Download

In [4]:
def download_process_bar(count, block_size, total_size):
    """
    Report hook for ``urllib.request.urlretrieve`` that prints download progress.

    Each call rewrites a single status line on stdout (the message starts with
    a carriage return, so successive calls overwrite one another).

    :param count: Number of blocks transferred so far.
    :param block_size: Size of one block in bytes.
    :param total_size: Total size of the file in bytes. A non-positive value
        means the size is unknown (``urlretrieve`` documents that it may pass
        -1 in that case).
    """
    downloaded = count * block_size
    if total_size > 0:
        # The final block is usually only partly filled, so the raw ratio can
        # overshoot 1.0; clamp it so the display never exceeds 100%.
        completed = min(1.0, float(downloaded) / total_size)
        msg = "\r- Download progress: {0:.1%}".format(completed)
    else:
        # Unknown total size: a percentage is meaningless (and a zero total
        # would raise ZeroDivisionError), so show the byte count instead.
        msg = "\r- Download progress: {0} bytes".format(downloaded)
    sys.stdout.write(msg)
    sys.stdout.flush()

In [5]:
def download_and_extract(url, download_dir):
    """
    Download and extract the data if it doesn't already exist.

    The archive counts as already present when a file named after the last
    path component of ``url`` exists inside ``download_dir``; in that case
    nothing is downloaded or extracted and only a notice is printed.

    :param url: Internet URL of the archive to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    :param download_dir: Directory where the downloaded file is saved and
        into which the archive is unpacked.
        Example: "data/CIFAR-10/"
    """
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)

    if os.path.exists(file_path):
        print("Data has apparently already been downloaded and unpacked.")
        return

    # Create the download directory if needed (no error if it already exists).
    os.makedirs(download_dir, exist_ok=True)

    # Download the file from the internet.
    file_path, _ = urllib.request.urlretrieve(url=url,
                                              filename=file_path,
                                              reporthook=download_process_bar)

    print()
    print("Download finished. Extracting files.")

    # Context managers guarantee the archive handle is closed even when
    # extraction fails part-way through.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with archives from a trusted source.
    if file_path.endswith(".zip"):
        # Unpack the zip-file.
        with zipfile.ZipFile(file=file_path, mode="r") as archive:
            archive.extractall(download_dir)
    elif file_path.endswith((".tar.gz", ".tgz")):
        # Unpack the tar-ball.
        with tarfile.open(name=file_path, mode="r:gz") as archive:
            archive.extractall(download_dir)

    print("Done.")
        

### Loading Data

In [6]:
import pickle
import os
import platform

In [7]:
def load_binary(f):
    """
    Unpickle a single object from an open binary file.

    :param f: File-like object opened for reading in binary mode.
    :return: The unpickled object.
    :raises ValueError: If the running interpreter is not Python 3.
    """
    ver = platform.python_version_tuple()
    if ver[0] == '3':
        # NOTE(review): pickle can execute arbitrary code while loading; only
        # use this on files from a trusted source.
        # encoding='latin1' is presumably needed because the batch files were
        # pickled under Python 2 -- TODO confirm.
        return pickle.load(f, encoding='latin1')
    # Raise (not return) the error, and report the version actually detected.
    raise ValueError("invalid python version: {}".format(".".join(ver)))

In [8]:
def load_cifar_batch(filename):
    """
    Load one batch file and return its images and labels as arrays.

    The unpickled object is expected to be a mapping with a ``'data'`` entry
    (assumed to be a NumPy array with 3*32*32 values per row -- TODO confirm)
    and a ``'labels'`` entry.

    :param filename: Path of the batch file to read.
    :return: Tuple ``(x, y)`` where ``x`` is a float array of shape
        ``(N, 32, 32, 3)`` and ``y`` is an array built from the labels.
    """
    with open(filename, 'rb') as f:
        data = load_binary(f)
    images = data['data']
    labels = data['labels']
    # Each row is viewed as (3, 32, 32) and then moved to (32, 32, 3).
    # -1 lets NumPy infer the number of images, so batches of any size load
    # (the previous hard-coded 10000 rejected everything else).
    x = images.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
    y = np.array(labels)
    return x, y

In [19]:
def load_cifar10(rootdir):
    """
    Load the five training batches and the test batch found in ``rootdir``.

    :param rootdir: Directory holding ``data_batch_1`` .. ``data_batch_5``
        and ``test_batch``.
    :return: Tuple ``(xtrain, ytrain, xtest, ytest)``; the training arrays
        are the five batches concatenated in batch order.
    """
    batch_paths = [os.path.join(rootdir, 'data_batch_%d' % (idx,))
                   for idx in range(1, 6)]
    loaded = [load_cifar_batch(path) for path in batch_paths]
    images, labels = zip(*loaded)

    train_x = np.concatenate(images)
    train_y = np.concatenate(labels)
    # Drop the per-batch references before the test batch is read.
    del loaded, images, labels

    test_x, test_y = load_cifar_batch(os.path.join(rootdir, 'test_batch'))
    return train_x, train_y, test_x, test_y

## KNN Class Definition

In [10]:
class KNN_image_recognesion():
    """
    Brute-force k-nearest-neighbour classifier using Euclidean distance.

    Training only stores the data; all work happens at prediction time.
    Samples must be 2-D arrays with one row per sample (the distance
    computation sums over axis 1 and uses a matrix product), and labels must
    be a NumPy array of non-negative integers (required by ``np.bincount``).
    """

    def __init__(self):
        pass

    def fit_train(self, X, Y):
        """
        Store the training set.

        :param X: Training samples, shape ``(n_train, n_features)``.
        :param Y: Training labels, shape ``(n_train,)``.
        """
        # Use the parameters themselves; the lowercase names `x` / `y` are
        # not defined in this scope.
        self.xtrain = X
        self.ytrain = Y

    def predict(self, X, k=1):
        """
        Predict a label for every row of ``X``.

        :param X: Test samples, shape ``(n_test, n_features)``.
        :param k: Number of nearest neighbours that vote.
        :return: Float array of shape ``(n_test,)`` with the predicted labels.
        """
        dist = self.claculate_distance(X)
        return self.predict_labels(dist, k=k)

    def claculate_distance(self, X):
        """
        Compute the Euclidean distance between every test and training sample.

        Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so that the
        whole matrix is produced without Python-level loops.

        :param X: Test samples, shape ``(n_test, n_features)``.
        :return: Array of shape ``(n_test, n_train)``.
        """
        train_sq = np.sum(np.square(self.xtrain), axis=1)
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]
        sq_dist = train_sq + test_sq - 2 * np.dot(X, self.xtrain.T)
        # Floating-point cancellation can leave tiny negative values where the
        # true distance is zero; clamp them so sqrt does not produce NaN.
        return np.sqrt(np.maximum(sq_dist, 0))

    def predict_labels(self, dist, k=1):
        """
        Turn a distance matrix into predicted labels by majority vote.

        :param dist: Distances of shape ``(n_test, n_train)``.
        :param k: Number of nearest neighbours that vote.
        :return: Float array of shape ``(n_test,)``; on a tied vote the
            smallest label wins (``np.argmax`` returns the first maximum).
        """
        n_test = dist.shape[0]
        y_pred = np.zeros(n_test)

        for i in range(n_test):
            nearest = np.argsort(dist[i])[:k]
            votes = np.bincount(self.ytrain[nearest])
            y_pred[i] = np.argmax(votes)

        return y_pred
    

In [11]:
# Directory the archive is saved to and unpacked in.
download_dir = "/kaggle/working/"

In [12]:
# Fetch and unpack the archive; if the archive file is already present in
# download_dir this only prints a notice.
download_and_extract(data_url,download_dir)

- Download progress: 100.0%
Download finished. Extracting files.
Done.


In [16]:
# List the unpacked batch files.
# NOTE(review): the path is relative, so this presumably relies on the
# kernel's working directory being the download directory -- confirm.
!ls "cifar-10-batches-py"

batches.meta  data_batch_2  data_batch_4  readme.html
data_batch_1  data_batch_3  data_batch_5  test_batch


In [17]:
# Folder produced by unpacking the archive. Derived from download_dir so the
# two locations cannot drift apart if the download directory changes.
cifar10_dir = os.path.join(download_dir, "cifar-10-batches-py")

In [20]:
# Load the five training batches and the test batch from the unpacked folder.
X_train, y_train, X_test, y_test = load_cifar10(cifar10_dir)
# Print every array's shape as a quick check that the load worked.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Training data shape:  (50000, 32, 32, 3)
Training labels shape:  (50000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)


In [None]:
# Human-readable class names. Presumably position i names integer label i in
# y_train / y_test -- TODO confirm the order against batches.meta.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

In [None]:
# Sample data checking
n_classess = len(classes)
num_sample = 10

