In [163]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import tensorflow as tf
import glob
import sys
# you shouldn't need to make any more imports

In [97]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [259]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[np.arange(y.shape[0]),y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)
    

In [212]:
# Load the data, using utility functions from HW1
data_root_path = 'cifar10-hw1/'
X, Y = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'airplane': 0, 'frog': 6, 'horse': 7, 'truck': 9, 'bird': 2, 'automobile': 1, 'ship': 8, 'cat': 3, 'dog': 5, 'deer': 4}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


In [226]:
# Hold out 10% of the data to use as a validation set:
#y_OH = tf.one_hot(Y,10)
train_inds = np.random.choice(X.shape[1],int(X.shape[1]*.90),replace=False)
validation_inds = np.setdiff1d(np.arange(X.shape[1]),train_inds)
X_train = X[:,train_inds].astype(np.float32)
y_train = Y[train_inds].astype(np.int32)
X_validation = X[:,validation_inds].astype(np.float32)
y_validation = Y[validation_inds].astype(np.int32)
#X_placeholder = tf.placeholder(X_train.dtype, X_train.shape)
#y_placeholder = tf.placeholder(y_train.dtype, y_train.shape)
#dataset_train = tf.contrib.data.Dataset.from_tensor_slices((X_placeholder, y_placeholder))
#iterator = dataset_train.make_initializable_iterator()
#sess.run(iterator.initializer, feed_dict={X_placeholder: X_train,
                                          #y_placeholder: tfy_train})

In [250]:
def get_batch(X, y, batch_size):
    selector = np.random.choice(y.shape[1], batch_size, replace=False)
    return X[:, selector].T, y[:,selector].T

In [269]:
sess = tf.InteractiveSession()

In [102]:
with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, [None, 3072], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input' )

In [268]:
def weight_variable(shape):
    """Create a weight variable with appropriate initialization."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    """Create a bias variable with appropriate initialization."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
      mean = tf.reduce_mean(var)
      tf.summary.scalar('mean', mean)
      with tf.name_scope('stddev'):
        stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
      tf.summary.scalar('stddev', stddev)
      tf.summary.scalar('max', tf.reduce_max(var))
      tf.summary.scalar('min', tf.reduce_min(var))
      tf.summary.histogram('histogram', var)

In [103]:
W = tf.Variable(tf.zeros([3072, 10]))
b = tf.Variable(tf.zeros([10]))

In [104]:
y = tf.nn.softmax(tf.matmul(x, W) + b)

In [106]:
cross_entropy = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

In [107]:
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)


In [166]:
tf.global_variables_initializer().run()

In [251]:
for _ in range(1000):
  batch_xs, batch_ys = get_batch(X_train, one_hot(y_train), 100)
  sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

In [254]:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))


In [255]:
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [265]:
print(sess.run(accuracy, feed_dict={x: X_validation.T, y_: one_hot(y_validation).T}))


0.2384


<tf.Tensor 'Placeholder_7:0' shape=(?, 3072) dtype=float32>

<tf.Tensor 'Softmax_2:0' shape=(?, 10) dtype=float32>

TensorShape([Dimension(None), Dimension(10)])

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

(50000,)

TensorShape([Dimension(50000), Dimension(10)])

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.], dtype=float32)

(50000, 10)

(3072, 45000)

45000

<BatchDataset shapes: ((?, 3072), (?,)), types: (tf.float32, tf.float32)>

(45000, 10)

7

(3072, 5000)