In [1]:
def compute_cost_function(X, Y, theta, lambda_factor, temp_parameter):
    """
    Computes the regularized softmax cost averaged over every datapoint.

    The cost is the mean negative log-probability that the model assigns to
    the true label of each datapoint, plus an L2 penalty on theta:

        c = -1/n * sum_i log p(Y[i] | X[i]) + lambda_factor/2 * sum(theta**2)

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each
            data point
        theta - (k, d) NumPy array, where row j represents the parameters of our
                model for label j
        lambda_factor - the regularization constant (scalar)
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns
        c - the cost value (scalar)
    """
    labels = np.asarray(Y, dtype=int)
    num_points = labels.shape[0]

    # (k, n) scores; subtracting the per-column maximum leaves the softmax
    # unchanged but keeps np.exp from overflowing.
    scores = np.dot(theta, X.T) / temp_parameter
    scores = scores - scores.max(axis=0)

    # Log-softmax computed directly, so no log(0) can appear.
    log_probabilities = scores - np.log(np.exp(scores).sum(axis=0))

    # Pick log p(Y[i] | X[i]) for each datapoint with integer indexing; this
    # avoids materialising any (n, k) indicator or (n, n) product.
    true_label_log_probs = log_probabilities[labels, np.arange(num_points)]

    data_cost = -np.mean(true_label_log_probs)
    regularization_cost = lambda_factor / 2 * np.sum(np.square(theta))
    return data_cost + regularization_cost

In [2]:
def run_softmax_on_MNIST(temp_parameter=1):
    """
    End-to-end softmax experiment on MNIST.

    Loads the MNIST data, fits softmax regression on the training set, plots
    the cost history, evaluates the error on the test set and stores the
    learned parameters in ./theta.pkl.gz. Training uses these fixed settings:
    alpha = 0.3
    lambda = 1e-4
    num_iterations = 150
    k = 10

    Args:
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns:
        Final test error
    """
    train_x, train_y, test_x, test_y = get_MNIST_data()

    learned_theta, cost_history = softmax_regression(
        train_x,
        train_y,
        temp_parameter,
        alpha=0.3,
        lambda_factor=1.0e-4,
        k=10,
        num_iterations=150,
    )
    plot_cost_function_over_time(cost_history)

    test_error = compute_test_error(test_x, test_y, learned_theta, temp_parameter)

    # Persist the trained parameters so they can be reloaded later.
    write_pickle_data(learned_theta, "./theta.pkl.gz")

    # TODO: add your code here for the "Using the Current Model" question in tab 4.
    #      and print the test_error_mod3
    return test_error

In [3]:

def softmax_regression(X, Y, temp_parameter, alpha, lambda_factor, k, num_iterations):
    """
    Fits softmax regression with batch gradient descent.

    theta starts as the all-zeros (k, d) array, where row j holds the
    parameters for label j (j = 0, 1, ..., k-1). On every iteration the cost of
    the current theta is recorded first, then one gradient step is taken.

    Args:
        X - (n, d - 1) NumPy array (n data points, each with d-1 features)
        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each
            data point
        temp_parameter - the temperature parameter of softmax function (scalar)
        alpha - the learning rate (scalar)
        lambda_factor - the regularization constant (scalar)
        k - the number of labels (scalar)
        num_iterations - the number of iterations to run gradient descent (scalar)

    Returns:
        theta - (k, d) NumPy array that is the final value of parameters theta
        cost_function_progression - a Python list containing the cost calculated
                                    before each step of gradient descent
    """
    augmented_X = augment_feature_vector(X)
    num_features = augmented_X.shape[1]

    theta = np.zeros([k, num_features])
    cost_function_progression = []

    for _ in range(num_iterations):
        current_cost = compute_cost_function(
            augmented_X, Y, theta, lambda_factor, temp_parameter)
        cost_function_progression.append(current_cost)

        theta = run_gradient_descent_iteration(
            augmented_X, Y, theta, alpha, lambda_factor, temp_parameter)

    return theta, cost_function_progression


def get_classification(X, theta, temp_parameter):
    """
    Predicts a label for every data point in X.

    The features are augmented with the constant-1 column, the (k, n) label
    probabilities are computed, and the most probable label is chosen for each
    data point.

    Args:
        X - (n, d - 1) NumPy array (n data points, each with d - 1 features)
        theta - (k, d) NumPy array where row j represents the parameters of our model for
                label j
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns:
        Y - (n, ) NumPy array, containing the predicted label (a number between 0-9) for
            each data point
    """
    augmented_X = augment_feature_vector(X)
    label_probabilities = compute_probabilities(augmented_X, theta, temp_parameter)
    # Rows index labels, so the arg-max down each column is the prediction.
    predicted_labels = np.argmax(label_probabilities, axis=0)
    return predicted_labels


def plot_cost_function_over_time(cost_function_history):
    """
    Plots the recorded cost values against their iteration index and shows the figure.

    Args:
        cost_function_history - sequence of cost values, one per iteration
    """
    iteration_numbers = range(len(cost_function_history))
    plt.plot(iteration_numbers, cost_function_history)
    plt.xlabel('Iteration number')
    plt.ylabel('Cost Function')
    plt.show()


def compute_test_error(X, Y, theta, temp_parameter):
    """
    Computes the fraction of data points whose predicted label differs from Y.

    Args:
        X - (n, d - 1) NumPy array (n data points, each with d - 1 features)
        Y - (n, ) NumPy array containing the true label of each data point
        theta - (k, d) NumPy array where row j represents the parameters of our
                model for label j
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns:
        The classification error, i.e. 1 minus the mean accuracy (scalar)
    """
    assigned_labels = get_classification(X, theta, temp_parameter)
    return 1 - np.mean(assigned_labels == Y)

In [4]:
import pickle, gzip, numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import math


def plot_images(X):
    """
    Displays one or more 28x28 images in a roughly square grid.

    Args:
        X - either a single flattened image (1D array of 784 values) or a 2D
            array with one flattened image per row
    """
    # Promote a single image to a one-row batch so both cases share one path.
    images = np.array([X]) if X.ndim == 1 else X

    num_images = images.shape[0]
    num_rows = math.floor(math.sqrt(num_images))
    num_cols = math.ceil(num_images / num_rows)

    for position in range(1, num_images + 1):
        picture = images[position - 1, :].reshape(28, 28)
        plt.subplot(num_rows, num_cols, position)
        plt.imshow(picture, cmap=cm.Greys_r)
        plt.axis('off')
    plt.show()


def pick_examples_of(X, Y, labels, total_count):
    """
    Selects the first `total_count` examples whose label is one of `labels`.

    Args:
        X - (n, d) NumPy array of data points
        Y - (n, ) NumPy array with the label of each data point
        labels - iterable of label values to keep; if empty, nothing is selected
        total_count - maximum number of examples to return

    Returns:
        (filtered_x, filtered_y) - the matching rows of X and entries of Y,
        in their original order, truncated to total_count
    """
    # Start from an all-False mask so an empty `labels` selects no rows
    # (indexing with None would instead add a new axis to X and Y).
    keep_mask = np.zeros(np.shape(Y), dtype=bool)
    for label in labels:
        keep_mask |= (Y == label)

    filtered_x = X[keep_mask]
    filtered_y = Y[keep_mask]
    return (filtered_x[:total_count], filtered_y[:total_count])


def extract_training_and_test_examples_with_labels(train_x, train_y, test_x, test_y, labels, training_count, test_count):
    """
    Restricts the training and test sets to the given labels.

    Applies pick_examples_of to each split, keeping at most training_count
    training examples and test_count test examples.

    Returns:
        (filtered_train_x, filtered_train_y, filtered_test_x, filtered_test_y)
    """
    train_subset = pick_examples_of(train_x, train_y, labels, training_count)
    test_subset = pick_examples_of(test_x, test_y, labels, test_count)

    filtered_train_x, filtered_train_y = train_subset
    filtered_test_x, filtered_test_y = test_subset
    return (filtered_train_x, filtered_train_y, filtered_test_x, filtered_test_y)

def write_pickle_data(data, file_name):
    """
    Pickles `data` into a gzip-compressed file.

    Args:
        data - any picklable Python object
        file_name - path of the gzip file to create or overwrite
    """
    # The context manager closes the file even if pickling raises.
    with gzip.open(file_name, 'wb') as f:
        pickle.dump(data, f)

def read_pickle_data(file_name):
    """
    Loads a pickled object from a gzip-compressed file.

    Args:
        file_name - path of the gzip file to read

    Returns:
        The unpickled object

    NOTE: unpickling can execute arbitrary code; only load files from a
    trusted source.
    """
    # The context manager closes the file even if unpickling raises.
    with gzip.open(file_name, 'rb') as f:
        # latin1 lets pickles written by Python 2 be decoded under Python 3.
        return pickle.load(f, encoding='latin1')

def get_MNIST_data():
    """
    Loads the MNIST pickle and returns training and test splits.

    The file holds (train, validation, test) sets; the validation examples are
    appended to the training examples before returning.

    Returns:
        train_x - 2D Numpy array (n, d) where each row is an image
        train_y - 1D Numpy array (n, ) where each row is a label
        test_x  - 2D Numpy array (n, d) where each row is an image
        test_y  - 1D Numpy array (n, ) where each row is a label

    """
    mnist_splits = read_pickle_data('/dbfs/FileStore/tables/mnist_pkl-d4040.gz')
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = mnist_splits

    # Fold the validation split into the training split.
    combined_train_x = np.vstack((train_x, valid_x))
    combined_train_y = np.append(train_y, valid_y)

    return (combined_train_x, combined_train_y, test_x, test_y)

def load_train_and_test_pickle(file_name):
    """
    Reads a gzip pickle holding exactly four items and returns them as
    (train_x, train_y, test_x, test_y).
    """
    contents = read_pickle_data(file_name)
    train_x, train_y, test_x, test_y = contents
    return train_x, train_y, test_x, test_y

# returns the feature set in a numpy ndarray
def load_CSV(filename):
    """
    Loads a comma-separated numeric file into a NumPy ndarray.

    Args:
        filename - path of the CSV file to read

    Returns:
        NumPy ndarray with the parsed values
    """
    # Open explicitly so the handle is closed as soon as parsing finishes
    # (or fails), instead of being left for the garbage collector.
    with open(filename, 'rb') as csv_file:
        return np.asarray(np.loadtxt(csv_file, delimiter=','))


In [5]:
def augment_feature_vector(X):
    """
    Prepends a constant feature equal to 1 to every data point.

    Args:
        X - a NumPy matrix of n data points, each with d - 1 features

    Returns: X_augment, an (n, d) NumPy array whose first column is all ones
             and whose remaining columns are X
    """
    num_points = len(X)
    bias_column = np.ones((num_points, 1))
    X_augment = np.hstack((bias_column, X))
    return X_augment


def compute_probabilities(X, theta, temp_parameter):
    """
    Computes, for each datapoint X[i], the probability that X[i] is labeled as j
    for j = 0, 1, ..., k-1

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        theta - (k, d) NumPy array, where row j represents the parameters of our model for label j
        temp_parameter - the temperature parameter of softmax function (scalar)
    Returns:
        H - (k, n) NumPy array, where each entry H[j][i] is the probability that X[i] is labeled as j
    """
    # (k, n) scores, computed once and reused below.
    scores = np.dot(theta, X.T) / temp_parameter

    # Subtracting each column's maximum does not change the softmax but keeps
    # np.exp from overflowing.
    exp_scores = np.exp(scores - scores.max(axis=0))

    # Normalise every column so the k probabilities of a datapoint sum to 1.
    return exp_scores / exp_scores.sum(axis=0)


def compute_cost_function(X, Y, theta, lambda_factor, temp_parameter):
    """
    Computes the regularized softmax cost averaged over every datapoint.

    The cost is the mean negative log-probability that the model assigns to
    the true label of each datapoint, plus an L2 penalty on theta:

        c = -1/n * sum_i log p(Y[i] | X[i]) + lambda_factor/2 * sum(theta**2)

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each
            data point
        theta - (k, d) NumPy array, where row j represents the parameters of our
                model for label j
        lambda_factor - the regularization constant (scalar)
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns
        c - the cost value (scalar)
    """
    labels = np.asarray(Y, dtype=int)
    num_points = labels.shape[0]

    # (k, n) scores; subtracting the per-column maximum leaves the softmax
    # unchanged but keeps np.exp from overflowing.
    scores = np.dot(theta, X.T) / temp_parameter
    scores = scores - scores.max(axis=0)

    # Log-softmax computed directly, so no log(0) can appear.
    log_probabilities = scores - np.log(np.exp(scores).sum(axis=0))

    # Pick log p(Y[i] | X[i]) for each datapoint with integer indexing; this
    # avoids materialising any (n, k) indicator or (n, n) product.
    true_label_log_probs = log_probabilities[labels, np.arange(num_points)]

    data_cost = -np.mean(true_label_log_probs)
    regularization_cost = lambda_factor / 2 * np.sum(np.square(theta))
    return data_cost + regularization_cost


In [6]:
import scipy.sparse as sparse

In [7]:
def compute_probabilities(X, theta, temp_parameter):
    """
    Computes, for each datapoint X[i], the probability that X[i] is labeled as j
    for j = 0, 1, ..., k-1

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        theta - (k, d) NumPy array, where row j represents the parameters of our model for label j
        temp_parameter - the temperature parameter of softmax function (scalar)
    Returns:
        H - (k, n) NumPy array, where each entry H[j][i] is the probability that X[i] is labeled as j
    """
    # (k, n) scores, computed once and reused below.
    scores = np.dot(theta, X.T) / temp_parameter

    # Subtracting each column's maximum does not change the softmax but keeps
    # np.exp from overflowing.
    exp_scores = np.exp(scores - scores.max(axis=0))

    # Normalise every column so the k probabilities of a datapoint sum to 1.
    return exp_scores / exp_scores.sum(axis=0)
  
def run_gradient_descent_iteration(X, Y, theta, alpha, lambda_factor, temp_parameter):
    """
    Runs one step of batch gradient descent

    The gradient of the regularized softmax cost with respect to theta is

        -1/(n * temp_parameter) * (M - H) X + lambda_factor * theta

    where H is the (k, n) matrix of label probabilities and M is the (k, n)
    indicator matrix with M[j][i] = 1 when Y[i] == j and 0 otherwise.

    Args:
        X - (n, d) NumPy array (n datapoints each with d features)
        Y - (n, ) NumPy array containing the labels (a number from 0-9) for each
            data point
        theta - (k, d) NumPy array, where row j represents the parameters of our
                model for label j
        alpha - the learning rate (scalar)
        lambda_factor - the regularization constant (scalar)
        temp_parameter - the temperature parameter of softmax function (scalar)

    Returns:
        theta - (k, d) NumPy array holding the parameters after this step
    """
    num_points = Y.shape[0]
    num_labels = theta.shape[0]

    # (k, n) label probabilities from the numerically stable helper.
    H = compute_probabilities(X, theta, temp_parameter)

    # Broadcasting a (k, 1) column of label ids against Y builds the (k, n)
    # indicator matrix without tiling either operand.
    label_indicator = (np.arange(num_labels)[:, np.newaxis] == Y).astype(float)

    data_gradient = -np.dot(label_indicator - H, X) / (num_points * temp_parameter)
    gradtheta = data_gradient + lambda_factor * theta

    return theta - alpha * gradtheta

In [8]:
# Train on MNIST with temperature 1 and print the test error that
# run_softmax_on_MNIST returns.
print('softmax test_error=', run_softmax_on_MNIST(temp_parameter=1))