In [169]:
import numpy as np 
import pandas as pd
import sklearn as sk
import tensorflow as tf
from sklearn import preprocessing
from sklearn import datasets
from sklearn.model_selection import train_test_split
import plotly.express as px

# Part 1

In [36]:
def softmax(u):
    return np.exp(u)/np.sum(np.exp(u))

In [58]:
def cross_entropy(p, q):
    return -np.sum(p*np.log(q))

In [59]:
def f(xi, beta):
    K = beta.shape[0]
    weighted_combos[k] = np.zeros(K)
    
    for k in range(K):
        weighted_combos[k] = beta[k, 0] + np.sum(beta[k, 1:]*xi)
    
    return softmax(weighted_combos)

In [60]:
def f_fast(xi, beta):
    xiHat = np.insert(xi,0,1)
    return softmax(beta @ xiHat)

In [88]:
def eval_L(X, y, beta):
    N = X.shape[0]
    L = 0.0
    
    for i in range(N):
        xiHat = X[i]
        yi = y[i]
        qi = softmax(beta @ xiHat)
        
        #predicted probability vector
        L += cross_entropy(yi, qi)
        
    return L
        

## Gradient Descent 
Gradient descent function, if stochastic, then batchsize is set to size of the entire dataset and thus is no longer stocastic gradient descent. If no num is given but is Stocastic, then batch size is set to 4

In [172]:
def gradientDescent(X, y, alpha, iterations, isStocastic = False, batchSize = 4):
    if not isStocastic:
        batchSize = len(X)

    num_epochs = iterations
    N, d = X.shape
    X = np.insert(X,0,1,axis = 1)
    K = y.shape[1]
    
    beta = np.zeros((K, d + 1))
    Lvals = []
    
    for ep in range(num_epochs):
        L = eval_L(X,y,beta)
        Lvals.append(L)
        
        prm = np.random.permutation(N)
        for i in prm:
            XiHat = X[i]
            Yi = y[i]
            
            qi = softmax(beta @ XiHat)
            grad_Li = np.outer(qi - Yi, XiHat)
            
            beta = beta - alpha*grad_Li
            
    return beta, Lvals

# Make predictions
Function to make predictions given predetermined beta values

In [112]:
def predictLabels(X, beta):
    X = np.insert(X,0,1, axis = 1)
    N = X.shape[0]
    
    predictions = []
    
    for i in range(N):
        XiHat = X[i]
        qi = softmax(beta@XiHat)
        
        k = np.argmax(qi)
        predictions.append(k)
        
    return predictions

# Iris Data
Stochastic set to False

In [176]:
def trainAndTestIris():
    iris_data = sk.datasets.load_iris()
    X = iris_data.data
    y = iris_data.target
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
    Y_train = pd.get_dummies(Y_train).values

    alpha = 0.01

    beta, Lvals = gradientDescent(X_train, Y_train, alpha, 5000, False)
    
    Predicted_y = predictLabels(X_test, beta)
    
    df = pd.DataFrame({'True_y':Y_test, 'Predicted_y':Predicted_y})
    dfRight = df.loc[df['True_y'] == df['Predicted_y']]
    dfWrong = df.loc[df['True_y'] != df['Predicted_y']]
    print("Number correct: " + str(dfRight.shape[0]))
    print("Number wrong: " + str(dfWrong.shape[0]))
    print("Percent correct: " + str(dfRight.shape[0]*100/df.shape[0]) + "%")

    
    df = pd.DataFrame({'Iterations':np.arange(len(Lvals)), 'L-Value':Lvals})
    fig = px.line(df, x='Iterations', y="L-Value", title='Objective function value versus iteration of gradient descent')
    fig.show()
trainAndTestIris()

Number correct: 28
Number wrong: 2
Percent correct: 93.33333333333333%


# Part 2

## Train and Test MNIST Data
The classifier cosistantly gets around 92% accuracy with the highest error rate being found in the numbers 3, 2, 8, 5, and 9. I believe this could be in part because they can all somewhat look like an 8 if you stretch out the ends which would alsob confuse the 8 with others. Zero and one are the least to be mistaken which makes sense to me as they are very specific shapes and very different than the rest.

In [175]:
def trainAndTestMnist():
    (X_train, Y_train), (X_test, Y_test) = tf.keras.datasets.mnist.load_data()

    X_train = X_train/255.0
    X_test = X_test/255.0
    

    N_train, numRows, NumCols = X_train.shape
    X_train = np.reshape(X_train, (N_train, numRows*NumCols))

    Y_train = pd.get_dummies(Y_train).values

    alpha = 0.01

    beta, Lvals = gradientDescent(X_train, Y_train, alpha, 5, True, 4)
    
    N_test = X_test.shape[0]
    X_test = np.reshape(X_test, (N_test, numRows*NumCols))
    Predicted_y = predictLabels(X_test, beta)
    
    df = pd.DataFrame({'True_y':Y_test, 'Predicted_y':Predicted_y})
    dfRight = df.loc[df['True_y'] == df['Predicted_y']]
    dfWrong = df.loc[df['True_y'] != df['Predicted_y']]
    print("Number correct: " + str(dfRight.shape[0]))
    print("Number wrong: " + str(dfWrong.shape[0]))
    print("Percent correct: " + str(dfRight.shape[0]*100/df.shape[0]) + "%")
    print("Counts of errors: ")
    return dfWrong['True_y'].value_counts()
trainAndTestMnist()

Number correct: 9180
Number wrong: 820
Percent correct: 91.8%
Counts of errors: 


5    202
9    113
2    106
7     90
8     75
4     65
3     63
6     62
1     31
0     13
Name: True_y, dtype: int64