In [20]:
from google.colab import drive

# Mount Google Drive
# The mount point is /content/drive; the google.colab package is only importable inside Colab.
# NOTE(review): no cell visible in this notebook reads from or writes to the mount (the dataset
# is downloaded into the working directory) — confirm the mount is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import numpy as np
# `plt` is the conventional alias for the pyplot API; the bare `matplotlib` package does not
# expose plot()/imshow()/show(), so aliasing it as `plt` breaks any plotting call.
import matplotlib.pyplot as plt
import requests

In [22]:

# Progress bars are optional: fall back to a pass-through when tqdm is not installed.
try:
    from tqdm import tqdm
except ImportError:
    # The stand-in takes the same (iterable, total, unit) arguments that download_list passes
    # and returns the iterable unchanged, so the download loop still runs, just without a bar.
    tqdm = lambda x, total, unit: x  # If tqdm doesn't exist, replace it with a function that does nothing
    print('**** Could not import tqdm. Please install tqdm for download progressbars! (pip install tqdm) ****')

# Python2 compatibility
# Python 2 calls the string-returning prompt raw_input. On Python 3 that name does not exist,
# the NameError is swallowed and the builtin input is left untouched.
try:
    input = raw_input
except NameError:
    pass

# Nested menu of downloadable files: dataset name -> file format -> list of URLs.
# traverse_dict walks dict levels until it reaches one of the URL lists at the leaves.
download_dict = {
    '1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)': {
        # Original MNIST-style binary files (gzip-compressed idx format).
        '1) MNIST data format (ubyte.gz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
             'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
             'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
             'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'],
        # NumPy archives; this is the variant the notebook downloads and loads with np.load.
        '2) NumPy data format (.npz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
             'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
             'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
             'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz'],
    }
}

# Download a list of files
def download_list(url_list):
    """Download every URL in ``url_list`` into the current working directory.

    Each file is saved under the last path component of its URL and streamed to
    disk in 1 KiB chunks, with a progress bar when tqdm is available.

    Args:
        url_list: iterable of URL strings.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status. The check
            runs before the output file is opened, so an error page is never
            written to disk under the dataset's name.
    """
    for url in url_list:
        path = url.split('/')[-1]
        # The context manager releases the streamed connection even if writing fails.
        with requests.get(url, stream=True) as r:
            r.raise_for_status()

            # Servers may omit Content-Length (e.g. chunked transfer); the size is then unknown.
            content_length = r.headers.get('content-length')
            if content_length is None:
                total_chunks = None
                print('Downloading {} - unknown size'.format(path))
            else:
                total_length = int(content_length)
                total_chunks = int(total_length / 1024) + 1
                print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))

            with open(path, 'wb') as f:
                for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_chunks, unit="KB"):
                    if chunk:  # skip keep-alive chunks, which arrive empty
                        f.write(chunk)
    print('All dataset files downloaded!')

def traverse_dict(d):
    """Descend through nested option dicts and download the list found at the bottom.

    At every dict level the first key (in insertion order) is taken as the
    selection. As soon as the current node is a list it is handed to
    download_list. Passing a list directly downloads it without any descent.
    """
    node = d
    # Keep following the first option until a list of downloads is reached.
    while not isinstance(node, list):
        options = list(node.keys())
        node = node[options[0]]
    download_list(node)

# The indexed value is already the list of .npz URLs, so traverse_dict hands it straight to
# download_list; the four files are written to the current working directory.
traverse_dict(download_dict['1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)']['2) NumPy data format (.npz)'])


Downloading kmnist-train-imgs.npz - 18.0 MB


100%|██████████| 17954/17954 [00:15<00:00, 1187.56KB/s]


Downloading kmnist-train-labels.npz - 0.0 MB


100%|██████████| 30/30 [00:00<00:00, 212.82KB/s]


Downloading kmnist-test-imgs.npz - 3.0 MB


100%|██████████| 3008/3008 [00:02<00:00, 1079.47KB/s]


Downloading kmnist-test-labels.npz - 0.0 MB


100%|██████████| 6/6 [00:00<00:00, 19298.94KB/s]

All dataset files downloaded!





In [23]:
# np.load on an .npz file returns a lazy NpzFile that keeps the archive open.
# Each `with` block reads the single stored array ('arr_0') into memory and then
# closes the underlying file handle instead of leaking it.
with np.load('kmnist-train-imgs.npz') as archive:
    X_train = archive['arr_0']
with np.load('kmnist-train-labels.npz') as archive:
    y_train = archive['arr_0']

with np.load('kmnist-test-imgs.npz') as archive:
    X_test = archive['arr_0']
with np.load('kmnist-test-labels.npz') as archive:
    y_test = archive['arr_0']

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [24]:
# Flatten each 28x28 image into one feature row and scale pixel values by 1/255.
# The sample count comes from the array itself rather than a hard-coded 60000/10000,
# so the cell also works on subsets. The ndim guard makes the cell safe to re-run:
# without it a second execution would divide the already-scaled data by 255 again.
if X_train.ndim == 3:
    X_train = X_train.reshape(X_train.shape[0], -1) / 255
if X_test.ndim == 3:
    X_test = X_test.reshape(X_test.shape[0], -1) / 255

print(X_train.shape)
print(X_test.shape)


(60000, 784)
(10000, 784)


### Accuracy

In [25]:
def accuracy(prediction,actual):
  """Return the percentage (0-100) of positions where prediction equals actual.

  Args:
    prediction: predicted labels (array or plain sequence).
    actual: ground-truth labels, same length as ``prediction``.

  Returns:
    The mean of the element-wise equality, multiplied by 100.
  """
  # Plain Python lists compare as whole objects under ==, giving a single bool
  # (so the result would be 0 or 100). Converting first guarantees an element-wise comparison.
  prediction = np.asarray(prediction)
  actual = np.asarray(actual)
  return np.mean(prediction == actual) * 100

### Confusion Matrix

In [26]:
def confusion_matrix(predicted,actual,n_classes):
  """Count (actual, predicted) label pairs into an n_classes x n_classes float matrix.

  Rows are indexed by the actual label and columns by the predicted label;
  both are cast to int before indexing. One pair is counted for every position
  in ``predicted``.
  """
  counts = np.zeros((n_classes, n_classes))
  for idx in range(len(predicted)):
    row = int(actual[idx])
    col = int(predicted[idx])
    counts[row, col] += 1

  return counts

### F1 Score

In [27]:
def f1_score(predicted,actual,n_classes):
  """Return the per-class F1 score as an array of length n_classes.

  The scores come from confusion_matrix, whose rows are actual labels and whose
  columns are predicted labels. F1 is evaluated as 2*tp / (2*tp + fp + fn),
  which equals 2*P*R / (P + R) whenever precision and recall are defined.

  A class with no true positives, no false positives and no false negatives
  (it never appears in either input) gets 0.0; a class with zero true positives
  also gets 0.0 rather than the NaN a 0/0 division would produce.
  """
  conf_mat = confusion_matrix(predicted,actual,n_classes)
  tp = np.diag(conf_mat)
  # Column sums count everything predicted as the class; removing tp leaves false positives.
  fp = conf_mat.sum(axis=0) - tp
  # Row sums count everything that actually is the class; removing tp leaves false negatives.
  fn = conf_mat.sum(axis=1) - tp

  denominator = 2 * tp + fp + fn
  scores = np.zeros(n_classes)
  # Entries whose denominator is zero keep their 0.0 initial value.
  np.divide(2 * tp, denominator, out=scores, where=denominator > 0)

  return scores


In [28]:
class OneVsRest:
  """One-vs-rest multiclass classifier built from binary logistic regressions.

  One weight row is kept per class. Each row is trained with batch gradient
  descent on the binary cross-entropy loss of "this class vs. all others", and
  prediction picks the class whose binary classifier outputs the highest
  probability.
  """

  def __init__(self,n_classes,n_features):
    # One row per class, laid out as [bias, w_1, ..., w_n_features], drawn uniformly from [0, 1).
    self.theta_cross = np.random.rand(n_classes, n_features + 1)
    # Label represented by each weight row; replaced by the sorted training labels in fit().
    self.classes_ = np.arange(n_classes)

  # Sigmoid function
  def sigmoid(self,z):
    """Element-wise logistic function 1 / (1 + exp(-z)).

    Written as exp(-log(1 + exp(-z))) via logaddexp so that large negative
    inputs do not overflow inside exp.
    """
    return np.exp(-np.logaddexp(0, np.negative(z)))

  # Batch gradient descent on the binary cross-entropy loss
  def gradient_descent_cross(self,X_bias, y, theta, learning_rate, n_iterations,tolrence):
    """Fit one binary logistic regression and return its weights.

    Args:
      X_bias: design matrix whose first column is the constant 1 (bias term).
      y: binary targets (0/1), one per row of X_bias.
      theta: initial weights; updated IN PLACE and also returned.
      learning_rate: step size of every update.
      n_iterations: maximum number of updates (exactly this many at most).
      tolrence: stop once the gradient norm of the latest update is not above this value.

    Returns:
      The updated weight vector (the same array object as ``theta``).
    """
    m = len(y)
    for _ in range(n_iterations):
      h = self.sigmoid(np.dot(X_bias, theta))
      gradient = np.dot(X_bias.T, (h - y)) / m
      theta -= learning_rate * gradient
      # The step is applied first and convergence checked afterwards.
      if np.linalg.norm(gradient) <= tolrence:
        break
    return theta

  def fit(self,X_train,y_train,learning_rate=0.01,n_iterations=1000,tolerance=1e-6):
    """Train one binary classifier per distinct label in y_train.

    Args:
      X_train: array of shape (n_samples, n_features).
      y_train: labels, one per sample; they need not be 0..n_classes-1.
      learning_rate, n_iterations, tolerance: forwarded to gradient_descent_cross.

    Raises:
      ValueError: if y_train contains more distinct labels than there are weight rows.
    """
    y_train = np.asarray(y_train)
    classes = np.unique(y_train)
    if len(classes) > self.theta_cross.shape[0]:
      raise ValueError(
        f"y_train has {len(classes)} distinct labels but the model was built for "
        f"{self.theta_cross.shape[0]} classes")
    # Remember which label each weight row stands for so predict() can return real labels.
    self.classes_ = classes

    # X_train and 1 is stacked
    X_bias = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

    for i,c in enumerate(classes):
      # modified labels for binary one vs rest
      new_y_train = np.where(y_train == c,1,0)

      # theta_cross[i] is a view, so the row is updated in place; the assignment keeps that explicit
      self.theta_cross[i] = self.gradient_descent_cross(
        X_bias,new_y_train,self.theta_cross[i],learning_rate,n_iterations,tolerance)

  def predict(self,X_test,y_test=None):
    """Predict labels and, when targets are given, the summed one-vs-rest loss.

    Args:
      X_test: array of shape (n_samples, n_features).
      y_test: optional true labels, one per sample.

    Returns:
      (predictions, loss). ``predictions`` holds, per sample, the label whose
      binary classifier gives the highest probability. ``loss`` is the binary
      cross-entropy summed over all classes and averaged over samples, or None
      when y_test is omitted.
    """
    # stacking X_test with 1
    new_X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

    # Only rows that correspond to a known label take part in the vote.
    theta = self.theta_cross[:len(self.classes_)]

    # probabilities of all classes for every test point, shape (n_classes, n_samples)
    predictions_probs_cross = self.sigmoid(np.dot(theta, new_X_test.T))

    # Map the winning row index back to its label.
    predictions_cross = self.classes_[np.argmax(predictions_probs_cross, axis=0)]

    if y_test is None:
      return predictions_cross,None

    # Binary target per (class, sample): True where the sample belongs to that class.
    # Using the raw multiclass label here would feed values other than 0/1 into the loss.
    targets = self.classes_[:, None] == np.asarray(y_test)[None, :]

    # Keep probabilities away from exactly 0 and 1 so the logarithms stay finite.
    eps = np.finfo(float).eps
    probs = np.clip(predictions_probs_cross, eps, 1 - eps)

    # Sum over classes and samples, then average over the testing samples
    loss_cross = -np.sum(np.where(targets, np.log(probs), np.log(1 - probs))) / X_test.shape[0]

    return predictions_cross,loss_cross



In [31]:
# Upper bounds for the slices below. Both are one past the dataset sizes printed earlier
# (60000 training / 10000 test samples), so the slices select every sample.
train_limit = 60001
test_limit = 10001
# number of classes (distinct labels in the full training set) and number of features per sample
n_classes = np.unique(y_train).shape[0]
n_features = X_train.shape[1]
# Train the one-vs-rest classifier, then predict on the test split.
# predict also returns a loss value computed from the test labels it is given.
linear_classifier = OneVsRest(n_classes,n_features)
linear_classifier.fit(X_train[:train_limit],y_train[:train_limit])
predictions_cross,loss_cross = linear_classifier.predict(X_test[:test_limit],y_test[:test_limit])

# Accuracy on the test split, in percent
print(f"Accuracy of Linear Classifier using Cross Entropy: {accuracy(predictions_cross,y_test[:test_limit])}")


# Per-class F1 scores on the test split
print("Linear Classifier f1 scores classwise using cross entropy")
print(f1_score(predictions_cross,y_test[:test_limit],int(n_classes)))

# Confusion matrix on the test split (rows: actual label, columns: predicted label)
print("CONFUSION MATRIX Linear Classifier using cross Entropy: ")
print(confusion_matrix(predictions_cross,y_test[:test_limit],int(n_classes)))


# Disabled loss reporting.
# NOTE(review): loss_cross above comes from the test split only, so the "Training" line below
# would print the test loss under the wrong label if re-enabled as written.
# #Empirical Loss for testing
# print(f"Empirical loss Testing using Cross Entropy = {loss_cross}")

# #Empirical Loss for training
# print(f"Empirical loss Training using Cross Entropy = {loss_cross}")

Accuracy of Linear Classifier using Cross Entropy: 43.580000000000005
Linear Classifier f1 scores classwise using cross entropy
[0.65625    0.43191489 0.37302726 0.50621891 0.46027172 0.25792812
 0.46091205 0.41331709 0.35542169 0.39690107]
CONFUSION MATRIX Linear Classifier using cross Entropy: 
[[651.   2.   1.  43.  84.  43.   0. 143.  15.  18.]
 [ 38. 406.  82.   7.  91.   5. 186.  41.  91.  53.]
 [  8. 103. 390.  53.  87.  12. 144. 103.  58.  42.]
 [ 22.  93.  19. 407.  71.  69.  55.  93. 114.  57.]
 [ 80.  35.  49.  20. 559.  19.  40.  94.  38.  66.]
 [ 29.  40. 273.  38.  35. 183. 189. 116.  88.   9.]
 [ 10.  31. 104.   3.  74.   1. 566. 170.  17.  24.]
 [ 21.  39.  45.   7. 159.   0.  63. 509. 119.  38.]
 [ 78.  61.  75.  27.  63.  84. 181.  39. 354.  38.]
 [ 47.  70.  53.   3. 206.   3.  32. 155.  98. 333.]]
