---
Logistic Regression on [MNIST](http://yann.lecun.com/exdb/mnist/) digits
---

train-images-idx3-ubyte.gz:  training set images (9912422 bytes)
train-labels-idx1-ubyte.gz:  training set labels (28881 bytes)
t10k-images-idx3-ubyte.gz:   test set images (1648877 bytes)
t10k-labels-idx1-ubyte.gz:   test set labels (4542 bytes)


In [2]:
import numpy as np
import matplotlib.pyplot as plt

# for confusion matrix
from sklearn import metrics
import seaborn as sns

# for loading MNIST
from struct import unpack

%matplotlib inline

In [12]:
import os

# Load downloaded dataset
data_root = '../../books1000'
def loadMNIST(imagefile, labelfile):
    #img_filename = os.path.join(data_root, imagefile)
    #lbl_filename = os.path.join(data_root, labelfile)
    images = open(imagefile, 'rb')
    labels = open(labelfile, 'rb')
    
    images.read(4)
    number_of_images = images.read(4)
    number_of_images = unpack('>I', number_of_images)[0]
    rows = images.read(4)
    rows = unpack('>I', rows)[0]
    cols = images.read(4)
    cols = unpack('>I', cols)[0]
    
    labels.read(4)
    N = labels.read(4)
    N = unpack('>I', N)[0]
    
    x = np.zeros((N, rows*cols), dtype=np.uint8)
    y = np.zeros(N, dtype=np.uint8)
    
    for i in range(N):
        for j in range(rows*cols):
            tmp_pixel = images.read(1)
            tmp_pixel = unpack('>B', tmp_pixel)[0]
            x[i][j] = tmp_pixel
        tmp_label = labels.read(1)
        y[i] = unpack('>B', tmp_label)[0]
        
    images.close()
    labels.close()
    return (x, y)

In [14]:
train_img_filename = os.path.join(data_root, 'train-images-idx3-ubyte')
train_lbl_filename = os.path.join(data_root, 'train-labels-idx1-ubyte')
train_dataset, train_labels = loadMNIST(train_img_filename,
                                        train_lbl_filename)

test_img_filename = os.path.join(data_root, 't10k-images-idx3-ubyte')
test_lbl_filename = os.path.join(data_root, 't10k-labels-idx1-ubyte')
test_dataset, test_labels = loadMNIST(test_img_filename,
                                      test_lbl_filename)

In [19]:
print(train_dataset.shape)
print(train_labels.shape)

print(test_dataset.shape)
print(test_labels.shape)

(60000, 784)
(60000,)
(10000, 784)
(10000,)


## Train a Logistic Regression model using [cross validation](https://scikit-learn.org/stable/modules/cross_validation.html). 

We will split the data into 5 sets and perform five-fold cross validation. This will help generalize the model.

In [22]:
# Import
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Instantiate
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42, verbose=1,
                           max_iter=1000, n_jobs=-1)

scores = cross_val_score(model, train_dataset, train_labels, cv=5)
print(scores)
print(scores.mean())

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [24]:
print(scores)
print(model)


[0.91378592 0.90526579 0.90975    0.90339251 0.91905635]
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=1, warm_start=False)
