In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml

In [2]:
# Download MNIST (70,000 flattened 28x28 digit images) from OpenML.
# The dataset version is pinned explicitly: with the default version='active'
# the result depends on which versions OpenML currently marks active, so an
# exact version keeps the notebook reproducible.
mnist = fetch_openml('mnist_784', version=1)

In [3]:
# Pull the feature table and the label column out of the fetched dataset,
# then report the container type and shape of each.
data = mnist["data"]
target = mnist["target"]

print(f"data의 type : {type(data)},  data의 shape : {data.shape}")
print(f"target의 type : {type(target)},  target의 shape : {target.shape}")

data의 type : <class 'pandas.core.frame.DataFrame'>,  data의 shape : (70000, 784)
target의 type : <class 'pandas.core.series.Series'>,  target의 shape : (70000,)


In [4]:
# Show the dtype of the label Series.
target.dtype

CategoricalDtype(categories=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], ordered=False)

In [5]:
# Re-read the features and labels from the fetched dataset
# (same assignment as in the earlier inspection cell).
data = mnist["data"]
target = mnist["target"]

In [6]:
# The labels are categorical with digit strings as categories; cast them to
# 8-bit integers and unwrap the result to a plain NumPy array.
target = target.astype(np.int8).values

# Keep only the underlying NumPy array of the feature table as well.
data = data.values

print(f"data의 type :   {type(data)},  data의 shape : {data.shape}")
print(f"target의 type : {type(target)},  target의 shape : {target.shape},   target element의 type : {type(target[0])}")

data의 type :   <class 'numpy.ndarray'>,  data의 shape : (70000, 784)
target의 type : <class 'numpy.ndarray'>,  target의 shape : (70000,),   target element의 type : <class 'numpy.int8'>


In [7]:
# Count how many samples carry each label and collect the counts in a
# {label: frequency} mapping.
label, freq = np.unique(target, return_counts=True)
target_dict = dict(zip(label, freq))

print(target_dict)

{0: 6903, 1: 7877, 2: 6990, 3: 7141, 4: 6824, 5: 6313, 6: 6876, 7: 7293, 8: 6825, 9: 6958}


In [8]:
# Rescale the pixel values by 1/255 so they stay small, which keeps the
# gradients manageable; then print the resulting maximum and minimum.
data = data / 255
print(data.max(), data.min())

1.0 0.0


In [9]:
# Zero-vs-rest labels: 1.0 where the digit is 0, 0.0 for every other digit.
# The comparison yields a boolean mask; casting it to float64 gives the same
# array as filling a float zeros array with ones at the matching positions.
target = (target == 0).astype(np.float64)

In [10]:
# Display the distinct label values together with how often each occurs.
np.unique(target, return_counts=True)

(array([0., 1.]), array([63097,  6903], dtype=int64))

In [11]:
# Split into training and test sets: the first m rows are used for training
# and the remaining rows for testing.
m = 60000
m_test = data.shape[0] - m

# Features are transposed so that every example occupies one column.
x_train = data[:m].T
x_test = data[m:].T

# Labels stay one-dimensional, sliced at the same boundary.
y_train = target[:m]
y_test = target[m:]

print(f"x_train의 shape : {x_train.shape},   y_train의 shape : {y_train.shape}")
print(f"x_test의  shape : {x_test.shape},   y_test의 shape :  {y_test.shape}")

x_train의 shape : (784, 60000),   y_train의 shape : (60000,)
x_test의  shape : (784, 10000),   y_test의 shape :  (10000,)


In [12]:
# Shuffle the training set for good measure. The global NumPy seed is fixed
# first so the permutation is the same on every run.
np.random.seed(138)
shuffle_index = np.random.permutation(m)

# Apply the same ordering to the example columns and to their labels so each
# example stays paired with its label.
x_train = x_train[:, shuffle_index]
y_train = y_train[shuffle_index]