In [2]:
import sklearn
import numpy as np

In [3]:
from sklearn.datasets import load_iris
datasets = load_iris()

# Iris Datasets

In [4]:
x_data = datasets["data"]
x_data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [5]:
y_data = datasets["target"]
y_data

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
y_data = y_data.reshape(-1,1)
y_data[:3]

array([[0],
       [0],
       [0]])

In [7]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(y_data)
y_data = enc.transform(y_data).toarray()
y_data[-1]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([0., 0., 1.])

In [8]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
x_data_minmax = min_max_scaler.fit_transform(x_data)
x_data_minmax[:3]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667]])

In [9]:
x_0 = np.ones(x_data_minmax.shape[0])
x_data_minmax = np.column_stack((x_0, x_data_minmax)) 


x_data_minmax[:3]

array([[1.        , 0.22222222, 0.625     , 0.06779661, 0.04166667],
       [1.        , 0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [1.        , 0.11111111, 0.5       , 0.05084746, 0.04166667]])

#### 참고. np.column_stack()
Stack 1-D arrays as columns into a 2-D array.

Take a sequence of 1-D arrays and stack them as columns to make a single 2-D array. 2-D arrays are stacked as-is, just like with hstack. 1-D arrays are turned into 2-D columns first.

In [10]:
weights = np.random.uniform(size=(3,5)) # 왜 3* 5로 weight를 만들지? 5는 알겠는데 3은 모르겠다. 3은 구분되는게 3개니까 k개 클래스가 있다. 즉 k = 3
weights

array([[0.74039972, 0.87900992, 0.39592   , 0.77698922, 0.698704  ],
       [0.01859898, 0.74324088, 0.09284932, 0.26521326, 0.9850767 ],
       [0.83815136, 0.18965369, 0.20894612, 0.5704041 , 0.16314294]])

In [11]:
import matplotlib.pyplot as plt
plt.hist(weights)

([array([1., 0., 0., 0., 0., 0., 0., 1., 1., 0.]),
  array([0., 1., 0., 0., 0., 0., 0., 1., 1., 0.]),
  array([1., 1., 0., 1., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 1., 0., 0., 1., 0., 1., 0., 0.]),
  array([0., 1., 0., 0., 0., 0., 0., 1., 0., 1.])],
 array([0.01859898, 0.11524675, 0.21189453, 0.3085423 , 0.40519007,
        0.50183784, 0.59848562, 0.69513339, 0.79178116, 0.88842893,
        0.9850767 ]),
 <a list of 5 Lists of Patches objects>)

# Softmax function

In [12]:
def softmax(z):
    e = np.exp(z)
    p = e / np.sum(np.exp(z), axis=1).reshape(-1,1)
    return p

In [13]:
y_data.shape, x_data_minmax.shape, weights.shape

((150, 3), (150, 5), (3, 5))

# Cross Entropy

In [14]:
def cross_entropy_function(y, x, weights):
    z = x_data_minmax.dot(weights.T) # W.T * X 부분 :3*5->5*3 되고 이를 dot 하면 150*5 x 5*3 = 150 * 3
    result = -np.sum(np.sum((y * np.log(softmax(z))), axis=1).reshape(-1,1))
    return result

In [15]:
cross_entropy_function(y_data, x_data_minmax, weights)

199.0380421538121

# Weights update

In [16]:
def minimize_gradient(y, x, initial_weights, iterations = 500000, alpha = 0.001):
    cost_history = []
    theta_history = []
    m = y.shape[0]
    theta = np.copy(initial_weights)
    
    number_of_classes = theta.shape[0]
    number_of_weights = theta.shape[1]

    for _ in range(iterations):
        original_theta = np.copy(theta)  
        for k in range(number_of_classes):  # theta[0] = 3
            for j in range(number_of_weights):  # theta[1] = 5
                partial_x = x[:, j]
                partial_entropy = y - softmax(x.dot(original_theta.T))
                theta[k][j] = original_theta[k][j] + (alpha*partial_entropy[:,k].dot(partial_x.T))/m
        if (_ % 10000) == 0:
            print(cross_entropy_function(y,x,theta)/m)
            cost_history.append(cross_entropy_function(y,x,theta))
    return theta, cost_history               

In [17]:
theta, cost_history = minimize_gradient(y_data, x_data_minmax, weights)

1.3266406351640767
0.7866795660773792
0.6355289091552657
0.5551441686707586
0.5033182812457716
0.46597425252306096
0.4371557093540883
0.4138809242827577
0.39446954147150737
0.377889684636821
0.36346574605342663
0.35073306532390625
0.3393594241782117
0.3290997381099756
0.31976846733128655
0.31122206086044374
0.30334736791189004
0.2960537482434219
0.28926755732622844
0.282928202797947
0.2769852679655028
0.27139637659190424
0.26612558306472267
0.2611421416010665
0.25641955329011645
0.25193481973200543
0.24766785230849048
0.24360100009176866
0.23971866918052492
0.23600701320365475
0.23245367973831113
0.22904760103896663
0.22577882016592563
0.22263834560775744
0.21961802900183916
0.21671046170387542
0.21390888683560452
0.2112071241181813
0.20859950532647323
0.20608081861302405
0.20364626027669744
0.2012913928100208
0.19901210826615173
0.19680459615259166
0.19466531519303562
0.1925909684077651
0.1905784810519442
0.18862498102412986
0.18672778141737564
0.1848843649350221


In [18]:
rand_index = np.random.randint(0,150,30)
rand_index

array([ 97,  63, 104,  67,  66,  93, 108,  37,  44,  43,  75, 148,  13,
        87,  82, 132,   4, 101, 139,  98,  62,  81, 106,  11,  58,   7,
       147,  10,   7, 100])

In [19]:
y_pred = np.argmax(softmax(x_data_minmax[rand_index].dot(theta.T)),axis=1)
y_pred

array([1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 1, 2, 0, 1, 1, 2, 0, 2, 2, 1, 1, 1,
       1, 0, 1, 0, 2, 0, 0, 2], dtype=int64)

In [20]:
y_true = np.argmax(y_data[rand_index],axis=1)
y_true

array([1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 1, 2, 0, 1, 1, 2, 0, 2, 2, 1, 1, 1,
       2, 0, 1, 0, 2, 0, 0, 2], dtype=int64)

In [21]:
y_pred == y_true

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True])

In [22]:
sum(y_pred == y_true) / len(rand_index)

0.9666666666666667