# softmax regression 함수를 사용한 iris 프로젝트

- toc:true
- branch: master
- badges: true
- comments: true
- author: 



## 11_1_iris



`Q` iris_onehot.csv 파일 읽어서 품종 구분하는 딥러닝 모델 구축

In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn import model_selection, preprocessing

- x, y 데이터 설정

In [38]:
# 데이터 자체를 인코딩 시켜 설정하기
def make_xy_1():
  iris = pd.read_csv('/content/iris_onehot.csv')
  
  return iris.valuse[:, :-3], np.int32(iris.values[:, -3:])

# y의 문자데이터를 코딩을 이용해 0,1로 변환해 설정하기
def make_xy_onehot():
  iris = pd.read_csv('/content/iris.csv')

  x = iris.values[:, :-1]
  variety = iris.values[:, -1]

  bin = preprocessing.LabelBinarizer()
  y = bin.fit_transform(variety)
  #print(y)

  return x, y

# y데이터를 LabelBinarizer를 이용해 변환 후 x데이터 설정하기
def make_xy_onehot_2():
  iris = pd.read_csv('/content/iris.csv')
  # print(iris)

  variety = iris.variety
  print(variety)

  bin = preprocessing.LabelBinarizer()
  y = bin.fit_transform(variety)
  print(y)

  df = iris.drop(['variety'], axis = 1)
  print(df)

  x = df.values

  return x, y

- softmax 모델 구축


In [46]:
def softmax_regression_iris():
  def Dense(x, w, b):
    # (150, 3) = (150, 4) @ (4, 3)
    return x @ w + b

  x, y = make_xy_onehot()
  print(x.shape, y.shape)

# shuffle 해주기 !
  indices = np.arange(len(x))
  np.random.shuffle(indices)
  x = x[indices]
  y = y[indices]

  x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, train_size = 0.7)

  w = tf.Variable(tf.random.uniform([4, 3]))   
  b = tf.Variable(tf.random.uniform([3]))
 
  optimizer = tf.keras.optimizers.SGD(0.01)

  for i in range(100):
    with tf.GradientTape() as tape:
      z = Dense(x_train, w, b)
      hx = softmax(z)

      cce = keras.losses.CategoricalCrossentropy()
      loss = cce(y_train, hx)

    gradient1 = tape.gradient(loss, [w, b])
    optimizer.apply_gradients(zip(gradient1, [w, b]))

    print('loss', i,':', loss.numpy())

# 예측하기

    z = Dense(x_test, w, b)
    p = keras.activations.softmax(z)

# y와 predict 비교하기

    print(y_test[:5])
    print(p.numpy()[:5])

    p_arg = np.argmax(p.numpy(), axis=1)    # argmax 가장 큰 값의 위치를 찾음. 
    print(p_arg)
    y_arg = np.argmax(y_test, axis=1)
    print(y_arg)

    print('acc :', np.mean(p_arg == y_arg))

softmax_regression_iris()

(150, 4) (150, 3)
loss 0 : 2.5280607
[[0 1 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [0 0 1]]
[[0.16958746 0.8050397  0.02537283]
 [0.1672936  0.8144725  0.01823393]
 [0.21000955 0.7747021  0.01528837]
 [0.19635046 0.78429633 0.01935316]
 [0.21480352 0.7684872  0.01670929]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
[1 1 2 1 2 2 2 0 2 1 1 0 2 0 0 0 0 2 1 0 2 2 1 0 2 2 2 0 2 2 0 0 0 1 0 0 1
 2 1 2 0 1 1 0 0]
acc : 0.26666666666666666
loss 1 : 2.2871163
[[0 1 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [0 0 1]]
[[0.21140496 0.75096595 0.03762908]
 [0.21609282 0.75518525 0.02872196]
 [0.2745011  0.700351   0.02514793]
 [0.2518183  0.71797955 0.03020217]
 [0.2778974  0.69507647 0.02702617]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
[1 1 2 1 2 2 2 0 2 1 1 0 2 0 0 0 0 2 1 0 2 2 1 0 2 2 2 0 2 2 0 0 0 1 0 0 1
 2 1 2 0 1 1 0 0]
acc : 0.26666666666666666
loss 2 : 2.0829644
[[0 1 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [0 0 1]]
[

In [None]:
'''
머신러신 중요한 세가지 
1. 정규화 표준화 하기 
2. 셔플하기
3. 문자데이터 숫자로 인코딩 preprocessing
'''

## 11_2_iris_sparse

In [47]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn import model_selection, preprocessing

- x, y 데이터 설정 (sparse버전)

In [48]:
def make_xy_sparse():
    iris = pd.read_csv('/content/iris.csv')
    #print(iris)

    variety = iris.variety
    print(variety)

    enc = preprocessing.LabelEncoder()        # binarizer 대신 encoder 사용
    y = enc.fit_transform(variety)
    print(y)

    df = iris.drop(['variety'], axis = 1)
    print(df)

    x = df.values

    return x, y

- softmax 모델 구축 (sparse버전)

In [52]:
def softmax_regression_iris():
    def Dense(x, w, b):
        # (150, 3) = (150, 4) @ (4, 3)
        return x @ w + b

    x, y = make_xy_sparse()
    print(x.shape,y.shape) # (150, 4) (150,)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, train_size = 0.7)

    w = tf.Variable(tf.random.uniform([4, 3]))
    b = tf.Variable(tf.random.uniform([3]))

    optimizer = tf.keras.optimizers.SGD(0.01)

    for i in range(100):
        with tf.GradientTape() as tape:
            z = Dense(x_train, w, b)
            hx = keras.activations.softmax(z)

            scce = keras.losses.SparseCategoricalCrossentropy()
            loss = scce(y_train, hx)

        gradient = tape.gradient(loss, [w, b])
        optimizer.apply_gradients(zip(gradient, [w, b]))

        print(i, loss.numpy())

## 예측하기

    z = Dense(x_test, w, b)
    p = keras.activations.softmax(z)
    print(p.numpy())
    print(p.numpy().shape)              # (45, 3)

## y와 predict 비교하기 

    print(y_test[:5])
    print(p.numpy()[:5])

    p_arg = np.argmax(p.numpy(), axis=1)    # argmax 가장 큰 값의 위치를 찾음. 
    print(p_arg)

    print('acc :', np.mean(p_arg == y_test))
    ## 한번 데이터를 섞어야 (셔플) 값이 나온다. 


softmax_regression_iris()

0         Setosa
1         Setosa
2         Setosa
3         Setosa
4         Setosa
         ...    
145    Virginica
146    Virginica
147    Virginica
148    Virginica
149    Virginica
Name: variety, Length: 150, dtype: object
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
     sepal.length  sepal.width  petal.length  petal.width
0             5.1          3.5           1.4          0.2
1             4.9          3.0           1.4          0.2
2             4.7          3.2           1.3          0.2
3             4.6          3.1           1.5          0.2
4             5.0          3.6           1.4          0.2
..            ...          ...           ...          ...
145           6.7          3.0           5.2          2.3
1