### 데이터 전처리
#### 1. 레이블 인코딩

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
# Sample categorical values to encode. Two items appear twice, so the
# 8 entries contain only 6 distinct categories.
items = [
    'TV',
    '냉장고',
    '전자렌지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [5]:
# Create the encoder object (not yet fitted).
le = LabelEncoder()

In [6]:
# Fit: learn the set of distinct categories present in `items`.
le.fit(items)

In [8]:
# Transform: replace each item with its integer class index using the
# fitted encoder. The bare `labels` on the last line displays the result.
labels = le.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [10]:
# Shorthand: fit and transform in a single call on a fresh encoder.
le2 = LabelEncoder()
labels = le2.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [13]:
# Most compact form: the encoder is created inline and not bound to a name,
# so it cannot be reused afterwards (e.g. for inverse_transform).
labels = LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [14]:
# Decode integer labels back to the original item strings with the encoder
# that was fitted on `items`.
le.inverse_transform([2,3,5,0,1])

array(['믹서', '선풍기', '컴퓨터', 'TV', '냉장고'], dtype='<U4')

#### 2. One-hot encoding

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
# One-hot encode the integer labels.
# reshape(-1, 1) turns the 1-D label array into a single-column 2-D array
# before it is passed to the encoder; toarray() converts the encoder's
# result into a dense array for display.
ohe = OneHotEncoder()
oh_labels = ohe.fit_transform(labels.reshape(-1,1))
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [22]:
# Keras alternative: one-hot encode the integer labels directly from the
# 1-D array (the displayed result is a float32 array).
from tensorflow.keras.utils import to_categorical
to_categorical(labels)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

#### 3. 표준화
- 각 피처를 평균 0, 표준편차 1이 되도록 변환 (스케일만 바뀌며, 분포의 형태 자체가 정규분포로 바뀌는 것은 아님)

In [24]:
# Load the iris dataset; `iris.data` holds the features and `iris.target`
# the class labels used in the cells below.
from sklearn.datasets import load_iris
iris = load_iris()

In [27]:
# Standardize every feature column of the full dataset, then preview the
# first five rows.
from sklearn.preprocessing import StandardScaler
iris_std = StandardScaler().fit_transform(iris.data)
iris_std[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

- 로지스틱 회귀(Logistic Regression)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [32]:
# Baseline on the raw (unscaled) features: stratified 80/20 split with a
# fixed seed, then a default LogisticRegression.
# NOTE: with unscaled inputs this fit emits the "TOTAL NO. of ITERATIONS
# REACHED LIMIT" warning shown in the output below, which suggests raising
# max_iter or scaling the data.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2023
)
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
# Inspect the estimator's hyperparameters; the output shows the defaults in
# effect here (e.g. max_iter=100, solver='lbfgs').
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Train on standardized features WITHOUT leaking test-set statistics.
# The precomputed `iris_std` was produced by a scaler fitted on all rows
# (test rows included), so splitting it afterwards makes the held-out score
# optimistic. Instead: split the raw data first, fit the scaler on the
# training rows only, then apply that same scaler to both parts.
# The split uses the same seed and stratification as the other cells, so the
# same rows end up in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2023
)
std_scaler = StandardScaler().fit(X_train_raw)
X_train = std_scaler.transform(X_train_raw)
X_test = std_scaler.transform(X_test_raw)

lr = LogisticRegression()
lr.fit(X_train, y_train)

In [38]:
# Evaluate the fitted model on the held-out test split (20% of the data).
lr.score(X_test, y_test)

0.9666666666666667

#### 4. 정규화
- 최소값 0, 최대값 1

In [39]:
# Min-max scale every feature column of the full dataset.
from sklearn.preprocessing import MinMaxScaler
iris_mm = MinMaxScaler().fit_transform(iris.data)

In [41]:
# Sanity check: print the min and max of every scaled column; each should
# be 0.0 and 1.0. Iterating over the transposed array visits the columns
# directly, so the check no longer hard-codes the number of features.
for column in iris_mm.T:
    print(column.min(), column.max())

0.0 1.0
0.0 1.0
0.0 1.0
0.0 1.0


In [42]:
# Train on min-max scaled features WITHOUT leaking test-set statistics.
# The precomputed `iris_mm` was produced by a scaler fitted on all rows
# (test rows included). Instead: split the raw data first, learn the
# per-feature min/max from the training rows only, then apply that same
# scaler to both parts. Test values may therefore fall slightly outside
# [0, 1], which is expected for unseen data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2023
)
mm_scaler = MinMaxScaler().fit(X_train_raw)
X_train = mm_scaler.transform(X_train_raw)
X_test = mm_scaler.transform(X_test_raw)

lr = LogisticRegression()
lr.fit(X_train, y_train)

In [43]:
# Evaluate the fitted model on the held-out test split (20% of the data).
lr.score(X_test, y_test)

0.9333333333333333