## Encoding

Label Encoding

In [1]:
from sklearn.preprocessing import LabelEncoder
# Sample appliance names (some values repeat).
items = [
    'TV', '냉장고', '전자레인지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

# Fit the encoder on the list, then encode that same list into integer labels.
encoder = LabelEncoder()
labels = encoder.fit(items).transform(items)
print("인코딩 변환 값 :",labels)

인코딩 변환 값 : [0 1 4 5 3 3 2 2]


In [2]:
# classes_ holds the fitted categories; a category's position is its encoded label.
fitted_classes = encoder.classes_
print("인코딩 클래스 : ",fitted_classes)

인코딩 클래스 :  ['TV' '냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']


Decoding

In [3]:
# Map integer labels back to their original category strings.
encoded_values = [4,5,2,0,1,1,3,3]
decoded_items = encoder.inverse_transform(encoded_values)
print("디코딩 원본값 : ",decoded_items)

디코딩 원본값 :  ['전자레인지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


One-Hot Encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample appliance names (some values repeat).
items = [
    'TV', '냉장고', '전자레인지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

# Step 1: convert the strings to integer labels with LabelEncoder and
# reshape them into a single-column 2-D array.
encoder = LabelEncoder()
labels = encoder.fit_transform(items).reshape(-1,1)

# Step 2: apply one-hot encoding to the integer column.
ohe = OneHotEncoder()
oh_labels = ohe.fit_transform(labels)

# The encoded result is converted with .toarray() for printing;
# its shape is reported from the encoded object itself.
dense_labels = oh_labels.toarray()
print("원핫 인코딩 데이터 ;\n",dense_labels)
print('원핫 인코딩 데이터 차원 :',oh_labels.shape)

원핫 인코딩 데이터 ;
 [[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
원핫 인코딩 데이터 차원 : (8, 6)


get_dummies 이용

In [7]:
import pandas as pd
# Single-column frame of appliance names to one-hot encode.
df = pd.DataFrame({'item':['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']})

# get_dummies one-hot encodes the string column directly.
# dtype=int pins the 0/1 indicator values: pandas >= 2.0 defaults to bool
# (True/False), which would no longer match the 0/1 table shown for this cell.
item_dummies = pd.get_dummies(df, dtype=int)
item_dummies

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자레인지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


## 피처 스케일링, 정규화

StandardScaler

In [10]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and wrap its feature matrix in a DataFrame
# labelled with the dataset's feature names.
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

# Per-feature mean and variance of the raw (unscaled) data.
raw_means = iris_df.mean()
raw_vars = iris_df.var()
print('features 평균 : \n',raw_means)
print("\nfeatures 분산 : \n",raw_vars)

features 평균 : 
 sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

features 분산 : 
 sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [11]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the iris features and transform them in one pass.
scaler = StandardScaler()
iris_scaled = scaler.fit(iris_df).transform(iris_df)

# Put the scaled values back into a DataFrame with the original column names.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# Means come out at ~0 (floating-point noise). pandas .var() defaults to the
# sample variance (ddof=1), which is why the printed variance is slightly
# above 1 (n / (n - 1)) rather than exactly 1.
scaled_means = iris_df_scaled.mean()
scaled_vars = iris_df_scaled.var()
print("feature 평균 : \n",scaled_means)
print('\nfeature 분산 : \n',scaled_vars)

feature 평균 : 
 sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

feature 분산 : 
 sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


MinMaxScaler

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on the iris features and transform them in one pass.
scaler = MinMaxScaler()
iris_scaled = scaler.fit(iris_df).transform(iris_df)

# Put the scaled values back into a DataFrame with the original column names.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# Per-feature minimum and maximum after scaling.
scaled_min = iris_df_scaled.min()
scaled_max = iris_df_scaled.max()
print("feature 최소값 : \n",scaled_min)
print("feature 최대값 : \n",scaled_max)

feature 최소값 : 
 sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
feature 최대값 : 
 sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


test data에 fit을 적용할 때 주의 사항

In [17]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Training values 0..10 and test values 0..5, each reshaped into a
# single-column 2-D array.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)

In [None]:
# Without an explicit feature_range argument, MinMaxScaler maps values to 0-1.
scaler = MinMaxScaler()

# Learn the min/max from the training data (train_array spans 0..10).
scaler.fit(train_array)

# Transform train_array with the fitted scaler: values are scaled by 1/10.
train_scaled = scaler.transform(train_array)

# Show the original and scaled values side by side. The previous bare print()
# only emitted a blank line, so the result of the transform was never displayed.
print('원본 train_array 데이터 :', np.round(train_array.reshape(-1), 2))
print('Scale된 train_array 데이터 :', np.round(train_scaled.reshape(-1), 2))