In [124]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
import sklearn


In [8]:
iris = load_iris(as_frame=False)

In [9]:
dt_clf = DecisionTreeClassifier(random_state=156)
score = cross_val_score(dt_clf, iris.data, iris.target, scoring='accuracy', cv=3)
print('교차 검증별 정확도: {}'.format(score))
print('평균 교차 검증 정확도: {}'.format(np.mean(score)))

교차 검증별 정확도: [0.98 0.94 0.98]
평균 교차 검증 정확도: 0.9666666666666667


In [30]:
grid_param = {
    'max_depth': [1, 2, 3],
    'min_samples_split': [2, 3],
}
dtree = DecisionTreeClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, 
    iris.target, test_size=0.2, shuffle=True, random_state=121)

grid_dtree = GridSearchCV(dtree, grid_param, cv=3, refit=True)
grid_dtree.fit(X_train, Y_train)
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df = scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

grid_dtree.best_score_
grid_dtree.best_index_
pred = grid_dtree.best_estimator_.predict(X_test)
print('best esimator acc: {}'.format(accuracy_score(Y_test, pred)))


best esimator acc: 0.9666666666666667


In [None]:
## 레이블 인코딩

In [44]:

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

In [42]:
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
encoder.inverse_transform(labels)

array(['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'], dtype='<U5')

In [None]:
## one hot encoding

In [120]:
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
labels = labels.reshape(-1, 1)   # -1: 원래 길이, 8과 같은 의미 len()
items = np.array(items).reshape(-1, 1)

one_hot_encoder = OneHotEncoder(sparse=True, dtype=np.int64)

# one_hot_encoder_labels = one_hot_encoder.fit_transform(labels)
one_hot_encoder_labels = one_hot_encoder.fit_transform(items)
pd.DataFrame(one_hot_encoder_labels.toarray(), columns=one_hot_encoder.get_feature_names_out())


  y = column_or_1d(y, warn=True)


Unnamed: 0,x0_TV,x0_냉장고,x0_믹서,x0_선풍기,x0_전자레인지,x0_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


In [113]:
df = pd.DataFrame({'items': ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']})
pd.get_dummies(df)

Unnamed: 0,items_TV,items_냉장고,items_믹서,items_선풍기,items_전자레인지,items_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


In [None]:
## 스케일링
# 표준화(Standardization): 평균0, 분산1 가우시안 정규분포, 값에서 평균을 빼고 표준편차로 나눔
# 정규화(Normalization): 최대1, 최소0 사이의 값, 값에서 최소값을 빼고 최대-최소로 나눔

In [122]:
# 표준화

In [126]:
iris = load_iris()

In [174]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# df.mean(), df.var() # 평균, 분산
# df = (df - df.mean()) / df.std()
# df

scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(iris_scaled, columns=map(lambda x: x[:-4] + '(scale)', iris.feature_names))
df_scaled

Unnamed: 0,sepal length (scale),sepal width (scale),petal length (scale),petal width (scale)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [176]:
df = pd.DataFrame(iris.data, columns=iris.feature_names)

scaler = MinMaxScaler()
# scaler.fit(df)
iris_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(iris_scaled, columns=map(lambda x: x[:-4] + '(scale)', iris.feature_names))
df_scaled

Unnamed: 0,sepal length (scale),sepal width (scale),petal length (scale),petal width (scale)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [189]:
train_array = np.arange(0, 11).reshape(-1, 1)
test_array = np.arange(0, 6).reshape(-1, 1)

scaler = MinMaxScaler()

scaler.fit(train_array)
train_scaled = scaler.transform(train_array)
print(train_array.reshape(-1))
print(train_scaled.reshape(-1))

# scaler.fit(test_array) # 기존 데이터를 스케일한 비율 그대로 test 또한 스케일해주어야 함, 기준이 변경되면 안됨
test_scaled = scaler.transform(test_array)
print(test_array.reshape(-1))
print(test_scaled.reshape(-1))

scaler.fit(test_array)
test_scaled = scaler.transform(test_array)
print(test_array.reshape(-1))
print(test_scaled.reshape(-1))



[ 0  1  2  3  4  5  6  7  8  9 10]
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
[0 1 2 3 4 5]
[0.  0.1 0.2 0.3 0.4 0.5]
[0 1 2 3 4 5]
[0.  0.2 0.4 0.6 0.8 1. ]


In [210]:
temp = pd.read_csv('./kaggle/titanic/train.csv')
temp = temp.drop(['PassengerId', 'Ticket', 'Cabin', 'Name', 'Embarked'], axis=1)
temp

# X_train = temp[temp.columns.difference(['Survived'])]
# Y_train = temp[['Survived']]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.2500
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.9250
3,1,1,female,35.0,1,0,53.1000
4,0,3,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000
887,1,1,female,19.0,0,0,30.0000
888,0,3,female,,1,2,23.4500
889,1,1,male,26.0,0,0,30.0000
