# 머신러닝 기본

In [1]:
import sklearn

In [2]:
sklearn.__version__

'0.24.2'

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split #데이터분리
import pandas as pd

In [4]:
iris = load_iris()

In [5]:
type(iris) # sklearn.utils.Bunch -> dict형태처럼 사용가능

sklearn.utils.Bunch

In [6]:
iris['data']
iris.keys()
iris['target']
iris['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [8]:
iris_df['label'] = iris.target

In [9]:
iris_df.sample(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
124,6.7,3.3,5.7,2.1,2
145,6.7,3.0,5.2,2.3,2


In [10]:
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=11)

In [11]:
X_train.shape, X_test.shape

((120, 4), (30, 4))

In [12]:
y_train.shape, y_test.shape

((120,), (30,))

In [13]:
dt_clf = DecisionTreeClassifier(random_state=11)

In [14]:
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=11)

In [15]:
pred = dt_clf.predict(X_test)  # 예측,평가할때 사용함

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
accuracy_score(y_test, pred) #정답, 예측값
accuracy_score(y_test, pred)*100 #백분율로 퍼센트 나옴

93.33333333333333

## 데이터 분리 없이

In [18]:
iris = load_iris()
dt_clf = DecisionTreeClassifier()
train_data = iris.data
train_label = iris.target
dt_clf.fit(train_data, train_label)

DecisionTreeClassifier()

In [19]:
pred = dt_clf.predict(train_data)

In [20]:
accuracy_score(train_label, pred)

1.0

## K-Fold

In [21]:
from sklearn.model_selection import KFold
import numpy as np

In [22]:
iris = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

In [23]:
kfold = KFold(n_splits=5,shuffle=True) #K값 = 5
cv_accuracy = [] #값을 저장할 리스트

In [24]:
n_iter = 0
for train_index, test_index in kfold.split(iris.data):
    # print(train_index, test_index)
    X_train, X_test = iris.data[train_index],iris.data[test_index]
    # print(X_train, X_test)
    y_train, y_test = iris.target[train_index],iris.target[test_index]
    # print(y_train, y_test)
    dt_clf.fit(X_train, y_train) #학습시키기
    pred = dt_clf.predict(X_test) #예측값받기
    n_iter += 1
    accuracy = np.round(accuracy_score(y_test, pred), 4) #소수점 4자리 반올림(round)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print(f'\n{n_iter}번째 교차검증정확도 : {accuracy}, 학습데이터크기 : {train_size}, 검증데이터크기 : {test_size}')
    print(f'\n{n_iter} 검증데이터 인덱스 : {test_index}')
    cv_accuracy.append(accuracy)


1번째 교차검증정확도 : 0.9333, 학습데이터크기 : 120, 검증데이터크기 : 30

1 검증데이터 인덱스 : [ 12  13  14  15  19  21  23  33  35  44  50  63  65  71  74  77  81  83
  84  96 104 108 111 121 127 128 136 141 142 149]

2번째 교차검증정확도 : 0.9, 학습데이터크기 : 120, 검증데이터크기 : 30

2 검증데이터 인덱스 : [  4   8  31  32  38  42  43  45  56  57  59  62  64  70  73  80  87  93
  95  97 102 105 106 120 129 133 135 139 144 146]

3번째 교차검증정확도 : 1.0, 학습데이터크기 : 120, 검증데이터크기 : 30

3 검증데이터 인덱스 : [  1   7   9  18  26  37  39  41  47  48  51  60  69  72  76  79  82  86
  89  94  99 103 109 110 114 125 126 130 134 137]

4번째 교차검증정확도 : 0.9667, 학습데이터크기 : 120, 검증데이터크기 : 30

4 검증데이터 인덱스 : [  0   6  10  17  20  22  28  29  30  34  36  40  46  55  58  61  67  75
  78  88  90  98 101 112 113 119 123 131 138 147]

5번째 교차검증정확도 : 1.0, 학습데이터크기 : 120, 검증데이터크기 : 30

5 검증데이터 인덱스 : [  2   3   5  11  16  24  25  27  49  52  53  54  66  68  85  91  92 100
 107 115 116 117 118 122 124 132 140 143 145 148]


In [25]:
np.mean(cv_accuracy)

0.96

### statified K-Fold

In [26]:
iris_df

iris_df['label'].value_counts()

from sklearn.model_selection import StratifiedKFold

dkf = StratifiedKFold(n_splits=3)
n_iter = 0
for train_index, test_index in dkf.split(iris_df, iris_df['label']):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index] #iloc->인덱스로 선택
    label_test = iris_df['label'].iloc[test_index]
    print(n_iter)
    print('학습\n', label_train.value_counts())
    print('검증\n', label_test.value_counts())
    print('-----------------------------------------')



1
학습
 2    34
0    33
1    33
Name: label, dtype: int64
검증
 0    17
1    17
2    16
Name: label, dtype: int64
-----------------------------------------
2
학습
 1    34
0    33
2    33
Name: label, dtype: int64
검증
 0    17
2    17
1    16
Name: label, dtype: int64
-----------------------------------------
3
학습
 0    34
1    33
2    33
Name: label, dtype: int64
검증
 1    17
2    17
0    16
Name: label, dtype: int64
-----------------------------------------


In [27]:
iris = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

skfold = StratifiedKFold(n_splits=3,shuffle=True) 
cv_accuracy = []
n_iter = 0
for train_index, test_index in skfold.split(iris.data,iris.target):
    X_train, X_test = iris.data[train_index],iris.data[test_index]
    y_train, y_test = iris.target[train_index],iris.target[test_index]
    dt_clf.fit(X_train, y_train) 
    pred = dt_clf.predict(X_test) 
    n_iter += 1
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    print(f'\n{n_iter}번째 교차검증정확도 : {accuracy}, 학습데이터크기 : {train_size}, 검증데이터크기 : {test_size}')
    print(f'\n{n_iter} 검증데이터 인덱스 : {test_index}')
    cv_accuracy.append(accuracy)


1번째 교차검증정확도 : 0.98, 학습데이터크기 : 100, 검증데이터크기 : 50

1 검증데이터 인덱스 : [  0   5   7  11  13  17  18  22  23  24  27  29  30  35  38  44  46  54
  58  59  62  63  66  68  69  73  74  76  78  79  85  89  90  95 101 109
 112 114 116 117 120 121 123 127 129 130 131 134 136 139]

2번째 교차검증정확도 : 0.9, 학습데이터크기 : 100, 검증데이터크기 : 50

2 검증데이터 인덱스 : [  2   3   4   9  10  14  19  20  21  26  28  33  34  37  39  42  47  51
  52  53  55  56  61  67  71  75  77  82  83  86  91  92  97 104 107 108
 113 115 119 126 128 132 133 135 137 138 141 142 144 145]

3번째 교차검증정확도 : 0.96, 학습데이터크기 : 100, 검증데이터크기 : 50

3 검증데이터 인덱스 : [  1   6   8  12  15  16  25  31  32  36  40  41  43  45  48  49  50  57
  60  64  65  70  72  80  81  84  87  88  93  94  96  98  99 100 102 103
 105 106 110 111 118 122 124 125 140 143 146 147 148 149]


### cross_val_score

In [28]:
from sklearn.model_selection import cross_val_score

In [29]:
iris = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

In [30]:
cross_val_score(dt_clf, iris.data, iris.target, scoring='accuracy', cv=3) #accuracy로 평가, cv->3등분 #cv변경가능

array([0.98, 0.94, 0.98])

In [31]:
grid_parameters = {'max_depth':[1,2,3],
                   'min_samples_split':[2,3]}

In [32]:
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=121)
dt_clf = DecisionTreeClassifier()

In [33]:
from sklearn.model_selection import GridSearchCV
grid_dtree = GridSearchCV(dt_clf,
                          param_grid=grid_parameters,
                          cv=3,
                          refit=True)

In [34]:
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]})

In [35]:
grid_dtree.cv_results_

{'mean_fit_time': array([0.00033236, 0.00025717, 0.00066479, 0.00033243, 0.00033236,
        0.00033236]),
 'std_fit_time': array([0.00047002, 0.0003637 , 0.00047008, 0.00047013, 0.00047002,
        0.00047002]),
 'mean_score_time': array([0.        , 0.00033243, 0.        , 0.        , 0.00033251,
        0.00033259]),
 'std_score_time': array([0.        , 0.00047013, 0.        , 0.        , 0.00047025,
        0.00047036]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_split': 3},
  {'max_depth': 3, 'min_sample

In [36]:
grid_dtree.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [37]:
score_df = pd.DataFrame(grid_dtree.cv_results_)
score_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000332,0.00047,0.0,0.0,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.7,0.7,0.7,0.7,1.110223e-16,5
1,0.000257,0.000364,0.000332,0.00047,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.7,0.7,0.7,0.7,1.110223e-16,5
2,0.000665,0.00047,0.0,0.0,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3
3,0.000332,0.00047,0.0,0.0,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3
4,0.000332,0.00047,0.000333,0.00047,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1.0,0.95,0.975,0.02041241,1
5,0.000332,0.00047,0.000333,0.00047,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1.0,0.95,0.975,0.02041241,1


In [38]:
score_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_min_samples_split', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

In [39]:
score_df[['params','mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


## 데이터 전처리

### 레이블 인코딩

In [40]:
from sklearn.preprocessing import LabelEncoder

In [41]:
items = ['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']
encoder = LabelEncoder()
encoder.fit(items) 
labels = encoder.transform(items) #transform ->사전(dict)처리
labels #유일값을 뽑아서 일련번호 부여함

array([0, 1, 4, 5, 3, 3, 2, 2])

In [42]:
encoder.classes_ #->transform한것/ 순서대로 인덱스 0부터 부여됨

array(['TV', '냉장고', '믹서', '선풍기', '전자레인지', '컴퓨터'], dtype='<U5')

In [43]:
encoder.inverse_transform([0,2,4,5]) #숫자로 변경된것을 문자로 변경시켜줌/ 없는 숫자입력하면 에러

array(['TV', '믹서', '전자레인지', '컴퓨터'], dtype='<U5')

In [44]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
encoder = LabelEncoder()
# encoder.fit(items)
# labels = encoder.transform(items)
labels = encoder.fit_transform(items) #위에 두 과정을 하나로 줄여줌(받는 items가 같을때 가능) 
labels = labels.reshape(-1,1) # 모양이 다르면 에러가 나므로 reshape해줌

In [46]:
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)

In [47]:
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [48]:
oh_labels.shape

(8, 6)

In [49]:
import pandas as pd

In [50]:
df = pd.DataFrame({'item':['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']})
df

Unnamed: 0,item
0,TV
1,냉장고
2,전자레인지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [51]:
pd.get_dummies(df) #get_dummies를 사용하면 한번에 문자를 숫자로 변경시켜줌

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자레인지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


In [52]:
iris = load_iris()
iris_df = pd.DataFrame(data=iris,columns=iris.feature_names)

In [53]:
iris_df.mean() #평균

sepal length (cm)    NaN
sepal width (cm)     NaN
petal length (cm)    NaN
petal width (cm)     NaN
dtype: object

In [54]:
iris_df.var() #분산

sepal length (cm)    NaN
sepal width (cm)     NaN
petal length (cm)    NaN
petal width (cm)     NaN
dtype: object

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
scaler = StandardScaler()

In [57]:
iris_scaled = scaler.fit_transform(iris_df)

ValueError: Found array with 0 sample(s) (shape=(0, 4)) while a minimum of 1 is required by StandardScaler.

In [None]:
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)

In [None]:
iris_df_scaled.mean()

In [None]:
iris_df_scaled.var()

In [58]:
from sklearn.preprocessing import MinMaxScaler

In [59]:
scaler = MinMaxScaler()

In [60]:
iris_scaled = scaler.fit_transform(iris_df)

ValueError: Found array with 0 sample(s) (shape=(0, 4)) while a minimum of 1 is required by MinMaxScaler.

In [None]:
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)

In [61]:
iris_df.min(), iris_df.max()

(sepal length (cm)    NaN
 sepal width (cm)     NaN
 petal length (cm)    NaN
 petal width (cm)     NaN
 dtype: object,
 sepal length (cm)    NaN
 sepal width (cm)     NaN
 petal length (cm)    NaN
 petal width (cm)     NaN
 dtype: object)

In [None]:
iris_df_scaled.min(), iris_df_scaled.max()

In [62]:
train_array = np.arange(11).reshape(-1,1)
test_array = np.arange(6).reshape(-1,1)

In [64]:
train_array
test_array

array([[0],
       [1],
       [2],
       [3],
       [4],
       [5]])

In [65]:
scaler = MinMaxScaler()

In [66]:
scaler.fit(train_array)

MinMaxScaler()

In [67]:
scaler.transform(train_array)

array([[0. ],
       [0.1],
       [0.2],
       [0.3],
       [0.4],
       [0.5],
       [0.6],
       [0.7],
       [0.8],
       [0.9],
       [1. ]])

In [68]:
scaler.transform(test_array)

array([[0. ],
       [0.1],
       [0.2],
       [0.3],
       [0.4],
       [0.5]])

In [69]:
scaler.fit(test_array)

MinMaxScaler()

In [71]:
scaler.transform(test_array)

array([[0. ],
       [0.2],
       [0.4],
       [0.6],
       [0.8],
       [1. ]])