### RandomForest
- 집단 학습을 기반으로 고정밀 분류, 회귀, 클러스터링 등을 구현
- 학습 전용 데이터를 기반으로 다수의 의사결정트리를 만들고, 만들어진 의사결정트리들의 결과를 다수결로 종합하여 최종 결과를 유도하는 모델

In [1]:
import pandas as pd

In [2]:
# Load the iris dataset from CSV and preview the first rows
iris = pd.read_csv("../Data/iris.csv")
iris.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Feature와 Target 분리


In [4]:
# Features: every column from 'SepalLength' through 'PetalWidth' (label slice)
iris_data = iris.loc[:,'SepalLength':'PetalWidth']
iris_data.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Target: the 'Name' column; show the first six labels
iris_target = iris.loc[:,'Name']
iris_target[:6]

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
5    Iris-setosa
Name: Name, dtype: object

In [7]:
# Check the total number of rows in the features and in the target
print(iris_data.shape)
print(iris_target.shape)

(150, 4)
(150,)


### Train과 Test분리

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Split the iris features and labels into train and test parts, using a fixed
# random_state and stratifying on iris_target.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    iris_data, iris_target, random_state=42, stratify=iris_target
)

In [15]:
# Show the shape of each of the four split parts, one per line
print(
    train_data.shape,
    test_data.shape,
    train_target.shape,
    test_target.shape,
    sep="\n",
)

(112, 4)
(38, 4)
(112,)
(38,)


In [16]:
# Judge the sampling: compare summary statistics of the train and test features
print(train_data.describe())
print(test_data.describe())

       SepalLength  SepalWidth  PetalLength  PetalWidth
count   112.000000  112.000000   112.000000  112.000000
mean      5.877679    3.061607     3.765179    1.192857
std       0.853262    0.440770     1.782584    0.771533
min       4.300000    2.000000     1.100000    0.100000
25%       5.100000    2.800000     1.575000    0.300000
50%       5.800000    3.000000     4.300000    1.300000
75%       6.400000    3.300000     5.100000    1.825000
max       7.900000    4.400000     6.900000    2.500000
       SepalLength  SepalWidth  PetalLength  PetalWidth
count    38.000000   38.000000    38.000000   38.000000
mean      5.742105    3.031579     3.739474    1.215789
std       0.750363    0.416618     1.733071    0.747799
min       4.400000    2.300000     1.000000    0.200000
25%       5.125000    2.825000     1.700000    0.300000
50%       5.700000    3.000000     4.500000    1.400000
75%       6.375000    3.300000     5.075000    1.800000
max       7.300000    4.000000     6.300000    2

In [17]:
# Compare the label summaries (count / unique / top / freq) of the two splits
print(train_target.describe())
print(test_target.describe())

count             112
unique              3
top       Iris-setosa
freq               38
Name: Name, dtype: object
count                  38
unique                  3
top       Iris-versicolor
freq                   13
Name: Name, dtype: object


### RandomForest로 분류

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Build the model
clf = RandomForestClassifier(n_estimators=1) # n_estimators defaults to 100; a single tree is used here

In [32]:
# Train the model on the training split
clf.fit(train_data, train_target)

In [33]:
# Evaluate: score on the training split first, then on the test split
print(clf.score(train_data, train_target))
print(clf.score(test_data, test_target))

0.9732142857142857
0.9210526315789473


---
### 독버섯 관련된 데이터를 사용한 머신러닝
- 8124종류의 버섯의 특징과 독의 유무로 구성되어 있는 데이터셋
- 버섯의 특징을 기반으로 독의 유무를 판단

In [34]:
# Fetch the data
import urllib.request as req
local = "../Data/mushroom.csv"  # destination path for the downloaded file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
req.urlretrieve(url, local)  # download url and save it at local
print('OK')

OK


In [35]:
import pandas as pd

In [37]:
# Read the downloaded file; header=None means no row is used as column names,
# so the columns are labelled with integers starting at 0.
mr = pd.read_csv("../Data/mushroom.csv", header=None)
mr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


- 한줄이 버섯 한종류
- 첫번째 열 : p(독버섯), e(식용)
- 두번째 열 : 버섯의 머리 모양 => b(벨형태), c(원뿔), x(볼록), f(평평), k(혹), s(오목)
- 네번째 열 : 버섯의 머리 색깔 => n(갈색), b(황갈색), c(연한갈색), .... 
- 일단 각각의 기호가 알파벳 1자리 -> 숫자로 변환 

In [38]:
# Number of rows and columns in mr
mr.shape

(8124, 23)

In [40]:
# Practice: letter -> ASCII code
ord('x')

# ASCII code -> letter
chr(120)

'x'

In [43]:
# Practice: print every value of mr's column 1 next to its integer code
for letter in mr.iloc[:, 1]:
    print(letter, ord(letter))

x 120
x 120
b 98
x 120
x 120
x 120
b 98
b 98
x 120
b 98
x 120
x 120
b 98
x 120
x 120
s 115
f 102
x 120
x 120
x 120
b 98
x 120
b 98
b 98
b 98
f 102
x 120
x 120
f 102
x 120
b 98
x 120
x 120
x 120
b 98
x 120
s 115
x 120
x 120
b 98
b 98
x 120
x 120
x 120
x 120
x 120
x 120
x 120
x 120
f 102
x 120
x 120
b 98
x 120
x 120
b 98
f 102
b 98
x 120
x 120
s 115
b 98
b 98
b 98
b 98
f 102
x 120
f 102
x 120
x 120
f 102
b 98
f 102
x 120
b 98
f 102
x 120
f 102
x 120
f 102
x 120
x 120
f 102
x 120
x 120
x 120
b 98
x 120
f 102
s 115
x 120
b 98
x 120
x 120
x 120
x 120
f 102
x 120
b 98
x 120
x 120
b 98
f 102
x 120
b 98
x 120
x 120
b 98
b 98
x 120
x 120
s 115
x 120
x 120
x 120
x 120
s 115
x 120
x 120
s 115
x 120
x 120
f 102
f 102
x 120
x 120
b 98
f 102
x 120
b 98
b 98
b 98
f 102
x 120
f 102
x 120
f 102
x 120
x 120
b 98
x 120
b 98
s 115
f 102
x 120
x 120
f 102
x 120
b 98
b 98
x 120
x 120
x 120
s 115
x 120
x 120
b 98
x 120
b 98
b 98
b 98
b 98
x 120
f 102
x 120
f 102
b 98
b 98
x 120
b 98
x 120
b 98
x 120
b 98
f 102

In [51]:
# Split mr into labels and encoded features without walking the frame with
# iterrows(), which builds a Series object for every row.

# target: the values of column 0 as a plain list.
target = mr.loc[:, 0].tolist()

# data: one list per row holding ord() of every value in columns 1 onward,
# i.e. each single-character code is replaced by its integer code point.
data = [
    [ord(v) for v in values]
    for values in mr.loc[:, 1:].itertuples(index=False, name=None)
]

In [57]:
# Peek at the first two encoded rows
data[:2]

[[120,
  115,
  110,
  116,
  112,
  102,
  99,
  110,
  107,
  101,
  101,
  115,
  115,
  119,
  119,
  112,
  119,
  111,
  112,
  107,
  115,
  117],
 [120,
  115,
  121,
  116,
  97,
  102,
  99,
  98,
  107,
  101,
  99,
  115,
  115,
  119,
  119,
  112,
  119,
  111,
  112,
  110,
  110,
  103]]

### DataFrame 만들기

In [58]:
# Build a DataFrame from target
targetTemp = pd.DataFrame(target)
targetTemp.head()

Unnamed: 0,0
0,p
1,e
2,e
3,p
4,e


In [60]:
# Turn data into a DataFrame and add 1 to every column label so that the
# column names do not overlap with column 0 of targetTemp.
dataTemp = pd.DataFrame(data).rename(columns=lambda label: label + 1)
dataTemp.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,120,115,110,116,112,102,99,110,107,101,...,115,119,119,112,119,111,112,107,115,117
1,120,115,121,116,97,102,99,98,107,101,...,115,119,119,112,119,111,112,110,110,103
2,98,115,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,110,109
3,120,121,119,116,112,102,99,110,110,101,...,115,119,119,112,119,111,112,107,115,117
4,120,115,103,102,110,102,119,98,107,116,...,115,119,119,112,119,111,101,110,97,103


In [61]:
# Join the two DataFrames side by side (along the columns axis)
mr2 = pd.concat([targetTemp, dataTemp], axis='columns')
mr2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [63]:
# Sanity check: the rebuilt frame has the same shape as the original one
mr.shape == mr2.shape

True

In [64]:
# Inspect the dtype and non-null count of every column
mr2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   int64 
 2   2       8124 non-null   int64 
 3   3       8124 non-null   int64 
 4   4       8124 non-null   int64 
 5   5       8124 non-null   int64 
 6   6       8124 non-null   int64 
 7   7       8124 non-null   int64 
 8   8       8124 non-null   int64 
 9   9       8124 non-null   int64 
 10  10      8124 non-null   int64 
 11  11      8124 non-null   int64 
 12  12      8124 non-null   int64 
 13  13      8124 non-null   int64 
 14  14      8124 non-null   int64 
 15  15      8124 non-null   int64 
 16  16      8124 non-null   int64 
 17  17      8124 non-null   int64 
 18  18      8124 non-null   int64 
 19  19      8124 non-null   int64 
 20  20      8124 non-null   int64 
 21  21      8124 non-null   int64 
 22  22      8124 non-null   int64 

In [65]:
# Preview the first rows of mr2
mr2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


### RandomForest

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [67]:
# Separate train and test: columns 1 onward are the features, column 0 is
# the label; fixed random_state, stratified on the label column.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    mr2.loc[:, 1:], mr2.loc[:, 0], random_state=42, stratify=mr2.loc[:, 0]
)

In [68]:
# Fit a fresh classifier with default settings on the training split,
# then print its score on the training split and on the test split
clf = RandomForestClassifier()
clf.fit(train_data, train_target)
print("Training :", clf.score(train_data, train_target))
print("Test     :", clf.score(test_data, test_target))

Training : 1.0
Test     : 1.0


In [69]:
# Preview mr2 again before discussing one-hot encoding
mr2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


----
### 희소행렬(Sparse Matrix) <= One-hot Encoding
- 숫자 데이터가 숫자로서 의미가 있으면 상관없지만, 위의 데이터는 분류를 위한 데이터이므로 숫자 크기의 의미가 없다.
- 이때 사용하는 것이 One-hot Encoding 이다. 

In [None]:
# Practice: the distinct values that occur in column 1
mr2.loc[:,1].unique()

array([120,  98, 115, 102, 107,  99], dtype=int64)

In [71]:
# Preview dataTemp before it is one-hot encoded
dataTemp.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,120,115,110,116,112,102,99,110,107,101,...,115,119,119,112,119,111,112,107,115,117
1,120,115,121,116,97,102,99,98,107,101,...,115,119,119,112,119,111,112,110,110,103
2,98,115,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,110,109
3,120,121,119,116,112,102,99,110,110,101,...,115,119,119,112,119,111,112,107,115,117
4,120,115,103,102,110,102,119,98,107,116,...,115,119,119,112,119,111,101,110,97,103


In [72]:
# Trial run: one-hot encode column 1 only, naming the new columns with the
# prefix '1'. The result is only displayed; it is not assigned back to dataTemp.
pd.get_dummies(
    data=dataTemp,
    columns=[1],
    prefix='1'
)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,19,20,21,22,1_98,1_99,1_102,1_107,1_115,1_120
0,115,110,116,112,102,99,110,107,101,101,...,112,107,115,117,False,False,False,False,False,True
1,115,121,116,97,102,99,98,107,101,99,...,112,110,110,103,False,False,False,False,False,True
2,115,119,116,108,102,99,98,110,101,99,...,112,110,110,109,True,False,False,False,False,False
3,121,119,116,112,102,99,110,110,101,101,...,112,107,115,117,False,False,False,False,False,True
4,115,103,102,110,102,119,98,107,116,101,...,101,110,97,103,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,115,110,102,110,97,99,98,121,101,63,...,112,98,99,108,False,False,False,True,False,False
8120,115,110,102,110,97,99,98,121,101,63,...,112,98,118,108,False,False,False,False,False,True
8121,115,110,102,110,97,99,98,110,101,63,...,112,98,99,108,False,False,True,False,False,False
8122,121,110,102,121,102,99,110,98,116,63,...,101,119,118,108,False,False,False,True,False,False


In [73]:
# Apply it: one-hot encode every feature column (labels 1 through 22) with a
# single get_dummies call instead of rebuilding the whole frame once per column.
# The explicit prefixes keep the generated names in the "<column>_<code>" form.
onehot_cols = list(range(1, 22 + 1))
dataTemp = pd.get_dummies(
    data=dataTemp,
    columns=onehot_cols,
    prefix=[str(col) for col in onehot_cols],
)

dataTemp.head()

Unnamed: 0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,2_121,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,False,False,False,False,False,True,False,False,True,False,...,True,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
2,True,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,True,False,False,False,True,...,True,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False


In [74]:
# Build the DataFrame: put targetTemp and the one-hot encoded dataTemp
# side by side (along the columns axis)
mr3 = pd.concat(
    [targetTemp, dataTemp],
    axis='columns'
)
mr3.head()

Unnamed: 0,0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,p,False,False,False,False,False,True,False,False,True,...,True,False,False,False,False,False,False,False,True,False
1,e,False,False,False,False,False,True,False,False,True,...,False,False,False,False,True,False,False,False,False,False
2,e,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3,p,False,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
4,e,False,False,False,False,False,True,False,False,True,...,False,False,False,False,True,False,False,False,False,False


In [None]:
# Separate train and test: by position, every column after the first is a
# feature and the first column is the label; fixed random_state, stratified
# on the label column.
(train_data, test_data,
 train_target, test_target) = train_test_split(
    mr3.iloc[:, 1:], mr3.iloc[:, 0], random_state=42, stratify=mr3.iloc[:, 0]
)

In [76]:
# Fit a fresh classifier with default settings on the training split,
# then print its score on the training split and on the test split
clf = RandomForestClassifier()
clf.fit(train_data, train_target)
print("Training :", clf.score(train_data, train_target))
print("Test     :", clf.score(test_data, test_target))

Training : 1.0
Test     : 1.0
