# Categorical Dataset 분석

In [1]:
# pandas is the Python library used here to inspect a dataset's characteristics
# and to manipulate the dataset in various ways.
import pandas as pd

In [2]:
# Relative path of the car dataset CSV file.
car_path='./car.csv'

In [3]:
# read_csv loads a CSV file from the given path into a DataFrame.
car = pd.read_csv(car_path)

In [4]:
# Display the loaded DataFrame.
car

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [5]:
# The describe() method gives a brief summary of the dataset's characteristics
# (count, number of unique values, most frequent value and its frequency).
car.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,high,high,3,4,small,low,unacc
freq,432,432,432,576,576,576,1210


In [6]:
# The dtypes attribute shows the data type of each feature (column).
car.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

In [7]:
# Split the car dataset into inputs and target.
# Inputs: select the six feature columns explicitly by name.
car_x = car[['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']]

In [8]:
# Alternative way to build the inputs: drop the target column 'class'
# and keep every remaining column.
car_x = car.drop(columns='class')

In [9]:
# Target: the 'class' column.
car_y = car['class']

# Scikit-learn 모듈을 통한 모델 학습

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Decision tree that uses entropy as its split criterion.
# The criterion is passed by keyword: recent scikit-learn releases make
# constructor parameters keyword-only, so the positional form is rejected there.
decision = DecisionTreeClassifier(criterion='entropy')

In [12]:
# The car dataset is categorical, so it has to be encoded into a numeric
# representation before training. This call is made on the raw string columns
# and therefore fails with the ValueError shown below.
decision.fit(car_x, car_y)

ValueError: could not convert string to float: 'vhigh'

In [13]:
# pandas' get_dummies function applies one-hot (dummy) encoding to the dataset.
# car_x is replaced by its encoded version.
car_x = pd.get_dummies(car_x)

In [14]:
# Display the one-hot encoded inputs.
car_x

Unnamed: 0,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,doors_3,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
1,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,0,1
2,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,0,1,1,0,0
3,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
4,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,1
5,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,1,0,0
6,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,1,0,0,0,1,0
7,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,1,0,0,0,0,1
8,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
9,0,0,0,1,0,0,0,1,1,0,...,0,0,1,0,0,0,1,0,1,0


In [15]:
# fit trains the model on the dataset (now using the one-hot encoded inputs).
decision.fit(car_x, car_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [16]:
# score reports the accuracy, which shows whether training worked.
# It is evaluated on the same data used for fitting, so this is training accuracy.
decision.score(car_x, car_y)

1.0

In [17]:
# Class labels known to the fitted tree.
decision.classes_

array(['acc', 'good', 'unacc', 'vgood'], dtype=object)

In [18]:
# The generated tree can be visualized with export_graphviz or plot_tree.
# For detailed usage, refer to the documentation available online.
from sklearn.tree import export_graphviz, plot_tree

In [19]:
# Print the fitted tree as Graphviz DOT text.
print(export_graphviz(decision))

digraph Tree {
node [shape=box] ;
0 [label="X[12] <= 0.5\nentropy = 1.206\nsamples = 1728\nvalue = [384, 69, 1210, 65]"] ;
1 [label="X[19] <= 0.5\nentropy = 1.48\nsamples = 1152\nvalue = [384, 69, 634, 65]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="X[7] <= 0.5\nentropy = 1.641\nsamples = 768\nvalue = [384, 69, 250, 65]"] ;
1 -> 2 ;
3 [label="X[1] <= 0.5\nentropy = 1.686\nsamples = 576\nvalue = [312, 69, 130, 65]"] ;
2 -> 3 ;
4 [label="X[2] <= 0.5\nentropy = 1.429\nsamples = 432\nvalue = [259, 23, 124, 26]"] ;
3 -> 4 ;
5 [label="X[17] <= 0.5\nentropy = 0.954\nsamples = 288\nvalue = [180, 0, 108, 0]"] ;
4 -> 5 ;
6 [label="X[4] <= 0.5\nentropy = 0.803\nsamples = 192\nvalue = [145, 0, 47, 0]"] ;
5 -> 6 ;
7 [label="X[16] <= 0.5\nentropy = 0.449\nsamples = 128\nvalue = [116, 0, 12, 0]"] ;
6 -> 7 ;
8 [label="entropy = 0.0\nsamples = 64\nvalue = [64, 0, 0, 0]"] ;
7 -> 8 ;
9 [label="X[20] <= 0.5\nentropy = 0.696\nsamples = 64\nvalue = [52, 0, 12, 0]"] ;
7 -> 9 ;

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
# Logistic regression classifier created with default arguments.
lr = LogisticRegression()

In [22]:
# Train the logistic regression on the one-hot encoded car inputs.
lr.fit(car_x, car_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# coef_ shows the weight the regression model assigns to each feature.
# A positive weight pushes towards the corresponding class, a negative weight pushes away from it.
# The larger the absolute value, the stronger the influence.
lr.coef_

array([[-0.0409255 , -0.43687438,  0.10557239, -0.80419097, -0.10308289,
        -0.37419011,  0.10621792, -0.80536338, -0.61488184, -0.22942697,
        -0.16605483, -0.16605483, -3.5918722 ,  1.27357782,  1.14187592,
        -0.13650159, -0.27898241, -0.76093446,  1.33742505, -3.58834165,
         1.07449814],
       [-2.09466093,  1.48858237,  0.51860272, -2.09466093, -2.09466093,
         1.48858237,  0.51860272, -2.09466093, -0.71861261, -0.48784139,
        -0.48784139, -0.48784139, -2.69955334,  0.32021693,  0.19719963,
        -0.66931042, -0.66931042, -0.84351594,  0.07047747, -2.69213047,
         0.43951622],
       [ 1.24550601, -1.39558575, -0.93649209,  2.49116483,  0.8468249 ,
        -0.97408097, -0.97408097,  2.50593004,  1.27407489,  0.25699234,
        -0.06323711, -0.06323711,  5.71095435, -2.26012289, -2.04623846,
        -0.58931501,  0.1552569 ,  1.83865111, -2.91797005,  5.54546459,
        -1.22290153],
       [-2.37569719,  1.43550501,  0.66871487, -2.37569719

In [24]:
# Class labels known to the fitted logistic regression.
lr.classes_

array(['acc', 'good', 'unacc', 'vgood'], dtype=object)

In [25]:
# Data types of the one-hot encoded input columns.
car_x.dtypes

buying_high       uint8
buying_low        uint8
buying_med        uint8
buying_vhigh      uint8
maint_high        uint8
maint_low         uint8
maint_med         uint8
maint_vhigh       uint8
doors_2           uint8
doors_3           uint8
doors_4           uint8
doors_5more       uint8
persons_2         uint8
persons_4         uint8
persons_more      uint8
lug_boot_big      uint8
lug_boot_med      uint8
lug_boot_small    uint8
safety_high       uint8
safety_low        uint8
safety_med        uint8
dtype: object

In [26]:
# Accuracy on the same data used for fitting (training accuracy).
lr.score(car_x, car_y)

0.8912037037037037

In [27]:
from sklearn.neural_network import MLPClassifier

In [28]:
# For MLPClassifier the number of nodes in the hidden layer(s) is configured
# through the first argument; here a single hidden layer with 20 nodes.
mlp = MLPClassifier((20,),activation='logistic')

In [29]:
# Train the MLP on the one-hot encoded car inputs.
mlp.fit(car_x, car_y)



MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [30]:
# Accuracy on the same data used for fitting (training accuracy).
mlp.score(car_x, car_y)

0.9184027777777778

# Numeric Dataset 분석

In [31]:
# Relative path of the red wine quality CSV file.
wine_path = './winequality-red.csv'

In [32]:
# Load the wine dataset into a DataFrame.
wine = pd.read_csv(wine_path)

In [33]:
# Summary statistics of the wine columns.
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [34]:
# Split the wine dataset: every column except 'quality' is an input,
# and 'quality' is the target.
wine_x = wine.drop(columns='quality')
wine_y = wine['quality']

In [35]:
# Entropy-based decision tree for the wine dataset.
# The criterion is passed by keyword: recent scikit-learn releases make
# constructor parameters keyword-only, so the positional form is rejected there.
decision_wine = DecisionTreeClassifier(criterion='entropy')

In [36]:
# Train the tree on the wine inputs and quality labels.
decision_wine.fit(wine_x, wine_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [37]:
# Class labels (quality values) known to the fitted tree.
decision_wine.classes_

array([3, 4, 5, 6, 7, 8], dtype=int64)

In [38]:
# Print the fitted wine tree as Graphviz DOT text.
print(export_graphviz(decision_wine))

digraph Tree {
node [shape=box] ;
0 [label="X[10] <= 10.525\nentropy = 1.709\nsamples = 1599\nvalue = [10, 53, 681, 638, 199, 18]"] ;
1 [label="X[9] <= 0.555\nentropy = 1.396\nsamples = 983\nvalue = [7, 34, 575, 328, 37, 2]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="X[1] <= 0.752\nentropy = 1.117\nsamples = 320\nvalue = [3, 19, 235, 62, 1, 0]"] ;
1 -> 2 ;
3 [label="X[10] <= 9.65\nentropy = 0.986\nsamples = 267\nvalue = [0, 10, 201, 55, 1, 0]"] ;
2 -> 3 ;
4 [label="X[8] <= 3.215\nentropy = 0.801\nsamples = 158\nvalue = [0, 7, 131, 20, 0, 0]"] ;
3 -> 4 ;
5 [label="entropy = 0.0\nsamples = 45\nvalue = [0, 0, 45, 0, 0, 0]"] ;
4 -> 5 ;
6 [label="X[10] <= 9.35\nentropy = 0.991\nsamples = 113\nvalue = [0, 7, 86, 20, 0, 0]"] ;
4 -> 6 ;
7 [label="X[6] <= 48.0\nentropy = 1.179\nsamples = 29\nvalue = [0, 6, 20, 3, 0, 0]"] ;
6 -> 7 ;
8 [label="X[8] <= 3.325\nentropy = 1.248\nsamples = 16\nvalue = [0, 6, 9, 1, 0, 0]"] ;
7 -> 8 ;
9 [label="X[0] <= 7.75\nentropy = 0.8

In [39]:
# Logistic regression classifier for the wine dataset, created with default arguments.
lr_wine = LogisticRegression()

In [40]:
# Train the logistic regression on the wine inputs and quality labels.
lr_wine.fit(wine_x, wine_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
# Accuracy on the same data used for fitting (training accuracy).
lr_wine.score(wine_x, wine_y)

0.5834896810506567

In [42]:
# Accuracy of the tree on the same data used for fitting (training accuracy).
decision_wine.score(wine_x, wine_y)

1.0

In [43]:
# Per-feature weights learned by the wine logistic regression.
lr_wine.coef_

array([[ 1.09708390e-01,  2.42796665e+00, -2.23410824e-01,
         1.64857128e-01,  3.24699685e-01,  7.55254984e-02,
        -7.95550204e-02, -2.99807232e-02,  6.21057142e-01,
        -3.82170157e-01, -7.79665146e-01],
       [-2.10323106e-01,  2.41543387e+00, -8.33229153e-02,
         1.75220599e-01,  1.38489113e-01, -3.16965043e-02,
        -9.70928903e-03, -2.49562731e-01,  3.02405449e-01,
        -6.43140563e-01, -2.71117245e-01],
       [ 2.71481388e-02,  1.63716615e+00,  3.70049878e-01,
        -5.51552166e-02,  1.42692805e+00, -2.19455573e-02,
         2.02353838e-02,  1.71287847e+00,  1.33928879e+00,
        -1.74444767e+00, -8.53826298e-01],
       [ 2.37815915e-02, -1.42735117e+00, -9.33357167e-01,
        -2.50214424e-02, -3.18853527e-01,  2.45841461e-02,
        -1.31603183e-02, -5.69696913e-01, -2.12555644e-01,
         7.52847980e-01,  1.97619286e-01],
       [-4.15934476e-02, -2.88317226e+00,  1.30454101e-01,
         1.05685545e-01, -1.36620130e+00,  1.42124608e-02,
  

# Cross validation을 통한 모델 검증

In [44]:
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit

In [45]:
# KFold splits the dataset into n parts so that n-1 parts serve as the
# training set and the remaining part as the validation set.
kfold = KFold(n_splits=5)

In [46]:
# cross_val_score computes the performance (accuracy) on each cross-validation split produced by kfold.
# For the decision tree, training accuracy was 100%,
# but cross-validation yields a validation accuracy below 50%, so the model can be said to be overfit.
# NOTE(review): this tree is built with default arguments, whereas decision_wine
# was built with 'entropy' - the two results come from different configurations.
cross_val_score(DecisionTreeClassifier(), wine_x, wine_y, cv=kfold)

array([0.496875  , 0.496875  , 0.415625  , 0.409375  , 0.49529781])

In [47]:
# For logistic regression, training accuracy and validation accuracy are nearly the same.
# This shows that logistic regression did not overfit.
# However, it fails to learn the dataset well (58%), which suggests underfitting.
# When underfitting occurs, a more complex machine learning model should be used to address it.
cross_val_score(LogisticRegression(), wine_x, wine_y, cv=kfold)



array([0.60625   , 0.50625   , 0.56875   , 0.55      , 0.61442006])