- Kaggle Red Wine Quality
    - 3, 4, 5 => Bad(0)
    - 6, 7, 8 = > Good(1)

In [3]:
import numpy as np
import pandas as pd

In [4]:
rw = pd.read_csv('../data/winequality-red.csv')
rw.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
rw.shape, rw.quality.value_counts()

((1599, 12),
 quality
 5    681
 6    638
 7    199
 4     53
 8     18
 3     10
 Name: count, dtype: int64)

In [6]:
rw.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
rw['target'] = rw.quality.apply(lambda x: 1 if x>=6 else 0)
rw.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0


In [8]:
X = rw.iloc[:, :-2].values
y = rw.target.values

In [9]:
# 표준화
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

In [10]:
# 데이터 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, stratify=y, test_size=0.2, random_state=2023
)

- 1. 결정트리

In [11]:
# 결정트리 1번
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2023)
dtc.fit(X_train, y_train)
dtc.score(X_test, y_test)


0.7375

In [12]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [13]:
# GridSearchCV로 찾기
from sklearn.model_selection import GridSearchCV

params={'max_depth' : [8, 9, 10], 'min_samples_split':[2, 3, 4]}
grid_dtc = GridSearchCV(dtc, params, scoring='accuracy', cv=5)
grid_dtc.fit(X_train, y_train)
grid_dtc.best_params_


{'max_depth': 9, 'min_samples_split': 2}

In [14]:
dtc = grid_dtc.best_estimator_
dtc.score(X_test, y_test)


0.740625

- 2. SVM

In [15]:
from sklearn.svm import SVC
svc = SVC(random_state=2023)
params = {'C':[0.8, 0.95, 0.9999, 1, 1.1] }
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=5)
grid_svc.fit(X_train, y_train)
grid_svc.best_params_

{'C': 0.9999}

In [16]:
svc = grid_svc.best_estimator_
svc.score(X_test, y_test)


0.790625

- 3. KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.734375

- 4. Logisitc회귀

In [18]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2023)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.778125

- 5. Random Forest - Bagging 방식

In [31]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2000)
params = {'max_depth':[9, 10, 11], 'min_samples_split':[4, 5, 6]}
grid_rfc = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rfc.fit(X_train, y_train)
grid_rfc.best_params_

{'max_depth': 10, 'min_samples_split': 4}

In [33]:
rfc = grid_rfc.best_estimator_
rfc.score(X_test, y_test)

0.8625

- 6. Hard voting

In [21]:
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    estimators=[('DC', dtc), ('SVC', svc), ('KNN', knn), ('LRC', lrc), ('RFC', rfc)],
    voting='hard'
)
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

0.825

- 7. Soft voting

In [22]:
svc2 = SVC(probability=True, random_state=2023)
params = {'C':[0.8, 0.95, 0.9999, 1, 1.1] }
grid_svc2 = GridSearchCV(svc2, params, scoring='accuracy', cv=5)
grid_svc2.fit(X_train, y_train)
grid_svc2.best_params_

{'C': 0.9999}

In [23]:
svc2 = grid_svc2.best_estimator_
svc2.score(X_test, y_test)

0.790625

In [24]:

voc2 = VotingClassifier(
    estimators=[('DC', dtc), ('SVC', svc2), ('KNN', knn), ('LRC', lrc), ('RFC', rfc)],
    voting='soft'
)
voc2.fit(X_train, y_train)
voc2.score(X_test, y_test)

0.8125

In [25]:
# 전체 GridSearchCV
voc2 = VotingClassifier(
    estimators=[('SVC', svc2), ('KNN', knn), ('LRC', lrc)],
    voting='soft'
)

params = {
    'LRC__C':[0.9, 1, 1.2],
    'SVC__C':[0.4, 0.5, 0.6]
}

grid_voc2 = GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_


{'LRC__C': 0.9, 'SVC__C': 0.5}

In [26]:
grid_voc2.best_estimator_.score(X_test, y_test)

0.79375

- 8. 결과
    - Random Forest - Bagging 방식
    - 0.859375 로 제일 높음