In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# wineのデータを読み込む(下記はネット上のデータから直接読み込んでいる)
# セパレーターに注意！
df=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')

# Quality を目的変数に、それ以外を説明変数に
X = df.drop(columns=['quality'])
y = df['quality']

# 上記はこうとも書ける（他にも書き方あり）
# 0-11列目を特徴データ
#X = df.iloc[:,0:11]
# 12列目をラベルデータ
#y = df.iloc[:,11]


# Xを確認
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [2]:
y.head()

0    6
1    6
2    6
3    6
4    6
Name: quality, dtype: int64

In [3]:
# 訓練データ、テストデータの作成
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# アンサンブル手法を適用
#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(n_estimators=100)
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

#正答率を求める
print("正答率(学習) = ", clf.score(X_train,y_train))
print("正答率(テスト) = ", clf.score(X_test,y_test))

正答率(学習) =  0.7348216716580452
正答率(テスト) =  0.6024489795918367


In [4]:
# y_predに、予測値を入れる
y_pred=clf.predict(X_test)
# 分類結果のリポートを表示
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.53      0.24      0.33        41
           5       0.65      0.61      0.63       364
           6       0.58      0.76      0.66       550
           7       0.59      0.35      0.44       220
           8       0.73      0.25      0.37        44
           9       0.00      0.00      0.00         1

    accuracy                           0.60      1225
   macro avg       0.44      0.32      0.35      1225
weighted avg       0.61      0.60      0.59      1225



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# それぞれのクラスに属しているデータ数をカウント
df.groupby('quality').size()

quality
3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
dtype: int64

In [6]:
# データフレームをコピー
df_new = df.copy()

# ３つのクラスに分けた新しいラベルデータを作る（品質が、4以下=0, (5,6,7)=1, 8以上=2）
df_new['quality'].replace({3:0, 4:0, 5:1, 6:1, 7:1, 8:2, 9:2},inplace = True)

# 新しいラベルデータ
y_new = df_new.iloc[:,11]

# それぞれのクラスに属しているデータ数をカウント
df_new.groupby('quality').size()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_new['quality'].replace({3:0, 4:0, 5:1, 6:1, 7:1, 8:2, 9:2},inplace = True)


quality
0     183
1    4535
2     180
dtype: int64

In [7]:
# 新しい訓練・テストデータを作成
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X, y_new, stratify=y_new)

# 学習を実行
clf.fit(X_train_new, y_train_new)

#再度正答率を求める
print("正答率(学習) = ", clf.score(X_train_new, y_train_new))
print("正答率(テスト) = ", clf.score(X_test_new, y_test_new))

正答率(学習) =  0.9564388783011163
正答率(テスト) =  0.923265306122449


In [8]:
# y_pred_newに、予測値を入れる
y_pred_new = clf.predict(X_test_new)
# 分類結果のリポートを表示
from sklearn.metrics import classification_report
print(classification_report(y_test_new, y_pred_new))

              precision    recall  f1-score   support

           0       0.33      0.11      0.16        46
           1       0.93      0.99      0.96      1134
           2       0.62      0.11      0.19        45

    accuracy                           0.92      1225
   macro avg       0.63      0.40      0.44      1225
weighted avg       0.90      0.92      0.90      1225



In [9]:
# オーバーサンプリング手法”SMOTE”でデータセットを調整
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_sm, y_sm = sm.fit_resample(X, y_new)

# 新しい訓練・テストデータを作成
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, stratify=y_sm)

# 学習を実行
clf.fit(X_train_sm, y_train_sm)

# y_pred_smに、予測値を入れる
y_pred_sm = clf.predict(X_test_sm)
# 分類結果のリポートを表示
print(classification_report(y_test_sm, y_pred_sm))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1134
           1       0.90      0.80      0.85      1134
           2       0.88      0.96      0.92      1134

    accuracy                           0.89      3402
   macro avg       0.89      0.89      0.89      3402
weighted avg       0.89      0.89      0.89      3402

