# 第6回課題
前処理と特徴量選択により，SVM の最初のスコアよりも1割程度良いテストスコアを出してください．
少し調整した程度では線形回帰のスコアは変わらなかったので，線形回帰の結果は参考程度に使ってください．
feature_names から対象の変数を削除すると，扱う特徴量を変更できます．feature_names.remove() などを使えば容易に変更できます．

### 必須事項
- 前処理：正規化，標準化，外れ値の排除など
- 特徴量選択: 検証は必須．増やす・減らす・変えないの結果は自由
- テストスコアの向上: mse で 0.41 くらいは出ると思います

### 自由事項
- 指標の変更
- ハイパーパラメータの変更（モデルの変更は想定してません）

### 余談
特徴量選択で正解を用意するのは，やはり難しいなと解答を作る時に感じました．解答の方は最低限の考察と検証をしていますが，4時間かかりました( ;∀;)

## 注意事項
特徴量の分析に，分割後の訓練データではなく全データを使っている点は，テストデータの情報が分析に漏れてしまうため本来は不適切です．ここでは無視していますが，本来は訓練データだけで分析すべきです．

In [1]:
# Silence FutureWarning and DeprecationWarning: their messages can include
# local directory paths, which would otherwise end up in the notebook output
# published on GitHub.
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", DeprecationWarning)
# To silence every warning category instead, use one of the calls below; if
# a warning is still displayed, re-running the affected cell should clear it.
# warnings.filterwarnings('ignore')
# warnings.simplefilter('ignore')

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score, make_scorer

In [3]:
# Seed both the standard-library and the NumPy global random generators with
# the same fixed value so that repeated runs of the notebook are repeatable.
random.seed(0)
np.random.seed(0)

Input variables (based on physicochemical tests):

- fixed acidity, fixed acidity2
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH1, pH2
- color intensity
- sulphates
- alcohol, alcohol2

Output variable (based on sensory data):
- quality (score between 0 and 10)

In [4]:
# Load the wine-quality CSV from the current working directory.
wine_quality_df = pd.read_csv("wine_quality_noise.csv")
# Print (rows, columns) as a quick sanity check of the load.
print(wine_quality_df.shape)
# Trailing expression: the notebook renders the first five rows.
wine_quality_df.head()

(1599, 16)


Unnamed: 0,alcohol,alcohol2,chlorides,citric acid,color intensity,density,fixed acidity,fixed acidity2,free sulfur dioxide,pH1,pH2,quality,residual sugar,sulphates,total sulfur dioxide,volatile acidity
0,9.4,9.439316,0.076,0.0,0.773702,0.9978,2.040854,7.4,11.0,3.51,10.49,5,1.9,0.56,34.0,0.7
1,9.8,9.536271,0.098,0.0,0.554324,0.9968,12.107711,7.8,25.0,3.2,10.8,5,2.6,0.68,67.0,0.88
2,9.8,9.399997,0.092,0.04,0.647387,0.997,6.301084,7.8,15.0,3.26,10.74,5,2.3,0.65,54.0,0.76
3,9.8,9.910335,0.075,0.56,0.8504,0.998,1.139631,11.2,17.0,3.16,10.84,6,1.9,0.58,60.0,0.28
4,9.4,8.98353,0.076,0.0,0.79035,0.9978,-0.815498,7.4,11.0,3.51,10.49,5,1.9,0.56,34.0,0.7


In [5]:
# Show the dtype of every column (trailing expression, rendered by the notebook).
wine_quality_df.dtypes

alcohol                 float64
alcohol2                float64
chlorides               float64
citric acid             float64
color intensity         float64
density                 float64
fixed acidity           float64
fixed acidity2          float64
free sulfur dioxide     float64
pH1                     float64
pH2                     float64
quality                   int64
residual sugar          float64
sulphates               float64
total sulfur dioxide    float64
volatile acidity        float64
dtype: object

In [6]:
# Show per-column summary statistics (count, mean, std, min, quartiles, max).
wine_quality_df.describe()

Unnamed: 0,alcohol,alcohol2,chlorides,citric acid,color intensity,density,fixed acidity,fixed acidity2,free sulfur dioxide,pH1,pH2,quality,residual sugar,sulphates,total sulfur dioxide,volatile acidity
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,10.422983,10.422622,0.087467,0.270976,0.487514,0.996747,7.87767,8.319637,15.874922,3.311113,10.688887,5.636023,2.538806,0.658149,46.467792,0.527821
std,1.065668,1.110855,0.047065,0.194801,0.157708,0.001887,9.858849,1.741096,10.460157,0.154386,0.154386,0.807569,1.409928,0.169507,32.895324,0.17906
min,8.4,8.23237,0.012,0.0,0.0,0.99007,-24.368566,4.6,1.0,2.74,9.99,3.0,0.9,0.33,6.0,0.12
25%,9.5,9.570963,0.07,0.09,0.380724,0.9956,0.892154,7.1,7.0,3.21,10.6,5.0,1.9,0.55,22.0,0.39
50%,10.2,10.191857,0.079,0.26,0.486193,0.99675,7.684445,7.9,14.0,3.31,10.69,6.0,2.2,0.62,38.0,0.52
75%,11.1,11.140724,0.09,0.42,0.590375,0.997835,14.510449,9.2,21.0,3.4,10.79,6.0,2.6,0.73,62.0,0.64
max,14.9,14.554818,0.611,1.0,1.0,1.00369,44.616602,15.9,72.0,4.01,11.26,8.0,15.5,2.0,289.0,1.58


In [7]:
# Start from every column of the frame as a plain Python list, then drop the
# target column so that only predictor columns remain. The list is
# independent of the DataFrame, so removing entries does not alter the frame.
feature_names = wine_quality_df.columns.tolist()
feature_names.remove("quality")

In [8]:
# Hold out 30% of the rows as the test split; random_state pins the shuffle
# so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    wine_quality_df[feature_names],
    wine_quality_df["quality"],
    test_size=0.3,
    random_state=0,
)
# Trailing expression: the notebook renders the shapes of both feature splits.
X_train.shape, X_test.shape

((1119, 15), (480, 15))

## 注意
ここでは test score も同時に出力していますが，これは1つの関数で実行する方が楽だったというだけの理由で，本来は CV とテストの評価は分けた方が良いです．
パラメータや特徴量について考えるときには，test score は見ずに CV のスコアだけで調整すべきです．

In [9]:
# shuffle is left at its default (False), so random_state would have no
# effect on the folds; recent scikit-learn releases raise a ValueError when
# random_state is passed without shuffle=True, so it is omitted here.
kfold = KFold(n_splits=5)


def cross_validation(model, test=True, method="reg"):
    """Print K-fold cross-validation scores for ``model``.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``feature_names`` and ``kfold``. Only the columns currently listed in
    ``feature_names`` are used, so editing that list changes which features
    are evaluated by the next call.

    Args:
        model: Estimator handed to ``cross_val_score``. When ``test`` is
            true it is additionally fitted in place on the training split.
        test: If true, also fit on the whole training split and print the
            score on the held-out test split.
        method: ``"reg"`` scores with ``mean_squared_error`` (lower is
            better); any other value scores with ``accuracy_score``
            (higher is better).

    Returns:
        None. All results are printed.
    """
    evaluation_method = mean_squared_error if method == "reg" else accuracy_score
    # Select the active feature columns once and reuse them for CV and fit.
    train_features = X_train[feature_names]
    scores = cross_val_score(model, train_features, y_train, cv=kfold,
                             scoring=make_scorer(evaluation_method))
    # Score of each individual fold.
    print('Cross-Validation scores: {}'.format(scores))
    # Mean score over all folds.
    print('Average score: {}'.format(np.mean(scores)))
    if test:
        model.fit(train_features, y_train)
        pred = model.predict(X_test[feature_names])
        print('Test score: {}'.format(evaluation_method(y_test, pred)))

# モデルによる予測

In [10]:
# Baseline: ridge regression, scored with mean squared error on the CV folds
# and on the test split (the defaults of cross_validation).
linear_reg = Ridge(random_state=0)
cross_validation(linear_reg)

Cross-Validation scores: [0.48873469 0.4972921  0.43186159 0.42859749 0.36981067]
Average score: 0.4432593072010188
Test score: 0.4013760723024126


In [11]:
# RBF-kernel support vector regression on the same features, scored with
# mean squared error.
svm_reg = SVR(kernel="rbf")
cross_validation(svm_reg)

Cross-Validation scores: [0.63541597 0.68327096 0.58850596 0.59033828 0.50432621]
Average score: 0.6003714763595538
Test score: 0.554464275505919


In [12]:
# Treat quality as a class label: RBF-kernel SVC scored with accuracy
# (method="clf" selects accuracy_score instead of mean_squared_error).
svm_clf = SVC(kernel="rbf", random_state=0)
cross_validation(svm_clf, method="clf")

Cross-Validation scores: [0.52232143 0.44642857 0.50446429 0.44196429 0.47085202]
Average score: 0.47720611787315814
Test score: 0.48125


# 以降にコードを追加